Merge "llkd: bootstat: propagate detailed livelock canonical boot reason"
This commit is contained in:
commit
481a8125a6
6 changed files with 52 additions and 17 deletions
|
|
@ -303,6 +303,9 @@ const std::map<std::string, int32_t> kBootReasonMap = {
|
||||||
{"kernel_panic,init", 158},
|
{"kernel_panic,init", 158},
|
||||||
{"kernel_panic,oom", 159},
|
{"kernel_panic,oom", 159},
|
||||||
{"kernel_panic,stack", 160},
|
{"kernel_panic,stack", 160},
|
||||||
|
{"kernel_panic,sysrq,livelock,alarm", 161}, // llkd
|
||||||
|
{"kernel_panic,sysrq,livelock,driver", 162}, // llkd
|
||||||
|
{"kernel_panic,sysrq,livelock,zombie", 163}, // llkd
|
||||||
};
|
};
|
||||||
|
|
||||||
// Converts a string value representing the reason the system booted to an
|
// Converts a string value representing the reason the system booted to an
|
||||||
|
|
|
||||||
|
|
@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these
|
||||||
conditions. If the test can, it will reconfigure llkd to expedite the test
|
conditions. If the test can, it will reconfigure llkd to expedite the test
|
||||||
duration by adjusting the ro.llk.* Android properties. Tests run the D state
|
duration by adjusting the ro.llk.* Android properties. Tests run the D state
|
||||||
with some scheduling progress to ensure that ABA checking prevents false
|
with some scheduling progress to ensure that ABA checking prevents false
|
||||||
triggers.
|
triggers. If ABA detection is 100% reliable on the platform, then ro.llk.killtest can be
|
||||||
|
set to false; however, this will cause some of the unit tests to panic the
|
||||||
|
kernel instead of dealing with the more graceful kill operation.
|
||||||
|
|
||||||
Android Properties
|
Android Properties
|
||||||
------------------
|
------------------
|
||||||
|
|
@ -108,13 +110,6 @@ default <empty>, comma separated list of uid numbers or names.
|
||||||
Architectural Concerns
|
Architectural Concerns
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
- Figure out how to communicate the kernel panic better to bootstat canonical
|
|
||||||
boot reason determination. This may require an alteration to bootstat, or
|
|
||||||
some logging from llkd. Would like to see boot reason to be
|
|
||||||
watchdog,livelock as a minimum requirement. Or more specifically would want
|
|
||||||
watchdog,livelock,device or watchdog,livelock,zombie be reported.
|
|
||||||
Currently reports panic,sysrq (user requested panic) or panic depending on
|
|
||||||
system support of pstore.
|
|
||||||
- Create kernel module and associated gTest to actually test panic.
|
- Create kernel module and associated gTest to actually test panic.
|
||||||
- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
|
- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
|
||||||
not be inputs). Could require more test-only interfaces to libllkd.
|
not be inputs). Could require more test-only interfaces to libllkd.
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void);
|
||||||
#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY
|
#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY
|
||||||
#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
|
#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
|
||||||
#define LLK_MLOCKALL_DEFAULT true
|
#define LLK_MLOCKALL_DEFAULT true
|
||||||
|
#define LLK_KILLTEST_PROPERTY "ro.llk.killtest"
|
||||||
|
#define LLK_KILLTEST_DEFAULT true
|
||||||
#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
|
#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
|
||||||
#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
|
#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
|
||||||
#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"
|
#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"
|
||||||
|
|
|
||||||
|
|
@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check
|
||||||
bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
|
bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
|
||||||
bool llkRunning = false; // thread is running
|
bool llkRunning = false; // thread is running
|
||||||
bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
|
bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
|
||||||
|
bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills
|
||||||
milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
|
milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
|
||||||
enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
|
enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
|
||||||
milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
|
milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
|
||||||
|
|
@ -292,7 +293,7 @@ struct proc {
|
||||||
exeMissingValid(false),
|
exeMissingValid(false),
|
||||||
cmdlineValid(false),
|
cmdlineValid(false),
|
||||||
updated(true),
|
updated(true),
|
||||||
killed(false) {
|
killed(!llkTestWithKill) {
|
||||||
memset(comm, '\0', sizeof(comm));
|
memset(comm, '\0', sizeof(comm));
|
||||||
setComm(_comm);
|
setComm(_comm);
|
||||||
}
|
}
|
||||||
|
|
@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f
|
||||||
return android::base::Trim(content) == string;
|
return android::base::Trim(content) == string;
|
||||||
}
|
}
|
||||||
|
|
||||||
void llkPanicKernel(bool dump, pid_t tid) __noreturn;
|
void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn;
|
||||||
void llkPanicKernel(bool dump, pid_t tid) {
|
void llkPanicKernel(bool dump, pid_t tid, const char* state) {
|
||||||
auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
|
auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
|
||||||
if (sysrqTriggerFd < 0) {
|
if (sysrqTriggerFd < 0) {
|
||||||
// DYB
|
// DYB
|
||||||
|
|
@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) {
|
||||||
}
|
}
|
||||||
::usleep(200000); // let everything settle
|
::usleep(200000); // let everything settle
|
||||||
}
|
}
|
||||||
|
llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n",
|
||||||
|
"/dev/kmsg");
|
||||||
android::base::WriteStringToFd("c", sysrqTriggerFd);
|
android::base::WriteStringToFd("c", sysrqTriggerFd);
|
||||||
// NOTREACHED
|
// NOTREACHED
|
||||||
// DYB
|
// DYB
|
||||||
|
|
@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) {
|
||||||
}
|
}
|
||||||
|
|
||||||
void llkAlarmHandler(int) {
|
void llkAlarmHandler(int) {
|
||||||
llkPanicKernel(false, ::getpid());
|
llkPanicKernel(false, ::getpid(), "alarm");
|
||||||
}
|
}
|
||||||
|
|
||||||
milliseconds GetUintProperty(const std::string& key, milliseconds def) {
|
milliseconds GetUintProperty(const std::string& key, milliseconds def) {
|
||||||
|
|
@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
|
||||||
(val != procp->nrSwitches)) {
|
(val != procp->nrSwitches)) {
|
||||||
procp->nrSwitches = val;
|
procp->nrSwitches = val;
|
||||||
procp->count = 0ms;
|
procp->count = 0ms;
|
||||||
procp->killed = false;
|
procp->killed = !llkTestWithKill;
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
|
||||||
if (schedUpdate != procp->schedUpdate) {
|
if (schedUpdate != procp->schedUpdate) {
|
||||||
procp->schedUpdate = schedUpdate;
|
procp->schedUpdate = schedUpdate;
|
||||||
procp->count = 0ms;
|
procp->count = 0ms;
|
||||||
procp->killed = false;
|
procp->killed = !llkTestWithKill;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
|
||||||
if (static_cast<uint64_t>(val) != procp->nrSwitches) {
|
if (static_cast<uint64_t>(val) != procp->nrSwitches) {
|
||||||
procp->nrSwitches = val;
|
procp->nrSwitches = val;
|
||||||
procp->count = 0ms;
|
procp->count = 0ms;
|
||||||
procp->killed = false;
|
procp->killed = !llkTestWithKill;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -719,6 +722,7 @@ void llkLogConfig(void) {
|
||||||
<< LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
|
<< LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
|
||||||
<< KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
|
<< KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
|
||||||
<< LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
|
<< LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
|
||||||
|
<< LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n"
|
||||||
<< KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
|
<< KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
|
||||||
<< LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
|
<< LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
|
||||||
<< LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
|
<< LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
|
||||||
|
|
@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) {
|
||||||
procp->time = utime + stime;
|
procp->time = utime + stime;
|
||||||
if (procp->state != state) {
|
if (procp->state != state) {
|
||||||
procp->count = 0ms;
|
procp->count = 0ms;
|
||||||
procp->killed = false;
|
procp->killed = !llkTestWithKill;
|
||||||
procp->state = state;
|
procp->state = state;
|
||||||
} else {
|
} else {
|
||||||
procp->count += llkCycle;
|
procp->count += llkCycle;
|
||||||
|
|
@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) {
|
||||||
// We are here because we have confirmed kernel live-lock
|
// We are here because we have confirmed kernel live-lock
|
||||||
LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
|
LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
|
||||||
<< "->" << tid << ' ' << procp->getComm() << " [panic]";
|
<< "->" << tid << ' ' << procp->getComm() << " [panic]";
|
||||||
llkPanicKernel(true, tid);
|
llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver");
|
||||||
}
|
}
|
||||||
LOG(VERBOSE) << "+closedir()";
|
LOG(VERBOSE) << "+closedir()";
|
||||||
}
|
}
|
||||||
|
|
@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) {
|
||||||
}
|
}
|
||||||
khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
|
khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
|
||||||
llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
|
llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
|
||||||
|
llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill);
|
||||||
// if LLK_TIMEOUT_MS_PROPERTY was not set, we will use a set
|
// if LLK_TIMEOUT_MS_PROPERTY was not set, we will use a set
|
||||||
// KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
|
// KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
|
||||||
khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
|
khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
|
||||||
|
|
|
||||||
|
|
@ -44,5 +44,6 @@ service llkd /system/bin/llkd
|
||||||
user llkd
|
user llkd
|
||||||
group llkd readproc
|
group llkd readproc
|
||||||
capabilities KILL IPC_LOCK
|
capabilities KILL IPC_LOCK
|
||||||
|
file /dev/kmsg w
|
||||||
file /proc/sysrq-trigger w
|
file /proc/sysrq-trigger w
|
||||||
writepid /dev/cpuset/system-background/tasks
|
writepid /dev/cpuset/system-background/tasks
|
||||||
|
|
|
||||||
|
|
@ -154,6 +154,27 @@ inline void waitForPid(pid_t child_pid) {
|
||||||
ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
|
ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool checkKill(const char* reason) {
|
||||||
|
if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing");
|
||||||
|
if (bootreason == reason) {
|
||||||
|
GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n";
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
GTEST_LOG_WARNING << "Expected test result is " << reason << "\n";
|
||||||
|
|
||||||
|
// apct adjustment if needed (set LLK_KILLTEST_PROPERTY to "off" to allow test)
|
||||||
|
//
|
||||||
|
// if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") {
|
||||||
|
// GTEST_LOG_WARNING << "Bypassing test\n";
|
||||||
|
// return true;
|
||||||
|
// }
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
// The tests that use this helper are to simulate processes stuck in 'D'
|
// The tests that use this helper are to simulate processes stuck in 'D'
|
||||||
|
|
@ -221,6 +242,10 @@ TEST(llkd, driver_ABA_glacial) {
|
||||||
// is that llkd will perform kill mitigation and not progress to kernel_panic.
|
// is that llkd will perform kill mitigation and not progress to kernel_panic.
|
||||||
|
|
||||||
TEST(llkd, zombie) {
|
TEST(llkd, zombie) {
|
||||||
|
if (checkKill("kernel_panic,sysrq,livelock,zombie")) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const auto period = llkdSleepPeriod('Z');
|
const auto period = llkdSleepPeriod('Z');
|
||||||
|
|
||||||
/* Create a Persistent Zombie Process */
|
/* Create a Persistent Zombie Process */
|
||||||
|
|
@ -241,6 +266,10 @@ TEST(llkd, zombie) {
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST(llkd, driver) {
|
TEST(llkd, driver) {
|
||||||
|
if (checkKill("kernel_panic,sysrq,livelock,driver")) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
const auto period = llkdSleepPeriod('D');
|
const auto period = llkdSleepPeriod('D');
|
||||||
|
|
||||||
/* Create a Persistent Device Process */
|
/* Create a Persistent Device Process */
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue