diff --git a/bootstat/bootstat.cpp b/bootstat/bootstat.cpp index c2688e92a..8ce9dfcc3 100644 --- a/bootstat/bootstat.cpp +++ b/bootstat/bootstat.cpp @@ -303,6 +303,9 @@ const std::map kBootReasonMap = { {"kernel_panic,init", 158}, {"kernel_panic,oom", 159}, {"kernel_panic,stack", 160}, + {"kernel_panic,sysrq,livelock,alarm", 161}, // llkd + {"kernel_panic,sysrq,livelock,driver", 162}, // llkd + {"kernel_panic,sysrq,livelock,zombie", 163}, // llkd }; // Converts a string value representing the reason the system booted to an diff --git a/llkd/README.md b/llkd/README.md index 71319c8af..b2ba2a2f6 100644 --- a/llkd/README.md +++ b/llkd/README.md @@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these conditions. If the test can, it will reconfigure llkd to expedite the test duration by adjusting the ro.llk.* Android properties. Tests run the D state with some scheduling progress to ensure that ABA checking prevents false -triggers. +triggers. If ABA checking is 100% reliable on the platform, ro.llk.killtest +can be set to false; however, some of the unit tests will then panic the +kernel instead of exercising the more graceful kill operation. Android Properties ------------------ @@ -108,13 +110,6 @@ default , comma separated list of uid numbers or names. Architectural Concerns ---------------------- -- Figure out how to communicate the kernel panic better to bootstat canonical - boot reason determination. This may require an alteration to bootstat, or - some logging from llkd. Would like to see boot reason to be - watchdog,livelock as a minimum requirement. Or more specifically would want - watchdog,livelock,device or watchdog,livelock,zombie be reported. - Currently reports panic,sysrq (user requested panic) or panic depending on - system support of pstore. - Create kernel module and associated gTest to actually test panic. - Create gTest to test out blacklist (ro.llk.blacklist. generally not be inputs). 
Could require more test-only interfaces to libllkd. diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h index bd0739bb0..e3ae4bbd8 100644 --- a/llkd/include/llkd.h +++ b/llkd/include/llkd.h @@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void); #define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY #define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall" #define LLK_MLOCKALL_DEFAULT true +#define LLK_KILLTEST_PROPERTY "ro.llk.killtest" +#define LLK_KILLTEST_DEFAULT true #define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms" #define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout" #define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms" diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp index d82810572..f357cc2ca 100644 --- a/llkd/libllkd.cpp +++ b/llkd/libllkd.cpp @@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled bool llkRunning = false; // thread is running bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked +bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout enum { llkStateD, llkStateZ, llkNumStates }; // state indexes milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state @@ -292,7 +293,7 @@ struct proc { exeMissingValid(false), cmdlineValid(false), updated(true), - killed(false) { + killed(!llkTestWithKill) { memset(comm, '\0', sizeof(comm)); setComm(_comm); } @@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f return android::base::Trim(content) == string; } -void llkPanicKernel(bool dump, pid_t tid) __noreturn; -void llkPanicKernel(bool dump, pid_t tid) { +void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn; +void llkPanicKernel(bool dump, pid_t tid, const char* state) { auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger"); if (sysrqTriggerFd < 0) { // DYB @@ -496,6 +497,8 @@ void 
llkPanicKernel(bool dump, pid_t tid) { } ::usleep(200000); // let everything settle } + llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n", + "/dev/kmsg"); android::base::WriteStringToFd("c", sysrqTriggerFd); // NOTREACHED // DYB @@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) { } void llkAlarmHandler(int) { - llkPanicKernel(false, ::getpid()); + llkPanicKernel(false, ::getpid(), "alarm"); } milliseconds GetUintProperty(const std::string& key, milliseconds def) { @@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { (val != procp->nrSwitches)) { procp->nrSwitches = val; procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; } return; } @@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (schedUpdate != procp->schedUpdate) { procp->schedUpdate = schedUpdate; procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; } } @@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (static_cast(val) != procp->nrSwitches) { procp->nrSwitches = val; procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; } } } @@ -719,6 +722,7 @@ void llkLogConfig(void) { << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n" << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n" << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n" + << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n" << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n" << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n" << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n" @@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) { procp->time = utime + stime; if (procp->state != state) { procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; procp->state = state; } else { procp->count 
+= llkCycle; @@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) { // We are here because we have confirmed kernel live-lock LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid << "->" << tid << ' ' << procp->getComm() << " [panic]"; - llkPanicKernel(true, tid); + llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver"); } LOG(VERBOSE) << "+closedir()"; } @@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) { } khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable); llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall); + llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill); // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value. khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); diff --git a/llkd/llkd.rc b/llkd/llkd.rc index f762a5ced..e538cdb91 100644 --- a/llkd/llkd.rc +++ b/llkd/llkd.rc @@ -44,5 +44,6 @@ service llkd /system/bin/llkd user llkd group llkd readproc capabilities KILL IPC_LOCK + file /dev/kmsg w file /proc/sysrq-trigger w writepid /dev/cpuset/system-background/tasks diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp index 2de18205c..3a15ff1e3 100644 --- a/llkd/tests/llkd_test.cpp +++ b/llkd/tests/llkd_test.cpp @@ -154,6 +154,27 @@ inline void waitForPid(pid_t child_pid) { ASSERT_EQ(WTERMSIG(wstatus), SIGKILL); } +bool checkKill(const char* reason) { + if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) { + return false; + } + auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing"); + if (bootreason == reason) { + GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n"; + return true; + } + GTEST_LOG_WARNING << "Expected test result is " << reason << "\n"; + + // apct adjustment if needed (set LLK_KILLTEST_PROPERTY to "off" to allow test) + // + // if 
(android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") { + // GTEST_LOG_WARNING << "Bypassing test\n"; + // return true; + // } + + return false; +} + } // namespace // The tests that use this helper are to simulate processes stuck in 'D' @@ -221,6 +242,10 @@ TEST(llkd, driver_ABA_glacial) { // is that llkd will perform kill mitigation and not progress to kernel_panic. TEST(llkd, zombie) { + if (checkKill("kernel_panic,sysrq,livelock,zombie")) { + return; + } + const auto period = llkdSleepPeriod('Z'); /* Create a Persistent Zombie Process */ @@ -241,6 +266,10 @@ TEST(llkd, zombie) { } TEST(llkd, driver) { + if (checkKill("kernel_panic,sysrq,livelock,driver")) { + return; + } + const auto period = llkdSleepPeriod('D'); /* Create a Persistent Device Process */