init: Add reboot timeout handler

To prevent the device from getting stuck during reboot, we create a shutdown
monitor thread with a timeout (default 30s). On timeout it dumps the init process
and blocked tasks' call traces to last kmsg, then triggers a kernel panic to
reboot the device.

Test: reboot device

bug: 128561401
Change-Id: Ieb400ab9fbd983544b61241a4f4b8aa2f4baa863
This commit is contained in:
josephjang 2019-04-16 18:46:24 +08:00
parent e7d0c83d3c
commit aaddf282ec

View file

@ -19,13 +19,14 @@
#include <dirent.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <mntent.h>
#include <linux/loop.h>
#include <mntent.h>
#include <semaphore.h>
#include <sys/cdefs.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/swap.h>
#include <sys/stat.h>
#include <sys/swap.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
@ -57,11 +58,14 @@
#include "service.h"
#include "sigchld_handler.h"
#define PROC_SYSRQ "/proc/sysrq-trigger"
using android::base::GetBoolProperty;
using android::base::Split;
using android::base::StringPrintf;
using android::base::Timer;
using android::base::unique_fd;
using android::base::WriteStringToFile;
namespace android {
namespace init {
@ -207,8 +211,8 @@ static void DumpUmountDebuggingInfo() {
}
FindPartitionsToUmount(nullptr, nullptr, true);
// dump current CPU stack traces and uninterruptible tasks
android::base::WriteStringToFile("l", "/proc/sysrq-trigger");
android::base::WriteStringToFile("w", "/proc/sysrq-trigger");
WriteStringToFile("l", PROC_SYSRQ);
WriteStringToFile("w", PROC_SYSRQ);
}
static UmountStat UmountPartitions(std::chrono::milliseconds timeout) {
@ -248,7 +252,91 @@ static UmountStat UmountPartitions(std::chrono::milliseconds timeout) {
}
}
static void KillAllProcesses() { android::base::WriteStringToFile("i", "/proc/sysrq-trigger"); }
// Sends the "i" request to the kernel through the sysrq trigger file. Per the kernel sysrq
// documentation, "i" kills all processes except init.
static void KillAllProcesses() {
    constexpr char kSysrqKillAll[] = "i";
    WriteStringToFile(kSysrqKillAll, PROC_SYSRQ);
}
// Reboot/shutdown monitor (watchdog) thread body.
//
// The time budget is shutdown_timeout (whole seconds only) plus the value of the
// ro.build.shutdown.watchdog.timeout property (default 30 seconds).
//
// reboot_semaphore works as a toggle:
//   - a post while this thread is idle starts (or resumes) the countdown;
//   - a post while the countdown is running pauses it, and the seconds that were not used
//     are carried over to the next round.
//
// If the countdown expires before it is paused, the thread:
//   1. on debuggable builds, dumps init's call trace and asks the kernel (sysrq "l"/"w") to
//      show active CPU stacks and blocked tasks;
//   2. for ANDROID_RB_POWEROFF / ANDROID_RB_THERMOFF, asks the kernel to sync ("s") and
//      remount read-only ("u"), then calls RebootSystem(cmd, rebootTarget);
//   3. triggers a crash (sysrq "c") if execution gets that far.
//
// reboot_monitor_run is checked at the top of every round; the loop ends once it reads false.
// NOTE(review): it is a plain bool read here without synchronization - confirm how the
// writing thread publishes it (std::atomic<bool> would make this well-defined).
void RebootMonitorThread(unsigned int cmd, const std::string& rebootTarget, sem_t* reboot_semaphore,
                         std::chrono::milliseconds shutdown_timeout, bool* reboot_monitor_run) {
    // 30 seconds more than the timeout passed to the thread as there is a final Umount pass
    // after the timeout is reached.
    constexpr unsigned int shutdown_watchdog_timeout_default = 30;
    auto shutdown_watchdog_timeout = android::base::GetUintProperty(
            "ro.build.shutdown.watchdog.timeout", shutdown_watchdog_timeout_default);
    unsigned int remaining_shutdown_time = static_cast<unsigned int>(
            shutdown_watchdog_timeout + shutdown_timeout.count() / 1000);

    while (*reboot_monitor_run) {
        // Idle until the countdown is started or resumed.
        if (TEMP_FAILURE_RETRY(sem_wait(reboot_semaphore)) == -1) {
            LOG(ERROR) << "sem_wait failed and exit RebootMonitorThread()";
            return;
        }

        timespec shutdown_timeout_timespec;
        if (clock_gettime(CLOCK_MONOTONIC, &shutdown_timeout_timespec) == -1) {
            LOG(ERROR) << "clock_gettime() fail! exit RebootMonitorThread()";
            return;
        }

        // If there are some remaining shutdown time left from previous round, we use
        // remaining time here.
        shutdown_timeout_timespec.tv_sec += remaining_shutdown_time;

        LOG(INFO) << "shutdown_timeout_timespec.tv_sec: " << shutdown_timeout_timespec.tv_sec;

        // Countdown: wait for a pause/terminate post until the deadline, retrying on EINTR.
        int sem_return = 0;
        while ((sem_return = sem_timedwait_monotonic_np(reboot_semaphore,
                                                        &shutdown_timeout_timespec)) == -1 &&
               errno == EINTR) {
        }

        if (sem_return == -1) {
            // Any failure other than EINTR is handled as an expired countdown, so the
            // watchdog fails safe instead of silently going away.
            LOG(ERROR) << "Reboot thread timed out";

            if (android::base::GetBoolProperty("ro.debuggable", false)) {
                LOG(INFO) << "Try to dump init process call trace:";
                const char* vdc_argv[] = {"/system/bin/debuggerd", "-b", "1"};
                int status;
                android_fork_execvp_ext(arraysize(vdc_argv), const_cast<char**>(vdc_argv),
                                        &status, true, LOG_KLOG, true, nullptr, nullptr, 0);

                LOG(INFO) << "Show stack for all active CPU:";
                WriteStringToFile("l", PROC_SYSRQ);

                LOG(INFO) << "Show tasks that are in disk sleep(uninterruptable sleep), which are "
                             "like "
                             "blocked in mutex or hardware register access:";
                WriteStringToFile("w", PROC_SYSRQ);
            }

            // In shutdown case,notify kernel to sync and umount fs to read-only before shutdown.
            if (cmd == ANDROID_RB_POWEROFF || cmd == ANDROID_RB_THERMOFF) {
                WriteStringToFile("s", PROC_SYSRQ);

                WriteStringToFile("u", PROC_SYSRQ);

                RebootSystem(cmd, rebootTarget);
            }

            LOG(ERROR) << "Trigger crash at last!";
            WriteStringToFile("c", PROC_SYSRQ);
        } else {
            // Paused (or asked to terminate): remember how much of the budget is left.
            timespec current_time_timespec;
            if (clock_gettime(CLOCK_MONOTONIC, &current_time_timespec) == -1) {
                LOG(ERROR) << "clock_gettime() fail! exit RebootMonitorThread()";
                return;
            }

            // The post may arrive right before the deadline and this thread may only read the
            // clock after the deadline second has passed. Clamp to zero so the unsigned
            // subtraction cannot wrap around into an enormous budget that would effectively
            // disable the watchdog for the following rounds.
            const auto deadline_sec = shutdown_timeout_timespec.tv_sec;
            const auto now_sec = current_time_timespec.tv_sec;
            remaining_shutdown_time =
                    (deadline_sec > now_sec) ? static_cast<unsigned int>(deadline_sec - now_sec)
                                             : 0;

            LOG(INFO) << "remaining_shutdown_time: " << remaining_shutdown_time;
        }
    }
}
/* Try umounting all emulated file systems and R/W block device file systems.
* This will just try umount and give it up if it fails.
@ -259,7 +347,8 @@ static void KillAllProcesses() { android::base::WriteStringToFile("i", "/proc/sy
*
* return true when umount was successful. false when timed out.
*/
static UmountStat TryUmountAndFsck(bool runFsck, std::chrono::milliseconds timeout) {
static UmountStat TryUmountAndFsck(unsigned int cmd, const std::string& rebootTarget, bool runFsck,
std::chrono::milliseconds timeout, sem_t* reboot_semaphore) {
Timer t;
std::vector<MountEntry> block_devices;
std::vector<MountEntry> emulated_devices;
@ -279,11 +368,17 @@ static UmountStat TryUmountAndFsck(bool runFsck, std::chrono::milliseconds timeo
}
if (stat == UMOUNT_STAT_SUCCESS && runFsck) {
LOG(INFO) << "Pause reboot monitor thread before fsck";
sem_post(reboot_semaphore);
// fsck part is excluded from timeout check. It only runs for user initiated shutdown
// and should not affect reboot time.
for (auto& entry : block_devices) {
entry.DoFsck();
}
LOG(INFO) << "Resume reboot monitor thread after fsck";
sem_post(reboot_semaphore);
}
return stat;
}
@ -311,7 +406,7 @@ static void KillZramBackingDevice() {
}
LOG(INFO) << "swapoff() took " << swap_timer;;
if (!android::base::WriteStringToFile("1", ZRAM_RESET)) {
if (!WriteStringToFile("1", ZRAM_RESET)) {
LOG(ERROR) << "zram_backing_dev: reset (" << backing_dev << ")" << " failed";
return;
}
@ -369,6 +464,23 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
}
LOG(INFO) << "Shutdown timeout: " << shutdown_timeout.count() << " ms";
sem_t reboot_semaphore;
if (sem_init(&reboot_semaphore, false, 0) == -1) {
// These should never fail, but if they do, skip the graceful reboot and reboot immediately.
LOG(ERROR) << "sem_init() fail and RebootSystem() return!";
RebootSystem(cmd, rebootTarget);
}
// Start a thread to monitor init shutdown process
LOG(INFO) << "Create reboot monitor thread.";
bool reboot_monitor_run = true;
std::thread reboot_monitor_thread(&RebootMonitorThread, cmd, rebootTarget, &reboot_semaphore,
shutdown_timeout, &reboot_monitor_run);
reboot_monitor_thread.detach();
// Start reboot monitor thread
sem_post(&reboot_semaphore);
// keep debugging tools until non critical ones are all gone.
const std::set<std::string> kill_after_apps{"tombstoned", "logd", "adbd"};
// watchdogd is a vendor specific component but should be alive to complete shutdown safely.
@ -497,7 +609,8 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
// 5. drop caches and disable zram backing device, if exist
KillZramBackingDevice();
UmountStat stat = TryUmountAndFsck(runFsck, shutdown_timeout - t.duration());
UmountStat stat = TryUmountAndFsck(cmd, rebootTarget, runFsck, shutdown_timeout - t.duration(),
&reboot_semaphore);
// Follow what linux shutdown is doing: one more sync with little bit delay
{
Timer sync_timer;
@ -507,6 +620,11 @@ static void DoReboot(unsigned int cmd, const std::string& reason, const std::str
}
if (!is_thermal_shutdown) std::this_thread::sleep_for(100ms);
LogShutdownTime(stat, &t);
// Send signal to terminate reboot monitor thread.
reboot_monitor_run = false;
sem_post(&reboot_semaphore);
// Reboot regardless of umount status. If umount fails, fsck after reboot will fix it.
RebootSystem(cmd, rebootTarget);
abort();