From 471643a90943f5d432f3457db43986c4449d3dd6 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Wed, 19 Jan 2022 20:53:03 -0800 Subject: [PATCH] init: Add diagnostics for snapuserd hangs This patch attempts to diagnose snapuserd hangs by performing reads immediately after entering second-stage init. This is done by spawning two threads: one to perform the reads, and another to wait for the read thread to finish. If any aspect of the read fails, or the read thread does not complete in 10 seconds, then a list of snapuserd's open file descriptors are logged. Bug: 207298357 Test: apply working OTA, check logcat for success apply broken OTA, check logcat for fd map Change-Id: I549e07b7d576fcdaca9b2d6ff33e0924c3812c07 --- .../snapuserd/snapuserd_daemon.cpp | 2 + init/init.cpp | 83 +++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp b/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp index ddb1f7999..a08274271 100644 --- a/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp +++ b/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp @@ -209,6 +209,8 @@ void Daemon::SignalHandler(int signal) { int main(int argc, char** argv) { android::base::InitLogging(argv, &android::base::KernelLogger); + LOG(INFO) << "snapuserd daemon about to start"; + android::snapshot::Daemon& daemon = android::snapshot::Daemon::Instance(); if (!daemon.StartDaemon(argc, argv)) { diff --git a/init/init.cpp b/init/init.cpp index e3596cbac..1df4c4489 100644 --- a/init/init.cpp +++ b/init/init.cpp @@ -33,6 +33,7 @@ #define _REALLY_INCLUDE_SYS__SYSTEM_PROPERTIES_H_ #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -773,6 +775,82 @@ static Result ConnectEarlyStageSnapuserdAction(const BuiltinArguments& arg return {}; } +static bool SystemReadSmokeTest() { + std::string dev = "/dev/block/mapper/system"s + fs_mgr_get_slot_suffix(); + android::base::unique_fd fd(open(dev.c_str(), O_RDONLY)); + if (fd < 0) { + PLOG(ERROR) << "open " << dev << " failed, will not diangose snapuserd hangs"; + return false; + } + + for (size_t i = 1; i <= 100; i++) { + // Skip around the partition a bit. + size_t offset = i * 4096 * 512; + + char b; + ssize_t n = TEMP_FAILURE_RETRY(pread(fd.get(), &b, 1, offset)); + if (n < 0) { + PLOG(ERROR) << "snapuserd smoke test read failed"; + return false; + } + } + return true; +} + +static void DiagnoseSnapuserdHang(pid_t pid) { + bool succeeded = false; + + std::mutex m; + std::condition_variable cv; + + // Enforce an ordering between this and the thread startup, by taking the + // lock before we lanuch the thread. + std::unique_lock cv_lock(m); + + std::thread t([&]() -> void { + std::lock_guard lock(m); + succeeded = SystemReadSmokeTest(); + cv.notify_all(); + }); + + auto join = android::base::make_scope_guard([&]() -> void { + // If the smoke test is hung, then this will too. We expect the device to + // automatically reboot once the watchdog kicks in. + t.join(); + }); + + auto now = std::chrono::system_clock::now(); + auto deadline = now + 10s; + auto status = cv.wait_until(cv_lock, deadline); + if (status == std::cv_status::timeout) { + LOG(ERROR) << "snapuserd smoke test timed out"; + } else if (!succeeded) { + LOG(ERROR) << "snapuserd smoke test failed"; + } + + if (succeeded) { + LOG(INFO) << "snapuserd smoke test succeeded"; + return; + } + + while (true) { + LOG(ERROR) << "snapuserd problem detected, printing open fds"; + + std::error_code ec; + std::string proc_dir = "/proc/" + std::to_string(pid) + "/fd"; + for (const auto& entry : std::filesystem::directory_iterator(proc_dir)) { + std::string target; + if (android::base::Readlink(entry.path(), &target)) { + LOG(ERROR) << "snapuserd opened: " << target; + } else { + LOG(ERROR) << "snapuserd opened: " << entry.path(); + } + } + + std::this_thread::sleep_for(10s); + } +} + int SecondStageMain(int argc, char** argv) { if (REBOOT_BOOTLOADER_ON_PANIC) { InstallRebootSignalHandlers(); @@ -786,6 +864,11 @@ int SecondStageMain(int argc, char** argv) { InitKernelLogging(argv); LOG(INFO) << "init second stage started!"; + if (auto pid = GetSnapuserdFirstStagePid()) { + std::thread t(DiagnoseSnapuserdHang, *pid); + t.detach(); + } + // Update $PATH in the case the second stage init is newer than first stage init, where it is // first set. if (setenv("PATH", _PATH_DEFPATH, 1) != 0) {