diff --git a/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp b/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp index ddb1f7999..a08274271 100644 --- a/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp +++ b/fs_mgr/libsnapshot/snapuserd/snapuserd_daemon.cpp @@ -209,6 +209,8 @@ void Daemon::SignalHandler(int signal) { int main(int argc, char** argv) { android::base::InitLogging(argv, &android::base::KernelLogger); + LOG(INFO) << "snapuserd daemon about to start"; + android::snapshot::Daemon& daemon = android::snapshot::Daemon::Instance(); if (!daemon.StartDaemon(argc, argv)) { diff --git a/init/init.cpp b/init/init.cpp index e3596cbac..1df4c4489 100644 --- a/init/init.cpp +++ b/init/init.cpp @@ -33,6 +33,7 @@ #define _REALLY_INCLUDE_SYS__SYSTEM_PROPERTIES_H_ #include +#include #include #include #include @@ -46,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -773,6 +775,82 @@ static Result ConnectEarlyStageSnapuserdAction(const BuiltinArguments& arg return {}; } +static bool SystemReadSmokeTest() { + std::string dev = "/dev/block/mapper/system"s + fs_mgr_get_slot_suffix(); + android::base::unique_fd fd(open(dev.c_str(), O_RDONLY)); + if (fd < 0) { + PLOG(ERROR) << "open " << dev << " failed, will not diangose snapuserd hangs"; + return false; + } + + for (size_t i = 1; i <= 100; i++) { + // Skip around the partition a bit. + size_t offset = i * 4096 * 512; + + char b; + ssize_t n = TEMP_FAILURE_RETRY(pread(fd.get(), &b, 1, offset)); + if (n < 0) { + PLOG(ERROR) << "snapuserd smoke test read failed"; + return false; + } + } + return true; +} + +static void DiagnoseSnapuserdHang(pid_t pid) { + bool succeeded = false; + + std::mutex m; + std::condition_variable cv; + + // Enforce an ordering between this and the thread startup, by taking the + // lock before we lanuch the thread. + std::unique_lock cv_lock(m); + + std::thread t([&]() -> void { + std::lock_guard lock(m); + succeeded = SystemReadSmokeTest(); + cv.notify_all(); + }); + + auto join = android::base::make_scope_guard([&]() -> void { + // If the smoke test is hung, then this will too. We expect the device to + // automatically reboot once the watchdog kicks in. + t.join(); + }); + + auto now = std::chrono::system_clock::now(); + auto deadline = now + 10s; + auto status = cv.wait_until(cv_lock, deadline); + if (status == std::cv_status::timeout) { + LOG(ERROR) << "snapuserd smoke test timed out"; + } else if (!succeeded) { + LOG(ERROR) << "snapuserd smoke test failed"; + } + + if (succeeded) { + LOG(INFO) << "snapuserd smoke test succeeded"; + return; + } + + while (true) { + LOG(ERROR) << "snapuserd problem detected, printing open fds"; + + std::error_code ec; + std::string proc_dir = "/proc/" + std::to_string(pid) + "/fd"; + for (const auto& entry : std::filesystem::directory_iterator(proc_dir)) { + std::string target; + if (android::base::Readlink(entry.path(), &target)) { + LOG(ERROR) << "snapuserd opened: " << target; + } else { + LOG(ERROR) << "snapuserd opened: " << entry.path(); + } + } + + std::this_thread::sleep_for(10s); + } +} + int SecondStageMain(int argc, char** argv) { if (REBOOT_BOOTLOADER_ON_PANIC) { InstallRebootSignalHandlers(); @@ -786,6 +864,11 @@ int SecondStageMain(int argc, char** argv) { InitKernelLogging(argv); LOG(INFO) << "init second stage started!"; + if (auto pid = GetSnapuserdFirstStagePid()) { + std::thread t(DiagnoseSnapuserdHang, *pid); + t.detach(); + } + // Update $PATH in the case the second stage init is newer than first stage init, where it is // first set. if (setenv("PATH", _PATH_DEFPATH, 1) != 0) {