From 14f9c15e056061df338b258505d1ad2e3473baed Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 15 Mar 2022 21:55:23 -0700 Subject: [PATCH] init: Add more diagnostics for signalfd hangs. This adds two new diagnostics. First, signalfd reads are now non-blocking. If the read takes more than 10 seconds, we log an error. Second, init now wakes up from epoll() every 10 seconds. If it waits on an "exec" command for more than 10 seconds, it logs an error. This change will be reverted as soon as we get feedback. Bug: 223076262 Test: device boots Change-Id: I7ee98d159599217a641b3de2564a92c2435f57ef --- init/init.cpp | 36 ++++++++++++++++++++++++++++++------ init/service.cpp | 2 ++ init/service.h | 5 +++++ 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/init/init.cpp b/init/init.cpp index 1df4c4489..d5c79f865 100644 --- a/init/init.cpp +++ b/init/init.cpp @@ -580,12 +580,29 @@ static void HandleSigtermSignal(const signalfd_siginfo& siginfo) { HandlePowerctlMessage("shutdown,container"); } +static constexpr std::chrono::milliseconds kDiagnosticTimeout = 10s; + static void HandleSignalFd() { signalfd_siginfo siginfo; - ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo))); - if (bytes_read != sizeof(siginfo)) { - PLOG(ERROR) << "Failed to read siginfo from signal_fd"; - return; + auto started = std::chrono::steady_clock::now(); + for (;;) { + ssize_t bytes_read = TEMP_FAILURE_RETRY(read(signal_fd, &siginfo, sizeof(siginfo))); + if (bytes_read < 0 && errno == EAGAIN) { + auto now = std::chrono::steady_clock::now(); + std::chrono::duration waited = now - started; + if (waited >= kDiagnosticTimeout) { + LOG(ERROR) << "epoll() woke us up, but we waited with no SIGCHLD!"; + started = now; + } + + std::this_thread::sleep_for(100ms); + continue; + } + if (bytes_read != sizeof(siginfo)) { + PLOG(ERROR) << "Failed to read siginfo from signal_fd"; + return; + } + break; } switch (siginfo.ssi_signo) { @@ -641,7 +658,7 @@ static void InstallSignalFdHandler(Epoll* epoll) { LOG(FATAL) << "Failed to register a fork handler: " << strerror(result); } - signal_fd = signalfd(-1, &mask, SFD_CLOEXEC); + signal_fd = signalfd(-1, &mask, SFD_CLOEXEC | SFD_NONBLOCK); if (signal_fd == -1) { PLOG(FATAL) << "failed to create signalfd"; } @@ -1020,7 +1037,7 @@ int SecondStageMain(int argc, char** argv) { setpriority(PRIO_PROCESS, 0, 0); while (true) { // By default, sleep until something happens. - auto epoll_timeout = std::optional{}; + auto epoll_timeout = std::optional{kDiagnosticTimeout}; auto shutdown_command = shutdown_state.CheckShutdown(); if (shutdown_command) { @@ -1060,6 +1077,13 @@ int SecondStageMain(int argc, char** argv) { for (const auto& function : *pending_functions) { (*function)(); } + } else if (Service::is_exec_service_running()) { + std::chrono::duration waited = + std::chrono::steady_clock::now() - Service::exec_service_started(); + if (waited >= kDiagnosticTimeout) { + LOG(ERROR) << "Exec service is hung? Waited " << waited.count() + << " without SIGCHLD"; + } } if (!IsShuttingDown()) { HandleControlMessages(); diff --git a/init/service.cpp b/init/service.cpp index f6dd9b9fa..19e32b817 100644 --- a/init/service.cpp +++ b/init/service.cpp @@ -127,6 +127,7 @@ static bool ExpandArgsAndExecv(const std::vector& args, bool sigsto unsigned long Service::next_start_order_ = 1; bool Service::is_exec_service_running_ = false; +std::chrono::time_point Service::exec_service_started_; Service::Service(const std::string& name, Subcontext* subcontext_for_restart_commands, const std::vector& args, bool from_apex) @@ -388,6 +389,7 @@ Result Service::ExecStart() { flags_ |= SVC_EXEC; is_exec_service_running_ = true; + exec_service_started_ = std::chrono::steady_clock::now(); LOG(INFO) << "SVC_EXEC service '" << name_ << "' pid " << pid_ << " (uid " << proc_attr_.uid << " gid " << proc_attr_.gid << "+" << proc_attr_.supp_gids.size() << " context " diff --git a/init/service.h b/init/service.h index 3289f5407..a9dc96677 100644 --- a/init/service.h +++ b/init/service.h @@ -102,6 +102,9 @@ class Service { size_t CheckAllCommands() const { return onrestart_.CheckAllCommands(); } static bool is_exec_service_running() { return is_exec_service_running_; } + static std::chrono::time_point exec_service_started() { + return exec_service_started_; + } const std::string& name() const { return name_; } const std::set& classnames() const { return classnames_; } @@ -148,6 +151,8 @@ class Service { static unsigned long next_start_order_; static bool is_exec_service_running_; + static std::chrono::time_point exec_service_started_; + static pid_t exec_service_pid_; std::string name_; std::set classnames_;