diff --git a/fs_mgr/libsnapshot/android/snapshot/snapshot.proto b/fs_mgr/libsnapshot/android/snapshot/snapshot.proto index 1ebc29f95..012f2aee7 100644 --- a/fs_mgr/libsnapshot/android/snapshot/snapshot.proto +++ b/fs_mgr/libsnapshot/android/snapshot/snapshot.proto @@ -139,7 +139,28 @@ enum UpdateState { Cancelled = 7; }; -// Next: 7 +// Next 14: +// +// To understand the source of each failure, read snapshot.cpp. To handle new +// sources of failure, avoid reusing an existing code; add a new code instead. +enum MergeFailureCode { + Ok = 0; + ReadStatus = 1; + GetTableInfo = 2; + UnknownTable = 3; + GetTableParams = 4; + ActivateNewTable = 5; + AcquireLock = 6; + ListSnapshots = 7; + WriteStatus = 8; + UnknownTargetType = 9; + QuerySnapshotStatus = 10; + ExpectedMergeTarget = 11; + UnmergedSectorsAfterCompletion = 12; + UnexpectedMergeState = 13; +}; + +// Next: 8 message SnapshotUpdateStatus { UpdateState state = 1; @@ -160,6 +181,9 @@ message SnapshotUpdateStatus { // Merge phase (if state == MERGING). MergePhase merge_phase = 6; + + // Merge failure code, filled if state == MergeFailed. + MergeFailureCode merge_failure_code = 7; } // Next: 9 diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h index 126e1a0a5..81cb64058 100644 --- a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h +++ b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h @@ -381,6 +381,7 @@ class SnapshotManager final : public ISnapshotManager { FRIEND_TEST(SnapshotTest, MapPartialSnapshot); FRIEND_TEST(SnapshotTest, MapSnapshot); FRIEND_TEST(SnapshotTest, Merge); + FRIEND_TEST(SnapshotTest, MergeFailureCode); FRIEND_TEST(SnapshotTest, NoMergeBeforeReboot); FRIEND_TEST(SnapshotTest, UpdateBootControlHal); FRIEND_TEST(SnapshotUpdateTest, DaemonTransition); @@ -532,7 +533,8 @@ class SnapshotManager final : public ISnapshotManager { // Interact with /metadata/ota/state. UpdateState ReadUpdateState(LockedFile* file); SnapshotUpdateStatus ReadSnapshotUpdateStatus(LockedFile* file); - bool WriteUpdateState(LockedFile* file, UpdateState state); + bool WriteUpdateState(LockedFile* file, UpdateState state, + MergeFailureCode failure_code = MergeFailureCode::Ok); bool WriteSnapshotUpdateStatus(LockedFile* file, const SnapshotUpdateStatus& status); std::string GetStateFilePath() const; @@ -541,12 +543,12 @@ class SnapshotManager final : public ISnapshotManager { std::string GetMergeStateFilePath() const; // Helpers for merging. - bool MergeSecondPhaseSnapshots(LockedFile* lock); - bool SwitchSnapshotToMerge(LockedFile* lock, const std::string& name); - bool RewriteSnapshotDeviceTable(const std::string& dm_name); + MergeFailureCode MergeSecondPhaseSnapshots(LockedFile* lock); + MergeFailureCode SwitchSnapshotToMerge(LockedFile* lock, const std::string& name); + MergeFailureCode RewriteSnapshotDeviceTable(const std::string& dm_name); bool MarkSnapshotMergeCompleted(LockedFile* snapshot_lock, const std::string& snapshot_name); void AcknowledgeMergeSuccess(LockedFile* lock); - void AcknowledgeMergeFailure(); + void AcknowledgeMergeFailure(MergeFailureCode failure_code); MergePhase DecideMergePhase(const SnapshotStatus& status); std::unique_ptr ReadCurrentMetadata(); @@ -573,14 +575,22 @@ class SnapshotManager final : public ISnapshotManager { const SnapshotStatus& status); bool CollapseSnapshotDevice(const std::string& name, const SnapshotStatus& status); + struct MergeResult { + explicit MergeResult(UpdateState state, + MergeFailureCode failure_code = MergeFailureCode::Ok) + : state(state), failure_code(failure_code) {} + UpdateState state; + MergeFailureCode failure_code; + }; + // Only the following UpdateStates are used here: // UpdateState::Merging // UpdateState::MergeCompleted // UpdateState::MergeFailed // UpdateState::MergeNeedsReboot - UpdateState CheckMergeState(const std::function& before_cancel); - UpdateState CheckMergeState(LockedFile* lock, const std::function& before_cancel); - UpdateState CheckTargetMergeState(LockedFile* lock, const std::string& name, + MergeResult CheckMergeState(const std::function& before_cancel); + MergeResult CheckMergeState(LockedFile* lock, const std::function& before_cancel); + MergeResult CheckTargetMergeState(LockedFile* lock, const std::string& name, const SnapshotUpdateStatus& update_status); // Interact with status files under /metadata/ota/snapshots. diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp index c50435564..64a434c4e 100644 --- a/fs_mgr/libsnapshot/snapshot.cpp +++ b/fs_mgr/libsnapshot/snapshot.cpp @@ -746,32 +746,35 @@ bool SnapshotManager::InitiateMerge() { return false; } - bool rewrote_all = true; + auto reported_code = MergeFailureCode::Ok; for (const auto& snapshot : *merge_group) { // If this fails, we have no choice but to continue. Everything must // be merged. This is not an ideal state to be in, but it is safe, // because we the next boot will try again. - if (!SwitchSnapshotToMerge(lock.get(), snapshot)) { + auto code = SwitchSnapshotToMerge(lock.get(), snapshot); + if (code != MergeFailureCode::Ok) { LOG(ERROR) << "Failed to switch snapshot to a merge target: " << snapshot; - rewrote_all = false; + if (reported_code == MergeFailureCode::Ok) { + reported_code = code; + } } } // If we couldn't switch everything to a merge target, pre-emptively mark // this merge as failed. It will get acknowledged when WaitForMerge() is // called. - if (!rewrote_all) { - WriteUpdateState(lock.get(), UpdateState::MergeFailed); + if (reported_code != MergeFailureCode::Ok) { + WriteUpdateState(lock.get(), UpdateState::MergeFailed, reported_code); } // Return true no matter what, because a merge was initiated. return true; } -bool SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& name) { +MergeFailureCode SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& name) { SnapshotStatus status; if (!ReadSnapshotStatus(lock, name, &status)) { - return false; + return MergeFailureCode::ReadStatus; } if (status.state() != SnapshotState::CREATED) { LOG(WARNING) << "Snapshot " << name @@ -780,8 +783,8 @@ bool SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& // After this, we return true because we technically did switch to a merge // target. Everything else we do here is just informational. - if (!RewriteSnapshotDeviceTable(name)) { - return false; + if (auto code = RewriteSnapshotDeviceTable(name); code != MergeFailureCode::Ok) { + return code; } status.set_state(SnapshotState::MERGING); @@ -795,26 +798,26 @@ bool SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& if (!WriteSnapshotStatus(lock, status)) { LOG(ERROR) << "Could not update status file for snapshot: " << name; } - return true; + return MergeFailureCode::Ok; } -bool SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) { +MergeFailureCode SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) { auto& dm = DeviceMapper::Instance(); std::vector old_targets; if (!dm.GetTableInfo(name, &old_targets)) { LOG(ERROR) << "Could not read snapshot device table: " << name; - return false; + return MergeFailureCode::GetTableInfo; } if (old_targets.size() != 1 || DeviceMapper::GetTargetType(old_targets[0].spec) != "snapshot") { LOG(ERROR) << "Unexpected device-mapper table for snapshot: " << name; - return false; + return MergeFailureCode::UnknownTable; } std::string base_device, cow_device; if (!DmTargetSnapshot::GetDevicesFromParams(old_targets[0].data, &base_device, &cow_device)) { LOG(ERROR) << "Could not derive underlying devices for snapshot: " << name; - return false; + return MergeFailureCode::GetTableParams; } DmTable table; @@ -822,10 +825,10 @@ bool SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) { SnapshotStorageMode::Merge, kSnapshotChunkSize); if (!dm.LoadTableAndActivate(name, table)) { LOG(ERROR) << "Could not swap device-mapper tables on snapshot device " << name; - return false; + return MergeFailureCode::ActivateNewTable; } LOG(INFO) << "Successfully switched snapshot device to a merge target: " << name; - return true; + return MergeFailureCode::Ok; } enum class TableQuery { @@ -897,20 +900,20 @@ bool SnapshotManager::QuerySnapshotStatus(const std::string& dm_name, std::strin UpdateState SnapshotManager::ProcessUpdateState(const std::function& callback, const std::function& before_cancel) { while (true) { - UpdateState state = CheckMergeState(before_cancel); - LOG(INFO) << "ProcessUpdateState handling state: " << state; + auto result = CheckMergeState(before_cancel); + LOG(INFO) << "ProcessUpdateState handling state: " << result.state; - if (state == UpdateState::MergeFailed) { - AcknowledgeMergeFailure(); + if (result.state == UpdateState::MergeFailed) { + AcknowledgeMergeFailure(result.failure_code); } - if (state != UpdateState::Merging) { + if (result.state != UpdateState::Merging) { // Either there is no merge, or the merge was finished, so no need // to keep waiting. - return state; + return result.state; } if (callback && !callback()) { - return state; + return result.state; } // This wait is not super time sensitive, so we have a relatively @@ -919,36 +922,36 @@ UpdateState SnapshotManager::ProcessUpdateState(const std::function& cal } } -UpdateState SnapshotManager::CheckMergeState(const std::function& before_cancel) { +auto SnapshotManager::CheckMergeState(const std::function& before_cancel) -> MergeResult { auto lock = LockExclusive(); if (!lock) { - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::AcquireLock); } - UpdateState state = CheckMergeState(lock.get(), before_cancel); - LOG(INFO) << "CheckMergeState for snapshots returned: " << state; + auto result = CheckMergeState(lock.get(), before_cancel); + LOG(INFO) << "CheckMergeState for snapshots returned: " << result.state; - if (state == UpdateState::MergeCompleted) { + if (result.state == UpdateState::MergeCompleted) { // Do this inside the same lock. Failures get acknowledged without the // lock, because flock() might have failed. AcknowledgeMergeSuccess(lock.get()); - } else if (state == UpdateState::Cancelled) { + } else if (result.state == UpdateState::Cancelled) { if (!device_->IsRecovery() && !RemoveAllUpdateState(lock.get(), before_cancel)) { LOG(ERROR) << "Failed to remove all update state after acknowleding cancelled update."; } } - return state; + return result; } -UpdateState SnapshotManager::CheckMergeState(LockedFile* lock, - const std::function& before_cancel) { +auto SnapshotManager::CheckMergeState(LockedFile* lock, const std::function& before_cancel) + -> MergeResult { SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); switch (update_status.state()) { case UpdateState::None: case UpdateState::MergeCompleted: // Harmless races are allowed between two callers of WaitForMerge, // so in both of these cases we just propagate the state. - return update_status.state(); + return MergeResult(update_status.state()); case UpdateState::Merging: case UpdateState::MergeNeedsReboot: @@ -963,26 +966,26 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock, // via the merge poll below, but if we never started a merge, we // need to also check here. if (HandleCancelledUpdate(lock, before_cancel)) { - return UpdateState::Cancelled; + return MergeResult(UpdateState::Cancelled); } - return update_status.state(); + return MergeResult(update_status.state()); default: - return update_status.state(); + return MergeResult(update_status.state()); } std::vector snapshots; if (!ListSnapshots(lock, &snapshots)) { - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ListSnapshots); } auto other_suffix = device_->GetOtherSlotSuffix(); bool cancelled = false; - bool failed = false; bool merging = false; bool needs_reboot = false; bool wrong_phase = false; + MergeFailureCode failure_code = MergeFailureCode::Ok; for (const auto& snapshot : snapshots) { if (android::base::EndsWith(snapshot, other_suffix)) { // This will have triggered an error message in InitiateMerge already. @@ -990,12 +993,15 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock, continue; } - UpdateState snapshot_state = CheckTargetMergeState(lock, snapshot, update_status); - LOG(INFO) << "CheckTargetMergeState for " << snapshot << " returned: " << snapshot_state; + auto result = CheckTargetMergeState(lock, snapshot, update_status); + LOG(INFO) << "CheckTargetMergeState for " << snapshot << " returned: " << result.state; - switch (snapshot_state) { + switch (result.state) { case UpdateState::MergeFailed: - failed = true; + // Take the first failure code in case other failures compound. + if (failure_code == MergeFailureCode::Ok) { + failure_code = result.failure_code; + } break; case UpdateState::Merging: merging = true; @@ -1013,8 +1019,10 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock, break; default: LOG(ERROR) << "Unknown merge status for \"" << snapshot << "\": " - << "\"" << snapshot_state << "\""; - failed = true; + << "\"" << result.state << "\""; + if (failure_code == MergeFailureCode::Ok) { + failure_code = MergeFailureCode::UnexpectedMergeState; + } break; } } @@ -1023,24 +1031,25 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock, // Note that we handle "Merging" before we handle anything else. We // want to poll until *nothing* is merging if we can, so everything has // a chance to get marked as completed or failed. - return UpdateState::Merging; + return MergeResult(UpdateState::Merging); } - if (failed) { + if (failure_code != MergeFailureCode::Ok) { // Note: since there are many drop-out cases for failure, we acknowledge // it in WaitForMerge rather than here and elsewhere. - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, failure_code); } if (wrong_phase) { // If we got here, no other partitions are being merged, and nothing // failed to merge. It's safe to move to the next merge phase. - if (!MergeSecondPhaseSnapshots(lock)) { - return UpdateState::MergeFailed; + auto code = MergeSecondPhaseSnapshots(lock); + if (code != MergeFailureCode::Ok) { + return MergeResult(UpdateState::MergeFailed, code); } - return UpdateState::Merging; + return MergeResult(UpdateState::Merging); } if (needs_reboot) { WriteUpdateState(lock, UpdateState::MergeNeedsReboot); - return UpdateState::MergeNeedsReboot; + return MergeResult(UpdateState::MergeNeedsReboot); } if (cancelled) { // This is an edge case, that we handle as correctly as we sensibly can. @@ -1048,16 +1057,17 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock, // removed the snapshot as a result. The exact state of the update is // undefined now, but this can only happen on an unlocked device where // partitions can be flashed without wiping userdata. - return UpdateState::Cancelled; + return MergeResult(UpdateState::Cancelled); } - return UpdateState::MergeCompleted; + return MergeResult(UpdateState::MergeCompleted); } -UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& name, - const SnapshotUpdateStatus& update_status) { +auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& name, + const SnapshotUpdateStatus& update_status) + -> MergeResult { SnapshotStatus snapshot_status; if (!ReadSnapshotStatus(lock, name, &snapshot_status)) { - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ReadStatus); } std::unique_ptr current_metadata; @@ -1070,7 +1080,7 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std:: if (!current_metadata || GetMetadataPartitionState(*current_metadata, name) != MetadataPartitionState::Updated) { DeleteSnapshot(lock, name); - return UpdateState::Cancelled; + return MergeResult(UpdateState::Cancelled); } // During a check, we decided the merge was complete, but we were unable to @@ -1081,11 +1091,11 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std:: if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) { // NB: It's okay if this fails now, we gave cleanup our best effort. OnSnapshotMergeComplete(lock, name, snapshot_status); - return UpdateState::MergeCompleted; + return MergeResult(UpdateState::MergeCompleted); } LOG(ERROR) << "Expected snapshot or snapshot-merge for device: " << name; - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType); } // This check is expensive so it is only enabled for debugging. @@ -1095,29 +1105,30 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std:: std::string target_type; DmTargetSnapshot::Status status; if (!QuerySnapshotStatus(name, &target_type, &status)) { - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::QuerySnapshotStatus); } if (target_type == "snapshot" && DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE && update_status.merge_phase() == MergePhase::FIRST_PHASE) { // The snapshot is not being merged because it's in the wrong phase. - return UpdateState::None; + return MergeResult(UpdateState::None); } if (target_type != "snapshot-merge") { // We can get here if we failed to rewrite the target type in // InitiateMerge(). If we failed to create the target in first-stage // init, boot would not succeed. LOG(ERROR) << "Snapshot " << name << " has incorrect target type: " << target_type; - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ExpectedMergeTarget); } // These two values are equal when merging is complete. if (status.sectors_allocated != status.metadata_sectors) { if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) { LOG(ERROR) << "Snapshot " << name << " is merging after being marked merge-complete."; - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, + MergeFailureCode::UnmergedSectorsAfterCompletion); } - return UpdateState::Merging; + return MergeResult(UpdateState::Merging); } // Merging is done. First, update the status file to indicate the merge @@ -1130,18 +1141,18 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std:: // snapshot device for this partition. snapshot_status.set_state(SnapshotState::MERGE_COMPLETED); if (!WriteSnapshotStatus(lock, snapshot_status)) { - return UpdateState::MergeFailed; + return MergeResult(UpdateState::MergeFailed, MergeFailureCode::WriteStatus); } if (!OnSnapshotMergeComplete(lock, name, snapshot_status)) { - return UpdateState::MergeNeedsReboot; + return MergeResult(UpdateState::MergeNeedsReboot); } - return UpdateState::MergeCompleted; + return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok); } -bool SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) { +MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) { std::vector snapshots; if (!ListSnapshots(lock, &snapshots)) { - return UpdateState::MergeFailed; + return MergeFailureCode::ListSnapshots; } SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock); @@ -1150,24 +1161,27 @@ bool SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) { update_status.set_merge_phase(MergePhase::SECOND_PHASE); if (!WriteSnapshotUpdateStatus(lock, update_status)) { - return false; + return MergeFailureCode::WriteStatus; } - bool rewrote_all = true; + MergeFailureCode result = MergeFailureCode::Ok; for (const auto& snapshot : snapshots) { SnapshotStatus snapshot_status; if (!ReadSnapshotStatus(lock, snapshot, &snapshot_status)) { - return UpdateState::MergeFailed; + return MergeFailureCode::ReadStatus; } if (DecideMergePhase(snapshot_status) != MergePhase::SECOND_PHASE) { continue; } - if (!SwitchSnapshotToMerge(lock, snapshot)) { + auto code = SwitchSnapshotToMerge(lock, snapshot); + if (code != MergeFailureCode::Ok) { LOG(ERROR) << "Failed to switch snapshot to a second-phase merge target: " << snapshot; - rewrote_all = false; + if (result == MergeFailureCode::Ok) { + result = code; + } } } - return rewrote_all; + return result; } std::string SnapshotManager::GetSnapshotBootIndicatorPath() { @@ -1199,7 +1213,7 @@ void SnapshotManager::AcknowledgeMergeSuccess(LockedFile* lock) { RemoveAllUpdateState(lock); } -void SnapshotManager::AcknowledgeMergeFailure() { +void SnapshotManager::AcknowledgeMergeFailure(MergeFailureCode failure_code) { // Log first, so worst case, we always have a record of why the calls below // were being made. LOG(ERROR) << "Merge could not be completed and will be marked as failed."; @@ -1216,7 +1230,7 @@ void SnapshotManager::AcknowledgeMergeFailure() { return; } - WriteUpdateState(lock.get(), UpdateState::MergeFailed); + WriteUpdateState(lock.get(), UpdateState::MergeFailed, failure_code); } bool SnapshotManager::OnSnapshotMergeComplete(LockedFile* lock, const std::string& name, @@ -2414,10 +2428,15 @@ SnapshotUpdateStatus SnapshotManager::ReadSnapshotUpdateStatus(LockedFile* lock) return status; } -bool SnapshotManager::WriteUpdateState(LockedFile* lock, UpdateState state) { +bool SnapshotManager::WriteUpdateState(LockedFile* lock, UpdateState state, + MergeFailureCode failure_code) { SnapshotUpdateStatus status; status.set_state(state); + if (state == UpdateState::MergeFailed) { + status.set_merge_failure_code(failure_code); + } + // If we're transitioning between two valid states (eg, we're not beginning // or ending an OTA), then make sure to propagate the compression bit. if (!(state == UpdateState::Initiated || state == UpdateState::None)) { diff --git a/fs_mgr/libsnapshot/snapshot_test.cpp b/fs_mgr/libsnapshot/snapshot_test.cpp index 8fae00b75..3554dce8e 100644 --- a/fs_mgr/libsnapshot/snapshot_test.cpp +++ b/fs_mgr/libsnapshot/snapshot_test.cpp @@ -676,6 +676,18 @@ TEST_F(SnapshotTest, UpdateBootControlHal) { ASSERT_EQ(test_device->merge_status(), MergeStatus::MERGING); } +TEST_F(SnapshotTest, MergeFailureCode) { + ASSERT_TRUE(AcquireLock()); + + ASSERT_TRUE(sm->WriteUpdateState(lock_.get(), UpdateState::MergeFailed, + MergeFailureCode::ListSnapshots)); + ASSERT_EQ(test_device->merge_status(), MergeStatus::MERGING); + + SnapshotUpdateStatus status = sm->ReadSnapshotUpdateStatus(lock_.get()); + ASSERT_EQ(status.state(), UpdateState::MergeFailed); + ASSERT_EQ(status.merge_failure_code(), MergeFailureCode::ListSnapshots); +} + enum class Request { UNKNOWN, LOCK_SHARED, LOCK_EXCLUSIVE, UNLOCK, EXIT }; std::ostream& operator<<(std::ostream& os, Request request) { switch (request) {