Merge "libsnapshot: Add a merge failure code to the update state."

This commit is contained in:
David Anderson 2021-04-20 16:20:48 +00:00 committed by Gerrit Code Review
commit 53a06bffe1
4 changed files with 152 additions and 87 deletions

View file

@ -139,7 +139,28 @@ enum UpdateState {
Cancelled = 7;
};
// Next: 7
// Next 14:
//
// To understand the source of each failure, read snapshot.cpp. To handle new
// sources of failure, avoid reusing an existing code; add a new code instead.
enum MergeFailureCode {
Ok = 0;
ReadStatus = 1;
GetTableInfo = 2;
UnknownTable = 3;
GetTableParams = 4;
ActivateNewTable = 5;
AcquireLock = 6;
ListSnapshots = 7;
WriteStatus = 8;
UnknownTargetType = 9;
QuerySnapshotStatus = 10;
ExpectedMergeTarget = 11;
UnmergedSectorsAfterCompletion = 12;
UnexpectedMergeState = 13;
};
// Next: 8
message SnapshotUpdateStatus {
UpdateState state = 1;
@ -160,6 +181,9 @@ message SnapshotUpdateStatus {
// Merge phase (if state == MERGING).
MergePhase merge_phase = 6;
// Merge failure code, filled if state == MergeFailed.
MergeFailureCode merge_failure_code = 7;
}
// Next: 9

View file

@ -381,6 +381,7 @@ class SnapshotManager final : public ISnapshotManager {
FRIEND_TEST(SnapshotTest, MapPartialSnapshot);
FRIEND_TEST(SnapshotTest, MapSnapshot);
FRIEND_TEST(SnapshotTest, Merge);
FRIEND_TEST(SnapshotTest, MergeFailureCode);
FRIEND_TEST(SnapshotTest, NoMergeBeforeReboot);
FRIEND_TEST(SnapshotTest, UpdateBootControlHal);
FRIEND_TEST(SnapshotUpdateTest, DaemonTransition);
@ -532,7 +533,8 @@ class SnapshotManager final : public ISnapshotManager {
// Interact with /metadata/ota/state.
UpdateState ReadUpdateState(LockedFile* file);
SnapshotUpdateStatus ReadSnapshotUpdateStatus(LockedFile* file);
bool WriteUpdateState(LockedFile* file, UpdateState state);
bool WriteUpdateState(LockedFile* file, UpdateState state,
MergeFailureCode failure_code = MergeFailureCode::Ok);
bool WriteSnapshotUpdateStatus(LockedFile* file, const SnapshotUpdateStatus& status);
std::string GetStateFilePath() const;
@ -541,12 +543,12 @@ class SnapshotManager final : public ISnapshotManager {
std::string GetMergeStateFilePath() const;
// Helpers for merging.
bool MergeSecondPhaseSnapshots(LockedFile* lock);
bool SwitchSnapshotToMerge(LockedFile* lock, const std::string& name);
bool RewriteSnapshotDeviceTable(const std::string& dm_name);
MergeFailureCode MergeSecondPhaseSnapshots(LockedFile* lock);
MergeFailureCode SwitchSnapshotToMerge(LockedFile* lock, const std::string& name);
MergeFailureCode RewriteSnapshotDeviceTable(const std::string& dm_name);
bool MarkSnapshotMergeCompleted(LockedFile* snapshot_lock, const std::string& snapshot_name);
void AcknowledgeMergeSuccess(LockedFile* lock);
void AcknowledgeMergeFailure();
void AcknowledgeMergeFailure(MergeFailureCode failure_code);
MergePhase DecideMergePhase(const SnapshotStatus& status);
std::unique_ptr<LpMetadata> ReadCurrentMetadata();
@ -573,14 +575,22 @@ class SnapshotManager final : public ISnapshotManager {
const SnapshotStatus& status);
bool CollapseSnapshotDevice(const std::string& name, const SnapshotStatus& status);
struct MergeResult {
explicit MergeResult(UpdateState state,
MergeFailureCode failure_code = MergeFailureCode::Ok)
: state(state), failure_code(failure_code) {}
UpdateState state;
MergeFailureCode failure_code;
};
// Only the following UpdateStates are used here:
// UpdateState::Merging
// UpdateState::MergeCompleted
// UpdateState::MergeFailed
// UpdateState::MergeNeedsReboot
UpdateState CheckMergeState(const std::function<bool()>& before_cancel);
UpdateState CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel);
UpdateState CheckTargetMergeState(LockedFile* lock, const std::string& name,
MergeResult CheckMergeState(const std::function<bool()>& before_cancel);
MergeResult CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel);
MergeResult CheckTargetMergeState(LockedFile* lock, const std::string& name,
const SnapshotUpdateStatus& update_status);
// Interact with status files under /metadata/ota/snapshots.

View file

@ -746,32 +746,35 @@ bool SnapshotManager::InitiateMerge() {
return false;
}
bool rewrote_all = true;
auto reported_code = MergeFailureCode::Ok;
for (const auto& snapshot : *merge_group) {
// If this fails, we have no choice but to continue. Everything must
// be merged. This is not an ideal state to be in, but it is safe,
// because we the next boot will try again.
if (!SwitchSnapshotToMerge(lock.get(), snapshot)) {
auto code = SwitchSnapshotToMerge(lock.get(), snapshot);
if (code != MergeFailureCode::Ok) {
LOG(ERROR) << "Failed to switch snapshot to a merge target: " << snapshot;
rewrote_all = false;
if (reported_code == MergeFailureCode::Ok) {
reported_code = code;
}
}
}
// If we couldn't switch everything to a merge target, pre-emptively mark
// this merge as failed. It will get acknowledged when WaitForMerge() is
// called.
if (!rewrote_all) {
WriteUpdateState(lock.get(), UpdateState::MergeFailed);
if (reported_code != MergeFailureCode::Ok) {
WriteUpdateState(lock.get(), UpdateState::MergeFailed, reported_code);
}
// Return true no matter what, because a merge was initiated.
return true;
}
bool SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& name) {
MergeFailureCode SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& name) {
SnapshotStatus status;
if (!ReadSnapshotStatus(lock, name, &status)) {
return false;
return MergeFailureCode::ReadStatus;
}
if (status.state() != SnapshotState::CREATED) {
LOG(WARNING) << "Snapshot " << name
@ -780,8 +783,8 @@ bool SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string&
// After this, we return true because we technically did switch to a merge
// target. Everything else we do here is just informational.
if (!RewriteSnapshotDeviceTable(name)) {
return false;
if (auto code = RewriteSnapshotDeviceTable(name); code != MergeFailureCode::Ok) {
return code;
}
status.set_state(SnapshotState::MERGING);
@ -795,26 +798,26 @@ bool SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string&
if (!WriteSnapshotStatus(lock, status)) {
LOG(ERROR) << "Could not update status file for snapshot: " << name;
}
return true;
return MergeFailureCode::Ok;
}
bool SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) {
MergeFailureCode SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) {
auto& dm = DeviceMapper::Instance();
std::vector<DeviceMapper::TargetInfo> old_targets;
if (!dm.GetTableInfo(name, &old_targets)) {
LOG(ERROR) << "Could not read snapshot device table: " << name;
return false;
return MergeFailureCode::GetTableInfo;
}
if (old_targets.size() != 1 || DeviceMapper::GetTargetType(old_targets[0].spec) != "snapshot") {
LOG(ERROR) << "Unexpected device-mapper table for snapshot: " << name;
return false;
return MergeFailureCode::UnknownTable;
}
std::string base_device, cow_device;
if (!DmTargetSnapshot::GetDevicesFromParams(old_targets[0].data, &base_device, &cow_device)) {
LOG(ERROR) << "Could not derive underlying devices for snapshot: " << name;
return false;
return MergeFailureCode::GetTableParams;
}
DmTable table;
@ -822,10 +825,10 @@ bool SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) {
SnapshotStorageMode::Merge, kSnapshotChunkSize);
if (!dm.LoadTableAndActivate(name, table)) {
LOG(ERROR) << "Could not swap device-mapper tables on snapshot device " << name;
return false;
return MergeFailureCode::ActivateNewTable;
}
LOG(INFO) << "Successfully switched snapshot device to a merge target: " << name;
return true;
return MergeFailureCode::Ok;
}
enum class TableQuery {
@ -897,20 +900,20 @@ bool SnapshotManager::QuerySnapshotStatus(const std::string& dm_name, std::strin
UpdateState SnapshotManager::ProcessUpdateState(const std::function<bool()>& callback,
const std::function<bool()>& before_cancel) {
while (true) {
UpdateState state = CheckMergeState(before_cancel);
LOG(INFO) << "ProcessUpdateState handling state: " << state;
auto result = CheckMergeState(before_cancel);
LOG(INFO) << "ProcessUpdateState handling state: " << result.state;
if (state == UpdateState::MergeFailed) {
AcknowledgeMergeFailure();
if (result.state == UpdateState::MergeFailed) {
AcknowledgeMergeFailure(result.failure_code);
}
if (state != UpdateState::Merging) {
if (result.state != UpdateState::Merging) {
// Either there is no merge, or the merge was finished, so no need
// to keep waiting.
return state;
return result.state;
}
if (callback && !callback()) {
return state;
return result.state;
}
// This wait is not super time sensitive, so we have a relatively
@ -919,36 +922,36 @@ UpdateState SnapshotManager::ProcessUpdateState(const std::function<bool()>& cal
}
}
UpdateState SnapshotManager::CheckMergeState(const std::function<bool()>& before_cancel) {
auto SnapshotManager::CheckMergeState(const std::function<bool()>& before_cancel) -> MergeResult {
auto lock = LockExclusive();
if (!lock) {
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::AcquireLock);
}
UpdateState state = CheckMergeState(lock.get(), before_cancel);
LOG(INFO) << "CheckMergeState for snapshots returned: " << state;
auto result = CheckMergeState(lock.get(), before_cancel);
LOG(INFO) << "CheckMergeState for snapshots returned: " << result.state;
if (state == UpdateState::MergeCompleted) {
if (result.state == UpdateState::MergeCompleted) {
// Do this inside the same lock. Failures get acknowledged without the
// lock, because flock() might have failed.
AcknowledgeMergeSuccess(lock.get());
} else if (state == UpdateState::Cancelled) {
} else if (result.state == UpdateState::Cancelled) {
if (!device_->IsRecovery() && !RemoveAllUpdateState(lock.get(), before_cancel)) {
LOG(ERROR) << "Failed to remove all update state after acknowleding cancelled update.";
}
}
return state;
return result;
}
UpdateState SnapshotManager::CheckMergeState(LockedFile* lock,
const std::function<bool()>& before_cancel) {
auto SnapshotManager::CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel)
-> MergeResult {
SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
switch (update_status.state()) {
case UpdateState::None:
case UpdateState::MergeCompleted:
// Harmless races are allowed between two callers of WaitForMerge,
// so in both of these cases we just propagate the state.
return update_status.state();
return MergeResult(update_status.state());
case UpdateState::Merging:
case UpdateState::MergeNeedsReboot:
@ -963,26 +966,26 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock,
// via the merge poll below, but if we never started a merge, we
// need to also check here.
if (HandleCancelledUpdate(lock, before_cancel)) {
return UpdateState::Cancelled;
return MergeResult(UpdateState::Cancelled);
}
return update_status.state();
return MergeResult(update_status.state());
default:
return update_status.state();
return MergeResult(update_status.state());
}
std::vector<std::string> snapshots;
if (!ListSnapshots(lock, &snapshots)) {
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ListSnapshots);
}
auto other_suffix = device_->GetOtherSlotSuffix();
bool cancelled = false;
bool failed = false;
bool merging = false;
bool needs_reboot = false;
bool wrong_phase = false;
MergeFailureCode failure_code = MergeFailureCode::Ok;
for (const auto& snapshot : snapshots) {
if (android::base::EndsWith(snapshot, other_suffix)) {
// This will have triggered an error message in InitiateMerge already.
@ -990,12 +993,15 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock,
continue;
}
UpdateState snapshot_state = CheckTargetMergeState(lock, snapshot, update_status);
LOG(INFO) << "CheckTargetMergeState for " << snapshot << " returned: " << snapshot_state;
auto result = CheckTargetMergeState(lock, snapshot, update_status);
LOG(INFO) << "CheckTargetMergeState for " << snapshot << " returned: " << result.state;
switch (snapshot_state) {
switch (result.state) {
case UpdateState::MergeFailed:
failed = true;
// Take the first failure code in case other failures compound.
if (failure_code == MergeFailureCode::Ok) {
failure_code = result.failure_code;
}
break;
case UpdateState::Merging:
merging = true;
@ -1013,8 +1019,10 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock,
break;
default:
LOG(ERROR) << "Unknown merge status for \"" << snapshot << "\": "
<< "\"" << snapshot_state << "\"";
failed = true;
<< "\"" << result.state << "\"";
if (failure_code == MergeFailureCode::Ok) {
failure_code = MergeFailureCode::UnexpectedMergeState;
}
break;
}
}
@ -1023,24 +1031,25 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock,
// Note that we handle "Merging" before we handle anything else. We
// want to poll until *nothing* is merging if we can, so everything has
// a chance to get marked as completed or failed.
return UpdateState::Merging;
return MergeResult(UpdateState::Merging);
}
if (failed) {
if (failure_code != MergeFailureCode::Ok) {
// Note: since there are many drop-out cases for failure, we acknowledge
// it in WaitForMerge rather than here and elsewhere.
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, failure_code);
}
if (wrong_phase) {
// If we got here, no other partitions are being merged, and nothing
// failed to merge. It's safe to move to the next merge phase.
if (!MergeSecondPhaseSnapshots(lock)) {
return UpdateState::MergeFailed;
auto code = MergeSecondPhaseSnapshots(lock);
if (code != MergeFailureCode::Ok) {
return MergeResult(UpdateState::MergeFailed, code);
}
return UpdateState::Merging;
return MergeResult(UpdateState::Merging);
}
if (needs_reboot) {
WriteUpdateState(lock, UpdateState::MergeNeedsReboot);
return UpdateState::MergeNeedsReboot;
return MergeResult(UpdateState::MergeNeedsReboot);
}
if (cancelled) {
// This is an edge case, that we handle as correctly as we sensibly can.
@ -1048,16 +1057,17 @@ UpdateState SnapshotManager::CheckMergeState(LockedFile* lock,
// removed the snapshot as a result. The exact state of the update is
// undefined now, but this can only happen on an unlocked device where
// partitions can be flashed without wiping userdata.
return UpdateState::Cancelled;
return MergeResult(UpdateState::Cancelled);
}
return UpdateState::MergeCompleted;
return MergeResult(UpdateState::MergeCompleted);
}
UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& name,
const SnapshotUpdateStatus& update_status) {
auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& name,
const SnapshotUpdateStatus& update_status)
-> MergeResult {
SnapshotStatus snapshot_status;
if (!ReadSnapshotStatus(lock, name, &snapshot_status)) {
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ReadStatus);
}
std::unique_ptr<LpMetadata> current_metadata;
@ -1070,7 +1080,7 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::
if (!current_metadata ||
GetMetadataPartitionState(*current_metadata, name) != MetadataPartitionState::Updated) {
DeleteSnapshot(lock, name);
return UpdateState::Cancelled;
return MergeResult(UpdateState::Cancelled);
}
// During a check, we decided the merge was complete, but we were unable to
@ -1081,11 +1091,11 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::
if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) {
// NB: It's okay if this fails now, we gave cleanup our best effort.
OnSnapshotMergeComplete(lock, name, snapshot_status);
return UpdateState::MergeCompleted;
return MergeResult(UpdateState::MergeCompleted);
}
LOG(ERROR) << "Expected snapshot or snapshot-merge for device: " << name;
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType);
}
// This check is expensive so it is only enabled for debugging.
@ -1095,29 +1105,30 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::
std::string target_type;
DmTargetSnapshot::Status status;
if (!QuerySnapshotStatus(name, &target_type, &status)) {
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::QuerySnapshotStatus);
}
if (target_type == "snapshot" &&
DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE &&
update_status.merge_phase() == MergePhase::FIRST_PHASE) {
// The snapshot is not being merged because it's in the wrong phase.
return UpdateState::None;
return MergeResult(UpdateState::None);
}
if (target_type != "snapshot-merge") {
// We can get here if we failed to rewrite the target type in
// InitiateMerge(). If we failed to create the target in first-stage
// init, boot would not succeed.
LOG(ERROR) << "Snapshot " << name << " has incorrect target type: " << target_type;
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ExpectedMergeTarget);
}
// These two values are equal when merging is complete.
if (status.sectors_allocated != status.metadata_sectors) {
if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) {
LOG(ERROR) << "Snapshot " << name << " is merging after being marked merge-complete.";
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed,
MergeFailureCode::UnmergedSectorsAfterCompletion);
}
return UpdateState::Merging;
return MergeResult(UpdateState::Merging);
}
// Merging is done. First, update the status file to indicate the merge
@ -1130,18 +1141,18 @@ UpdateState SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::
// snapshot device for this partition.
snapshot_status.set_state(SnapshotState::MERGE_COMPLETED);
if (!WriteSnapshotStatus(lock, snapshot_status)) {
return UpdateState::MergeFailed;
return MergeResult(UpdateState::MergeFailed, MergeFailureCode::WriteStatus);
}
if (!OnSnapshotMergeComplete(lock, name, snapshot_status)) {
return UpdateState::MergeNeedsReboot;
return MergeResult(UpdateState::MergeNeedsReboot);
}
return UpdateState::MergeCompleted;
return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok);
}
bool SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
std::vector<std::string> snapshots;
if (!ListSnapshots(lock, &snapshots)) {
return UpdateState::MergeFailed;
return MergeFailureCode::ListSnapshots;
}
SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
@ -1150,24 +1161,27 @@ bool SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
update_status.set_merge_phase(MergePhase::SECOND_PHASE);
if (!WriteSnapshotUpdateStatus(lock, update_status)) {
return false;
return MergeFailureCode::WriteStatus;
}
bool rewrote_all = true;
MergeFailureCode result = MergeFailureCode::Ok;
for (const auto& snapshot : snapshots) {
SnapshotStatus snapshot_status;
if (!ReadSnapshotStatus(lock, snapshot, &snapshot_status)) {
return UpdateState::MergeFailed;
return MergeFailureCode::ReadStatus;
}
if (DecideMergePhase(snapshot_status) != MergePhase::SECOND_PHASE) {
continue;
}
if (!SwitchSnapshotToMerge(lock, snapshot)) {
auto code = SwitchSnapshotToMerge(lock, snapshot);
if (code != MergeFailureCode::Ok) {
LOG(ERROR) << "Failed to switch snapshot to a second-phase merge target: " << snapshot;
rewrote_all = false;
if (result == MergeFailureCode::Ok) {
result = code;
}
}
}
return rewrote_all;
return result;
}
std::string SnapshotManager::GetSnapshotBootIndicatorPath() {
@ -1199,7 +1213,7 @@ void SnapshotManager::AcknowledgeMergeSuccess(LockedFile* lock) {
RemoveAllUpdateState(lock);
}
void SnapshotManager::AcknowledgeMergeFailure() {
void SnapshotManager::AcknowledgeMergeFailure(MergeFailureCode failure_code) {
// Log first, so worst case, we always have a record of why the calls below
// were being made.
LOG(ERROR) << "Merge could not be completed and will be marked as failed.";
@ -1216,7 +1230,7 @@ void SnapshotManager::AcknowledgeMergeFailure() {
return;
}
WriteUpdateState(lock.get(), UpdateState::MergeFailed);
WriteUpdateState(lock.get(), UpdateState::MergeFailed, failure_code);
}
bool SnapshotManager::OnSnapshotMergeComplete(LockedFile* lock, const std::string& name,
@ -2414,10 +2428,15 @@ SnapshotUpdateStatus SnapshotManager::ReadSnapshotUpdateStatus(LockedFile* lock)
return status;
}
bool SnapshotManager::WriteUpdateState(LockedFile* lock, UpdateState state) {
bool SnapshotManager::WriteUpdateState(LockedFile* lock, UpdateState state,
MergeFailureCode failure_code) {
SnapshotUpdateStatus status;
status.set_state(state);
if (state == UpdateState::MergeFailed) {
status.set_merge_failure_code(failure_code);
}
// If we're transitioning between two valid states (eg, we're not beginning
// or ending an OTA), then make sure to propagate the compression bit.
if (!(state == UpdateState::Initiated || state == UpdateState::None)) {

View file

@ -676,6 +676,18 @@ TEST_F(SnapshotTest, UpdateBootControlHal) {
ASSERT_EQ(test_device->merge_status(), MergeStatus::MERGING);
}
TEST_F(SnapshotTest, MergeFailureCode) {
ASSERT_TRUE(AcquireLock());
ASSERT_TRUE(sm->WriteUpdateState(lock_.get(), UpdateState::MergeFailed,
MergeFailureCode::ListSnapshots));
ASSERT_EQ(test_device->merge_status(), MergeStatus::MERGING);
SnapshotUpdateStatus status = sm->ReadSnapshotUpdateStatus(lock_.get());
ASSERT_EQ(status.state(), UpdateState::MergeFailed);
ASSERT_EQ(status.merge_failure_code(), MergeFailureCode::ListSnapshots);
}
enum class Request { UNKNOWN, LOCK_SHARED, LOCK_EXCLUSIVE, UNLOCK, EXIT };
std::ostream& operator<<(std::ostream& os, Request request) {
switch (request) {