libsnapshot: Resume snapshot merge if snapshots are in second phase

If the device reboots when SnapshotUpdateStatus switches from
first phase to second phase, then track the transition
and resume the merge.

Bug: 374225913
Test: OTA on Pixel - Verify merge resumes when device reboots just after
first phase merge
Change-Id: I5f62a03852a4b012850b11d0c1e6b96ec0556278
Signed-off-by: Akilesh Kailash <akailash@google.com>
This commit is contained in:
Akilesh Kailash 2024-10-18 11:39:44 -07:00
parent 097b607ac6
commit 5ad59a4cff
3 changed files with 166 additions and 4 deletions

View file

@ -442,6 +442,7 @@ class SnapshotManager final : public ISnapshotManager {
FRIEND_TEST(SnapshotUpdateTest, QueryStatusError);
FRIEND_TEST(SnapshotUpdateTest, SnapshotStatusFileWithoutCow);
FRIEND_TEST(SnapshotUpdateTest, SpaceSwapUpdate);
FRIEND_TEST(SnapshotUpdateTest, InterruptMergeDuringPhaseUpdate);
FRIEND_TEST(SnapshotUpdateTest, MapAllSnapshotsWithoutSlotSwitch);
friend class SnapshotTest;
friend class SnapshotUpdateTest;

View file

@ -1343,10 +1343,25 @@ auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string&
}
if (merge_status == "snapshot" &&
DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE &&
update_status.merge_phase() == MergePhase::FIRST_PHASE) {
// The snapshot is not being merged because it's in the wrong phase.
return MergeResult(UpdateState::None);
DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE) {
if (update_status.merge_phase() == MergePhase::FIRST_PHASE) {
// The snapshot is not being merged because it's in the wrong phase.
return MergeResult(UpdateState::None);
} else {
// update_status is already in second phase but the
// snapshot_status is still not set to SnapshotState::MERGING.
//
// Resume the merge at this point. see b/374225913
LOG(INFO) << "SwitchSnapshotToMerge: " << name << " after resuming merge";
auto code = SwitchSnapshotToMerge(lock, name);
if (code != MergeFailureCode::Ok) {
LOG(ERROR) << "Failed to switch snapshot: " << name
<< " to merge during second phase";
return MergeResult(UpdateState::MergeFailed,
MergeFailureCode::UnknownTargetType);
}
return MergeResult(UpdateState::Merging);
}
}
if (merge_status == "snapshot-merge") {
@ -1442,8 +1457,14 @@ MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
return MergeFailureCode::WriteStatus;
}
auto current_slot_suffix = device_->GetSlotSuffix();
MergeFailureCode result = MergeFailureCode::Ok;
for (const auto& snapshot : snapshots) {
if (!android::base::EndsWith(snapshot, current_slot_suffix)) {
LOG(ERROR) << "Skipping invalid snapshot: " << snapshot
<< " during MergeSecondPhaseSnapshots";
continue;
}
SnapshotStatus snapshot_status;
if (!ReadSnapshotStatus(lock, snapshot, &snapshot_status)) {
return MergeFailureCode::ReadStatus;

View file

@ -1607,6 +1607,146 @@ TEST_F(SnapshotUpdateTest, SpaceSwapUpdate) {
}
}
// Test that an interrupted two-phase merge is resumed: the snapshot update
// status is forced to SECOND_PHASE after the first-phase merge of prd_b
// completes, the devices are torn down and re-created, and the merge is then
// expected to run to completion for the remaining snapshots. See b/374225913.
TEST_F(SnapshotUpdateTest, InterruptMergeDuringPhaseUpdate) {
if (!snapuserd_required_) {
// b/179111359
GTEST_SKIP() << "Skipping snapuserd test";
}
auto old_sys_size = GetSize(sys_);
auto old_prd_size = GetSize(prd_);
// Grow |sys| but shrink |prd|.
// NOTE(review): presumably this resize is what places prd_b in the first
// merge phase and sys_b in the second - confirm against DecideMergePhase().
SetSize(sys_, old_sys_size * 2);
sys_->set_estimate_cow_size(8_MiB);
SetSize(prd_, old_prd_size / 2);
prd_->set_estimate_cow_size(1_MiB);
AddOperationForPartitions();
ASSERT_TRUE(sm->BeginUpdate());
ASSERT_TRUE(sm->CreateUpdateSnapshots(manifest_));
// Check that the old partition sizes were saved correctly.
{
ASSERT_TRUE(AcquireLock());
// Take ownership of the lock so it is released at the end of this scope.
auto local_lock = std::move(lock_);
SnapshotStatus status;
ASSERT_TRUE(sm->ReadSnapshotStatus(local_lock.get(), "prd_b", &status));
ASSERT_EQ(status.old_partition_size(), 3145728);
ASSERT_TRUE(sm->ReadSnapshotStatus(local_lock.get(), "sys_b", &status));
ASSERT_EQ(status.old_partition_size(), 3145728);
}
// Write new content to the sys and vnd snapshots, and shift the blocks of
// prd_b by the old prd size.
ASSERT_TRUE(WriteSnapshotAndHash(sys_));
ASSERT_TRUE(WriteSnapshotAndHash(vnd_));
ASSERT_TRUE(ShiftAllSnapshotBlocks("prd_b", old_prd_size));
sync();
// Assert that source partitions aren't affected.
for (const auto& name : {"sys_a", "vnd_a", "prd_a"}) {
ASSERT_TRUE(IsPartitionUnchanged(name));
}
ASSERT_TRUE(sm->FinishedSnapshotWrites(false));
// Simulate shutting down the device.
ASSERT_TRUE(UnmapAll());
// After reboot, init does first stage mount.
auto init = NewManagerForFirstStageMount("_b");
ASSERT_NE(init, nullptr);
ASSERT_TRUE(init->NeedSnapshotsInFirstStageMount());
ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super", snapshot_timeout_));
// Check that the target partitions have the same content.
for (const auto& name : {"sys_b", "vnd_b", "prd_b"}) {
ASSERT_TRUE(IsPartitionUnchanged(name));
}
// Initiate the merge and wait for it to be completed.
// The remainder of the test needs a merge to run, so stop here (without
// failing) when ShouldSkipLegacyMerging() reports that it must be skipped.
if (ShouldSkipLegacyMerging()) {
LOG(INFO) << "Skipping legacy merge in test";
return;
}
ASSERT_TRUE(init->InitiateMerge());
ASSERT_EQ(init->IsSnapuserdRequired(), snapuserd_required_);
{
// Check that the merge phase is FIRST_PHASE until at least one call
// to ProcessUpdateState() occurs.
ASSERT_TRUE(AcquireLock());
auto local_lock = std::move(lock_);
auto status = init->ReadSnapshotUpdateStatus(local_lock.get());
ASSERT_EQ(status.merge_phase(), MergePhase::FIRST_PHASE);
}
// Wait until prd_b merge is completed which is part of first phase
// The wait is bounded to 6 seconds, polling once per second.
std::chrono::milliseconds timeout(6000);
auto start = std::chrono::steady_clock::now();
// Keep polling until the merge is complete or timeout is reached
while (true) {
// Query the merge status
const auto merge_status = init->snapuserd_client()->QuerySnapshotStatus("prd_b");
if (merge_status == "snapshot-merge-complete") {
break;
}
auto now = std::chrono::steady_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - start);
// Fail the test if prd_b has not reported completion within the timeout.
ASSERT_TRUE(elapsed < timeout);
// sleep for a second and allow merge to complete
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
}
// Now, forcefully update the snapshot-update status to SECOND PHASE
// This will not update the snapshot status of sys_b to MERGING
// NOTE(review): the status is only rewritten when UpdateUsesUserSnapshots()
// is true, yet the assertions below expect "user" targets unconditionally -
// confirm that the other case cannot reach this point.
if (init->UpdateUsesUserSnapshots()) {
ASSERT_TRUE(AcquireLock());
auto local_lock = std::move(lock_);
auto status = init->ReadSnapshotUpdateStatus(local_lock.get());
status.set_merge_phase(MergePhase::SECOND_PHASE);
ASSERT_TRUE(init->WriteSnapshotUpdateStatus(local_lock.get(), status));
}
// Simulate shutting down the device and creating partitions again.
// The same |init| manager instance is reused to re-create the devices.
ASSERT_TRUE(UnmapAll());
ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super", snapshot_timeout_));
// Every target partition is expected to be a snapshot device whose
// device-mapper target type is "user".
DeviceMapper::TargetInfo target;
ASSERT_TRUE(init->IsSnapshotDevice("prd_b", &target));
ASSERT_EQ(DeviceMapper::GetTargetType(target.spec), "user");
ASSERT_TRUE(init->IsSnapshotDevice("sys_b", &target));
ASSERT_EQ(DeviceMapper::GetTargetType(target.spec), "user");
ASSERT_TRUE(init->IsSnapshotDevice("vnd_b", &target));
ASSERT_EQ(DeviceMapper::GetTargetType(target.spec), "user");
// Complete the merge; "sys" and "vnd" should resume the merge
// even though merge was interrupted after update_status was updated to
// SECOND_PHASE
ASSERT_EQ(UpdateState::MergeCompleted, init->ProcessUpdateState());
// Make sure the second phase ran and deleted snapshots.
{
ASSERT_TRUE(AcquireLock());
auto local_lock = std::move(lock_);
std::vector<std::string> snapshots;
ASSERT_TRUE(init->ListSnapshots(local_lock.get(), &snapshots));
ASSERT_TRUE(snapshots.empty());
}
// Check that the target partitions have the same content after the merge.
for (const auto& name : {"sys_b", "vnd_b", "prd_b"}) {
ASSERT_TRUE(IsPartitionUnchanged(name))
<< "Content of " << name << " changes after the merge";
}
}
// Test that if new system partitions uses empty space in super, that region is not snapshotted.
TEST_F(SnapshotUpdateTest, DirectWriteEmptySpace) {
GTEST_SKIP() << "b/141889746";