From 91b351ea7be56ff8a52e8344a871604288db9bbc Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 15 Jun 2021 17:09:00 -0700 Subject: [PATCH] Perform a consistency check before deleting snapshots. If for some reason the COW state is not fully synced to disk, but dm-snapshot has flushed its pending merges, we do not want to delete snapshots. Doing so could potentially leave blocks unmerged. This situation is quite unexpected so we label it as a merge failure. The device can recover by completely syncing the COW state, and then rebooting, which will attempt to make forward progress on the merge. Bug: 190582627 Test: vts_libsnapshot_test full OTA on bramble incremental OTA on bramble Change-Id: Ib887f1d9e4397a712ed2f800cc1222cf9305a039 Merged-In: Ib887f1d9e4397a712ed2f800cc1222cf9305a039 --- .../android/snapshot/snapshot.proto | 7 ++ .../include/libsnapshot/cow_reader.h | 5 +- .../include/libsnapshot/snapshot.h | 2 + fs_mgr/libsnapshot/snapshot.cpp | 99 +++++++++++++++++-- 4 files changed, 102 insertions(+), 11 deletions(-) diff --git a/fs_mgr/libsnapshot/android/snapshot/snapshot.proto b/fs_mgr/libsnapshot/android/snapshot/snapshot.proto index 92aa55c07..9f227c970 100644 --- a/fs_mgr/libsnapshot/android/snapshot/snapshot.proto +++ b/fs_mgr/libsnapshot/android/snapshot/snapshot.proto @@ -158,6 +158,13 @@ enum MergeFailureCode { ExpectedMergeTarget = 11; UnmergedSectorsAfterCompletion = 12; UnexpectedMergeState = 13; + GetCowPathConsistencyCheck = 14; + OpenCowConsistencyCheck = 15; + ParseCowConsistencyCheck = 16; + OpenCowDirectConsistencyCheck = 17; + MemAlignConsistencyCheck = 18; + DirectReadConsistencyCheck = 19; + WrongMergeCountConsistencyCheck = 20; }; // Next: 8 diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h index 9ebcfd983..669e58ac6 100644 --- a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h +++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h @@ -143,12 +143,11 @@ class 
CowReader : public ICowReader { void InitializeMerge(); + // Number of copy, replace, and zero ops. Set if InitializeMerge is called. void set_total_data_ops(uint64_t size) { total_data_ops_ = size; } - uint64_t total_data_ops() { return total_data_ops_; } - + // Number of copy ops. Set if InitializeMerge is called. void set_copy_ops(uint64_t size) { copy_ops_ = size; } - uint64_t total_copy_ops() { return copy_ops_; } void CloseCowFd() { owned_fd_ = {}; } diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h index 603e89694..65034f71e 100644 --- a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h +++ b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h @@ -603,6 +603,8 @@ class SnapshotManager final : public ISnapshotManager { MergeResult CheckMergeState(LockedFile* lock, const std::function& before_cancel); MergeResult CheckTargetMergeState(LockedFile* lock, const std::string& name, const SnapshotUpdateStatus& update_status); + MergeFailureCode CheckMergeConsistency(LockedFile* lock, const std::string& name, + const SnapshotStatus& update_status); // Interact with status files under /metadata/ota/snapshots. bool WriteSnapshotStatus(LockedFile* lock, const SnapshotStatus& status); diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp index e2c03aedd..be732ece9 100644 --- a/fs_mgr/libsnapshot/snapshot.cpp +++ b/fs_mgr/libsnapshot/snapshot.cpp @@ -1126,6 +1126,11 @@ auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& return MergeResult(UpdateState::Merging); } + auto code = CheckMergeConsistency(lock, name, snapshot_status); + if (code != MergeFailureCode::Ok) { + return MergeResult(UpdateState::MergeFailed, code); + } + // Merging is done. First, update the status file to indicate the merge // is complete. 
We do this before calling OnSnapshotMergeComplete, even
     // though this means the write is potentially wasted work (since in the
@@ -1144,6 +1149,91 @@ auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string&
     return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok);
 }
 
+// This returns the backing device, not the dm-user layer.
+static std::string GetMappedCowDeviceName(const std::string& snapshot,
+                                          const SnapshotStatus& status) {
+    // If no partition was created (the COW exists entirely on /data), the
+    // device-mapper layering is different than if we had a partition.
+    if (status.cow_partition_size() == 0) {
+        return GetCowImageDeviceName(snapshot);
+    }
+    return GetCowName(snapshot);
+}
+
+// Compares the merged-op count in the COW header (read with O_DIRECT) against the number of
+// non-metadata ops in the COW. Returns Ok if they match or compression is not enabled.
+MergeFailureCode SnapshotManager::CheckMergeConsistency(LockedFile* lock, const std::string& name,
+                                                        const SnapshotStatus& status) {
+    CHECK(lock);
+    if (!status.compression_enabled()) {
+        // Do not try to verify old-style COWs yet.
+        return MergeFailureCode::Ok;
+    }
+
+    auto& dm = DeviceMapper::Instance();
+    std::string cow_image_name = GetMappedCowDeviceName(name, status);
+    std::string cow_image_path;
+    if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_path)) {
+        LOG(ERROR) << "Failed to get path for cow device: " << cow_image_name;
+        return MergeFailureCode::GetCowPathConsistencyCheck;
+    }
+
+    // First pass, count # of ops.
+    size_t num_ops = 0;
+    {
+        unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_CLOEXEC));
+        if (fd < 0) {
+            PLOG(ERROR) << "Failed to open " << cow_image_name;
+            return MergeFailureCode::OpenCowConsistencyCheck;
+        }
+
+        CowReader reader;
+        if (!reader.Parse(std::move(fd))) {
+            LOG(ERROR) << "Failed to parse cow " << cow_image_path;
+            return MergeFailureCode::ParseCowConsistencyCheck;
+        }
+
+        for (auto iter = reader.GetOpIter(); !iter->Done(); iter->Next()) {
+            if (!IsMetadataOp(iter->Get())) {
+                num_ops++;
+            }
+        }
+    }
+
+    // Second pass, try as hard as we can to read the op count the system thinks is merged.
+    unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_DIRECT | O_SYNC | O_CLOEXEC));
+    if (fd < 0) {
+        PLOG(ERROR) << "Failed to open direct " << cow_image_name;
+        return MergeFailureCode::OpenCowDirectConsistencyCheck;
+    }
+
+    void* addr = nullptr;
+    size_t page_size = getpagesize();
+    // posix_memalign returns the error number and does not set errno; store it for PLOG.
+    if ((errno = posix_memalign(&addr, page_size, page_size)) != 0) {
+        PLOG(ERROR) << "posix_memalign with page size " << page_size;
+        return MergeFailureCode::MemAlignConsistencyCheck;
+    }
+
+    // COWs are always at least 2MB, this is guaranteed in snapshot creation.
+    std::unique_ptr<void, decltype(&::free)> buffer(addr, ::free);
+    if (!android::base::ReadFully(fd, buffer.get(), page_size)) {
+        PLOG(ERROR) << "Direct read failed " << cow_image_name;
+        return MergeFailureCode::DirectReadConsistencyCheck;
+    }
+
+    const auto* header = reinterpret_cast<const CowHeader*>(buffer.get());
+    if (header->num_merge_ops != num_ops) {
+        LOG(ERROR) << "COW consistency check failed, expected " << num_ops << " to be merged, "
+                   << "but " << header->num_merge_ops << " were actually recorded.";
+        LOG(ERROR) << "Aborting merge progress for snapshot " << name
+                   << ", will try again next boot";
+        return MergeFailureCode::WrongMergeCountConsistencyCheck;
+    }
+
+    return MergeFailureCode::Ok;
+}
+
 MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
     std::vector<std::string> snapshots;
     if (!ListSnapshots(lock, &snapshots)) {
@@ -1429,14 +1519,7 @@ bool SnapshotManager::PerformInitTransition(InitTransition transition,
             continue;
         }
 
-        // If no partition was created (the COW exists entirely on /data), the
-        // device-mapper layering is different than if we had a partition.
-        std::string cow_image_name;
-        if (snapshot_status.cow_partition_size() == 0) {
-            cow_image_name = GetCowImageDeviceName(snapshot);
-        } else {
-            cow_image_name = GetCowName(snapshot);
-        }
+        std::string cow_image_name = GetMappedCowDeviceName(snapshot, snapshot_status);
 
         std::string cow_image_device;
         if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_device)) {