From cffa413de4e91c62893b57959ac44b058d380e4a Mon Sep 17 00:00:00 2001 From: Akilesh Kailash Date: Mon, 2 Oct 2023 22:22:36 -0700 Subject: [PATCH] snapuserd: I/O request on overlapping blocks during snapshot-merge. This fixes the case when all the following conditions are true: 1: Incremental OTA 2: When there are sequence of overlapping COPY operations within one merge-window (510 blocks) 3: Device is rebooted when snapshot-merge is in-progress of this merge-window. When device reboots, the state of merge-window (of 510 blocks) was merge-in-progress (aka - only partial set of blocks were merged in this window thereby the state of the base device is in-complete for this window) 4: During the next boot, if there any I/O request from the filesystem which maps to the merge-window in (3): a: The data has to be retrieved from the scratch space of the COW until the snapshot-merge for that window is completed. b: Once the snapshot-merge is complete for that window, data has to be retrieved from base device. The bug was in step 4(a) wherein I/O request was getting routed to base device. This patch addresses the above flow by fixing step 4(a). A new vts test has been added to explicitly track this issue. Additionally, there is no need to re-scan the partition if partition is in merge resume path. This should cut down the overhead of the scan. Bug: 275296365 Test: 1: 100 iterations of ./vts_snapuserd_test --gtest_filter=SnapuserdTest.Snapshot_COPY_Overlap_Merge_Resume_IO_Validate_TEST 2: Incremental OTA on Pixel 6 Pro with multiple iterations of device reboot when merge is in progress Change-Id: Ib53be7f07ff192a84ec7f7049b2c6be01dad1041 Signed-off-by: Akilesh Kailash --- .../user-space-merge/snapuserd_core.cpp | 13 ++- .../user-space-merge/snapuserd_core.h | 4 + .../user-space-merge/snapuserd_readahead.cpp | 9 +- .../user-space-merge/snapuserd_test.cpp | 95 +++++++++++++++++++ .../snapuserd_transitions.cpp | 28 +++++- 5 files changed, 145 insertions(+), 4 deletions(-) diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp index c2958512d..e886ec399 100644 --- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp +++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp @@ -173,6 +173,10 @@ bool SnapshotHandler::ReadMetadata() { } SNAP_LOG(INFO) << "Merge-ops: " << header.num_merge_ops; + if (header.num_merge_ops) { + resume_merge_ = true; + SNAP_LOG(INFO) << "Resume Snapshot-merge"; + } if (!MmapMetadata()) { SNAP_LOG(ERROR) << "mmap failed"; @@ -295,6 +299,11 @@ bool SnapshotHandler::Start() { if (ra_thread_) { ra_thread_status = std::async(std::launch::async, &ReadAhead::RunThread, read_ahead_thread_.get()); + // If this is a merge-resume path, wait until RA thread is fully up as + // the data has to be re-constructed from the scratch space. + if (resume_merge_ && ShouldReconstructDataFromCow()) { + WaitForRaThreadToStart(); + } } // Launch worker threads @@ -307,7 +316,9 @@ bool SnapshotHandler::Start() { std::async(std::launch::async, &MergeWorker::Run, merge_thread_.get()); // Now that the worker threads are up, scan the partitions. - if (perform_verification_) { + // If the snapshot-merge is being resumed, there is no need to scan as the + // current slot is already marked as boot complete. + if (perform_verification_ && !resume_merge_) { update_verify_->VerifyUpdatePartition(); } diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h index e401c118e..f88406d2a 100644 --- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h +++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h @@ -147,6 +147,8 @@ class SnapshotHandler : public std::enable_shared_from_this { void WakeupMonitorMergeThread(); void WaitForMergeComplete(); bool WaitForMergeBegin(); + void RaThreadStarted(); + void WaitForRaThreadToStart(); void NotifyRAForMergeReady(); bool WaitForMergeReady(); void MergeFailed(); @@ -221,6 +223,7 @@ class SnapshotHandler : public std::enable_shared_from_this { // Read-ahead related bool populate_data_from_cow_ = false; bool ra_thread_ = false; + bool ra_thread_started_ = false; int total_ra_blocks_merged_ = 0; MERGE_IO_TRANSITION io_state_ = MERGE_IO_TRANSITION::INVALID; std::unique_ptr read_ahead_thread_; @@ -242,6 +245,7 @@ class SnapshotHandler : public std::enable_shared_from_this { bool scratch_space_ = false; int num_worker_threads_ = kNumWorkerThreads; bool perform_verification_ = true; + bool resume_merge_ = false; std::unique_ptr ring_; std::unique_ptr update_verify_; diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp index d2128c5d0..998d233d5 100644 --- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp +++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp @@ -206,6 +206,7 @@ bool ReadAhead::ReconstructDataFromCow() { return false; } + snapuserd_->RaThreadStarted(); SNAP_LOG(INFO) << "ReconstructDataFromCow success"; notify_read_ahead_failed.Cancel(); return true; @@ -716,9 +717,13 @@ bool ReadAhead::ReadAheadIOStart() { total_ra_blocks_completed_ += total_blocks_merged_; snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged_); - // Flush the data only if we have a overlapping blocks in the region + // Flush the scratch data - Technically, we should flush only for overlapping + // blocks; However, since this region is mmap'ed, the dirty pages can still + // get flushed to disk at any random point in time. Instead, make sure + // the data in scratch is in the correct state before merge thread resumes. + // // Notify the Merge thread to resume merging this window - if (!snapuserd_->ReadAheadIOCompleted(overlap_)) { + if (!snapuserd_->ReadAheadIOCompleted(true)) { SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed..."; snapuserd_->ReadAheadIOFailed(); return false; diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp index 620ecbd3f..65f31cf7c 100644 --- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp +++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp @@ -235,9 +235,11 @@ class SnapuserdTest : public SnapuserdTestBase { bool Merge(); void ValidateMerge(); void ReadSnapshotDeviceAndValidate(); + void ReadSnapshotAndValidateOverlappingBlocks(); void Shutdown(); void MergeInterrupt(); void MergeInterruptFixed(int duration); + void MergeInterruptAndValidate(int duration); void MergeInterruptRandomly(int max_duration); bool StartMerge(); void CheckMergeCompletion(); @@ -358,6 +360,76 @@ void SnapuserdTest::ReadSnapshotDeviceAndValidate() { ASSERT_EQ(memcmp(snapuserd_buffer.get(), (char*)orig_buffer_.get() + (size_ * 4), size_), 0); } +void SnapuserdTest::ReadSnapshotAndValidateOverlappingBlocks() { + // Open COW device + unique_fd fd(open(cow_system_->path, O_RDONLY)); + ASSERT_GE(fd, 0); + + CowReader reader; + ASSERT_TRUE(reader.Parse(fd)); + + const auto& header = reader.GetHeader(); + size_t total_mapped_addr_length = header.prefix.header_size + BUFFER_REGION_DEFAULT_SIZE; + + ASSERT_GE(header.prefix.major_version, 2); + + void* mapped_addr = mmap(NULL, total_mapped_addr_length, PROT_READ, MAP_SHARED, fd.get(), 0); + ASSERT_NE(mapped_addr, MAP_FAILED); + + bool populate_data_from_scratch = false; + struct BufferState* ra_state = + reinterpret_cast((char*)mapped_addr + header.prefix.header_size); + if (ra_state->read_ahead_state == kCowReadAheadDone) { + populate_data_from_scratch = true; + } + + size_t num_merge_ops = header.num_merge_ops; + // We have some partial merge operations completed. + // To test the merge-resume path, forcefully corrupt the data of the base + // device for the offsets where the merge is still pending. + if (num_merge_ops && populate_data_from_scratch) { + std::string corrupt_buffer(4096, 0); + // Corrupt two blocks from the point where the merge has to be resumed by + // writing down zeroe's. + // + // Now, since this is a merge-resume path, the "correct" data should be + // in the scratch space of the COW device. When there is an I/O request + // from the snapshot device, the data has to be retrieved from the + // scratch space. If not and I/O is routed to the base device, we + // may end up with corruption. + off_t corrupt_offset = (num_merge_ops + 2) * 4096; + + if (corrupt_offset < size_) { + ASSERT_EQ(android::base::WriteFullyAtOffset(base_fd_, (void*)corrupt_buffer.c_str(), + 4096, corrupt_offset), + true); + corrupt_offset -= 4096; + ASSERT_EQ(android::base::WriteFullyAtOffset(base_fd_, (void*)corrupt_buffer.c_str(), + 4096, corrupt_offset), + true); + fsync(base_fd_.get()); + } + } + + // Time to read the snapshot device. + unique_fd snapshot_fd(open(dmuser_dev_->GetPath().c_str(), O_RDONLY | O_DIRECT | O_SYNC)); + ASSERT_GE(snapshot_fd, 0); + + void* buff_addr; + ASSERT_EQ(posix_memalign(&buff_addr, 4096, size_), 0); + + std::unique_ptr snapshot_buffer(buff_addr, ::free); + + // Scan the entire snapshot device and read the data and verify data + // integrity. Since the base device was forcefully corrupted, the data from + // this scan should be retrieved from scratch space of the COW partition. + // + // Furthermore, after the merge is complete, base device data is again + // verified as the aforementioned corrupted blocks aren't persisted. + ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapshot_buffer.get(), size_, 0), true); + ASSERT_EQ(memcmp(snapshot_buffer.get(), orig_buffer_.get(), size_), 0); +} + void SnapuserdTest::CreateCowDeviceWithCopyOverlap_2() { auto writer = CreateCowDeviceInternal(); ASSERT_NE(writer, nullptr); @@ -665,6 +737,20 @@ void SnapuserdTest::MergeInterruptFixed(int duration) { ASSERT_TRUE(Merge()); } +void SnapuserdTest::MergeInterruptAndValidate(int duration) { + ASSERT_TRUE(StartMerge()); + + for (int i = 0; i < 15; i++) { + std::this_thread::sleep_for(std::chrono::milliseconds(duration)); + ASSERT_NO_FATAL_FAILURE(SimulateDaemonRestart()); + ReadSnapshotAndValidateOverlappingBlocks(); + ASSERT_TRUE(StartMerge()); + } + + ASSERT_NO_FATAL_FAILURE(SimulateDaemonRestart()); + ASSERT_TRUE(Merge()); +} + void SnapuserdTest::MergeInterrupt() { // Interrupt merge at various intervals ASSERT_TRUE(StartMerge()); @@ -761,6 +847,15 @@ TEST_F(SnapuserdTest, Snapshot_COPY_Overlap_Merge_Resume_TEST) { ValidateMerge(); } +TEST_F(SnapuserdTest, Snapshot_COPY_Overlap_Merge_Resume_IO_Validate_TEST) { + if (!harness_->HasUserDevice()) { + GTEST_SKIP() << "Skipping snapshot read; not supported"; + } + ASSERT_NO_FATAL_FAILURE(SetupCopyOverlap_2()); + ASSERT_NO_FATAL_FAILURE(MergeInterruptAndValidate(2)); + ValidateMerge(); +} + TEST_F(SnapuserdTest, Snapshot_Merge_Crash_Fixed_Ordered) { ASSERT_NO_FATAL_FAILURE(SetupOrderedOps()); ASSERT_NO_FATAL_FAILURE(MergeInterruptFixed(300)); diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp index f3e001952..8d090bf58 100644 --- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp +++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp @@ -366,6 +366,26 @@ void SnapshotHandler::WaitForMergeComplete() { } } +void SnapshotHandler::RaThreadStarted() { + std::unique_lock lock(lock_); + ra_thread_started_ = true; +} + +void SnapshotHandler::WaitForRaThreadToStart() { + auto now = std::chrono::system_clock::now(); + auto deadline = now + 3s; + { + std::unique_lock lock(lock_); + while (!ra_thread_started_) { + auto status = cv.wait_until(lock, deadline); + if (status == std::cv_status::timeout) { + SNAP_LOG(ERROR) << "Read-ahead thread did not start"; + return; + } + } + } +} + std::string SnapshotHandler::GetMergeStatus() { bool merge_not_initiated = false; bool merge_monitored = false; @@ -618,7 +638,6 @@ bool SnapshotHandler::GetRABuffer(std::unique_lock* lock, uint64_t b std::unordered_map::iterator it = read_ahead_buffer_map_.find(block); if (it == read_ahead_buffer_map_.end()) { - SNAP_LOG(ERROR) << "Block: " << block << " not found in RA buffer"; return false; } @@ -642,6 +661,13 @@ MERGE_GROUP_STATE SnapshotHandler::ProcessMergingBlock(uint64_t new_block, void* MERGE_GROUP_STATE state = blk_state->merge_state_; switch (state) { case MERGE_GROUP_STATE::GROUP_MERGE_PENDING: { + // If this is a merge-resume path, check if the data is + // available from scratch space. Data from scratch space takes + // higher precedence than from source device for overlapping + // blocks. + if (resume_merge_ && GetRABuffer(&lock, new_block, buffer)) { + return (MERGE_GROUP_STATE::GROUP_MERGE_IN_PROGRESS); + } blk_state->num_ios_in_progress += 1; // ref count [[fallthrough]]; }