snapuserd: I/O request on overlapping blocks during snapshot-merge.

This fixes the case when all the following conditions are true:

1: Incremental OTA
2: When there are sequence of overlapping COPY operations within one merge-window
   (510 blocks)
3: Device is rebooted when snapshot-merge is in-progress of this
   merge-window. When device reboots, the state of merge-window (of 510 blocks) was
   merge-in-progress (aka - only partial set of blocks were merged in
   this window thereby the state of the base device is in-complete for
   this window)
4: During the next boot, if there any I/O request from the filesystem
   which maps to the merge-window in (3):
   	a: The data has to be retrieved from the scratch space of the
	COW until the snapshot-merge for that window is completed.

	b: Once the snapshot-merge is complete for that window, data
	has to be retrieved from base device.

The bug was in step 4(a) wherein I/O request was getting routed to base
device.

This patch addresses the above flow by fixing step 4(a).

A new vts test has been added to explicitly track this issue.

Additionally, there is no need to re-scan the partition if partition is in merge resume path. This should cut down the overhead of the scan.

Bug: 275296365
Test: 1: 100 iterations of ./vts_snapuserd_test --gtest_filter=SnapuserdTest.Snapshot_COPY_Overlap_Merge_Resume_IO_Validate_TEST
2: Incremental OTA on Pixel 6 Pro with multiple iterations of device
   reboot when merge is in progress

Change-Id: Ib53be7f07ff192a84ec7f7049b2c6be01dad1041
Signed-off-by: Akilesh Kailash <akailash@google.com>
This commit is contained in:
Akilesh Kailash 2023-10-02 22:22:36 -07:00
parent 7cf712ab3b
commit cffa413de4
5 changed files with 145 additions and 4 deletions

View file

@ -173,6 +173,10 @@ bool SnapshotHandler::ReadMetadata() {
}
SNAP_LOG(INFO) << "Merge-ops: " << header.num_merge_ops;
if (header.num_merge_ops) {
resume_merge_ = true;
SNAP_LOG(INFO) << "Resume Snapshot-merge";
}
if (!MmapMetadata()) {
SNAP_LOG(ERROR) << "mmap failed";
@ -295,6 +299,11 @@ bool SnapshotHandler::Start() {
if (ra_thread_) {
ra_thread_status =
std::async(std::launch::async, &ReadAhead::RunThread, read_ahead_thread_.get());
// If this is a merge-resume path, wait until RA thread is fully up as
// the data has to be re-constructed from the scratch space.
if (resume_merge_ && ShouldReconstructDataFromCow()) {
WaitForRaThreadToStart();
}
}
// Launch worker threads
@ -307,7 +316,9 @@ bool SnapshotHandler::Start() {
std::async(std::launch::async, &MergeWorker::Run, merge_thread_.get());
// Now that the worker threads are up, scan the partitions.
if (perform_verification_) {
// If the snapshot-merge is being resumed, there is no need to scan as the
// current slot is already marked as boot complete.
if (perform_verification_ && !resume_merge_) {
update_verify_->VerifyUpdatePartition();
}

View file

@ -147,6 +147,8 @@ class SnapshotHandler : public std::enable_shared_from_this<SnapshotHandler> {
void WakeupMonitorMergeThread();
void WaitForMergeComplete();
bool WaitForMergeBegin();
void RaThreadStarted();
void WaitForRaThreadToStart();
void NotifyRAForMergeReady();
bool WaitForMergeReady();
void MergeFailed();
@ -221,6 +223,7 @@ class SnapshotHandler : public std::enable_shared_from_this<SnapshotHandler> {
// Read-ahead related
bool populate_data_from_cow_ = false;
bool ra_thread_ = false;
bool ra_thread_started_ = false;
int total_ra_blocks_merged_ = 0;
MERGE_IO_TRANSITION io_state_ = MERGE_IO_TRANSITION::INVALID;
std::unique_ptr<ReadAhead> read_ahead_thread_;
@ -242,6 +245,7 @@ class SnapshotHandler : public std::enable_shared_from_this<SnapshotHandler> {
bool scratch_space_ = false;
int num_worker_threads_ = kNumWorkerThreads;
bool perform_verification_ = true;
bool resume_merge_ = false;
std::unique_ptr<struct io_uring> ring_;
std::unique_ptr<UpdateVerify> update_verify_;

View file

@ -206,6 +206,7 @@ bool ReadAhead::ReconstructDataFromCow() {
return false;
}
snapuserd_->RaThreadStarted();
SNAP_LOG(INFO) << "ReconstructDataFromCow success";
notify_read_ahead_failed.Cancel();
return true;
@ -716,9 +717,13 @@ bool ReadAhead::ReadAheadIOStart() {
total_ra_blocks_completed_ += total_blocks_merged_;
snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged_);
// Flush the data only if we have a overlapping blocks in the region
// Flush the scratch data - Technically, we should flush only for overlapping
// blocks; However, since this region is mmap'ed, the dirty pages can still
// get flushed to disk at any random point in time. Instead, make sure
// the data in scratch is in the correct state before merge thread resumes.
//
// Notify the Merge thread to resume merging this window
if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
if (!snapuserd_->ReadAheadIOCompleted(true)) {
SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
snapuserd_->ReadAheadIOFailed();
return false;

View file

@ -235,9 +235,11 @@ class SnapuserdTest : public SnapuserdTestBase {
bool Merge();
void ValidateMerge();
void ReadSnapshotDeviceAndValidate();
void ReadSnapshotAndValidateOverlappingBlocks();
void Shutdown();
void MergeInterrupt();
void MergeInterruptFixed(int duration);
void MergeInterruptAndValidate(int duration);
void MergeInterruptRandomly(int max_duration);
bool StartMerge();
void CheckMergeCompletion();
@ -358,6 +360,76 @@ void SnapuserdTest::ReadSnapshotDeviceAndValidate() {
ASSERT_EQ(memcmp(snapuserd_buffer.get(), (char*)orig_buffer_.get() + (size_ * 4), size_), 0);
}
void SnapuserdTest::ReadSnapshotAndValidateOverlappingBlocks() {
// Open COW device
unique_fd fd(open(cow_system_->path, O_RDONLY));
ASSERT_GE(fd, 0);
CowReader reader;
ASSERT_TRUE(reader.Parse(fd));
const auto& header = reader.GetHeader();
size_t total_mapped_addr_length = header.prefix.header_size + BUFFER_REGION_DEFAULT_SIZE;
ASSERT_GE(header.prefix.major_version, 2);
void* mapped_addr = mmap(NULL, total_mapped_addr_length, PROT_READ, MAP_SHARED, fd.get(), 0);
ASSERT_NE(mapped_addr, MAP_FAILED);
bool populate_data_from_scratch = false;
struct BufferState* ra_state =
reinterpret_cast<struct BufferState*>((char*)mapped_addr + header.prefix.header_size);
if (ra_state->read_ahead_state == kCowReadAheadDone) {
populate_data_from_scratch = true;
}
size_t num_merge_ops = header.num_merge_ops;
// We have some partial merge operations completed.
// To test the merge-resume path, forcefully corrupt the data of the base
// device for the offsets where the merge is still pending.
if (num_merge_ops && populate_data_from_scratch) {
std::string corrupt_buffer(4096, 0);
// Corrupt two blocks from the point where the merge has to be resumed by
// writing down zeroe's.
//
// Now, since this is a merge-resume path, the "correct" data should be
// in the scratch space of the COW device. When there is an I/O request
// from the snapshot device, the data has to be retrieved from the
// scratch space. If not and I/O is routed to the base device, we
// may end up with corruption.
off_t corrupt_offset = (num_merge_ops + 2) * 4096;
if (corrupt_offset < size_) {
ASSERT_EQ(android::base::WriteFullyAtOffset(base_fd_, (void*)corrupt_buffer.c_str(),
4096, corrupt_offset),
true);
corrupt_offset -= 4096;
ASSERT_EQ(android::base::WriteFullyAtOffset(base_fd_, (void*)corrupt_buffer.c_str(),
4096, corrupt_offset),
true);
fsync(base_fd_.get());
}
}
// Time to read the snapshot device.
unique_fd snapshot_fd(open(dmuser_dev_->GetPath().c_str(), O_RDONLY | O_DIRECT | O_SYNC));
ASSERT_GE(snapshot_fd, 0);
void* buff_addr;
ASSERT_EQ(posix_memalign(&buff_addr, 4096, size_), 0);
std::unique_ptr<void, decltype(&::free)> snapshot_buffer(buff_addr, ::free);
// Scan the entire snapshot device and read the data and verify data
// integrity. Since the base device was forcefully corrupted, the data from
// this scan should be retrieved from scratch space of the COW partition.
//
// Furthermore, after the merge is complete, base device data is again
// verified as the aforementioned corrupted blocks aren't persisted.
ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapshot_buffer.get(), size_, 0), true);
ASSERT_EQ(memcmp(snapshot_buffer.get(), orig_buffer_.get(), size_), 0);
}
void SnapuserdTest::CreateCowDeviceWithCopyOverlap_2() {
auto writer = CreateCowDeviceInternal();
ASSERT_NE(writer, nullptr);
@ -665,6 +737,20 @@ void SnapuserdTest::MergeInterruptFixed(int duration) {
ASSERT_TRUE(Merge());
}
void SnapuserdTest::MergeInterruptAndValidate(int duration) {
ASSERT_TRUE(StartMerge());
for (int i = 0; i < 15; i++) {
std::this_thread::sleep_for(std::chrono::milliseconds(duration));
ASSERT_NO_FATAL_FAILURE(SimulateDaemonRestart());
ReadSnapshotAndValidateOverlappingBlocks();
ASSERT_TRUE(StartMerge());
}
ASSERT_NO_FATAL_FAILURE(SimulateDaemonRestart());
ASSERT_TRUE(Merge());
}
void SnapuserdTest::MergeInterrupt() {
// Interrupt merge at various intervals
ASSERT_TRUE(StartMerge());
@ -761,6 +847,15 @@ TEST_F(SnapuserdTest, Snapshot_COPY_Overlap_Merge_Resume_TEST) {
ValidateMerge();
}
TEST_F(SnapuserdTest, Snapshot_COPY_Overlap_Merge_Resume_IO_Validate_TEST) {
if (!harness_->HasUserDevice()) {
GTEST_SKIP() << "Skipping snapshot read; not supported";
}
ASSERT_NO_FATAL_FAILURE(SetupCopyOverlap_2());
ASSERT_NO_FATAL_FAILURE(MergeInterruptAndValidate(2));
ValidateMerge();
}
TEST_F(SnapuserdTest, Snapshot_Merge_Crash_Fixed_Ordered) {
ASSERT_NO_FATAL_FAILURE(SetupOrderedOps());
ASSERT_NO_FATAL_FAILURE(MergeInterruptFixed(300));

View file

@ -366,6 +366,26 @@ void SnapshotHandler::WaitForMergeComplete() {
}
}
void SnapshotHandler::RaThreadStarted() {
std::unique_lock<std::mutex> lock(lock_);
ra_thread_started_ = true;
}
void SnapshotHandler::WaitForRaThreadToStart() {
auto now = std::chrono::system_clock::now();
auto deadline = now + 3s;
{
std::unique_lock<std::mutex> lock(lock_);
while (!ra_thread_started_) {
auto status = cv.wait_until(lock, deadline);
if (status == std::cv_status::timeout) {
SNAP_LOG(ERROR) << "Read-ahead thread did not start";
return;
}
}
}
}
std::string SnapshotHandler::GetMergeStatus() {
bool merge_not_initiated = false;
bool merge_monitored = false;
@ -618,7 +638,6 @@ bool SnapshotHandler::GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t b
std::unordered_map<uint64_t, void*>::iterator it = read_ahead_buffer_map_.find(block);
if (it == read_ahead_buffer_map_.end()) {
SNAP_LOG(ERROR) << "Block: " << block << " not found in RA buffer";
return false;
}
@ -642,6 +661,13 @@ MERGE_GROUP_STATE SnapshotHandler::ProcessMergingBlock(uint64_t new_block, void*
MERGE_GROUP_STATE state = blk_state->merge_state_;
switch (state) {
case MERGE_GROUP_STATE::GROUP_MERGE_PENDING: {
// If this is a merge-resume path, check if the data is
// available from scratch space. Data from scratch space takes
// higher precedence than from source device for overlapping
// blocks.
if (resume_merge_ && GetRABuffer(&lock, new_block, buffer)) {
return (MERGE_GROUP_STATE::GROUP_MERGE_IN_PROGRESS);
}
blk_state->num_ios_in_progress += 1; // ref count
[[fallthrough]];
}