libsnapshot: Use words for xor ops

Use words instead of doing xor byte-by-byte for the entire buffer.
Profiling this with the unit test shows that the xor ops are much faster.

Also handle the word size appropriately for 32-bit and 64-bit platforms.

simpleperf shows that ProcessXorOp() is at least 30% faster.
A similar improvement is seen for ProcessXorData().

Test: snapuserd_test
Bug: 369905394
Change-Id: I0bd8586f7fc1bf184f19320667b8195b07f9cdf2
Signed-off-by: Sandeep Dhavale <dhavale@google.com>
This commit is contained in:
Sandeep Dhavale 2024-09-27 13:37:18 -07:00
parent 14807185ac
commit 0d84909d66
2 changed files with 15 additions and 8 deletions

View file

@ -104,6 +104,8 @@ bool ReadWorker::ProcessCopyOp(const CowOperation* cow_op, void* buffer) {
}
bool ReadWorker::ProcessXorOp(const CowOperation* cow_op, void* buffer) {
using WordType = std::conditional_t<sizeof(void*) == sizeof(uint64_t), uint64_t, uint32_t>;
if (!ReadFromSourceDevice(cow_op, buffer)) {
return false;
}
@ -120,9 +122,12 @@ bool ReadWorker::ProcessXorOp(const CowOperation* cow_op, void* buffer) {
return false;
}
auto xor_out = reinterpret_cast<uint8_t*>(buffer);
for (size_t i = 0; i < BLOCK_SZ; i++) {
xor_out[i] ^= xor_buffer_[i];
auto xor_in = reinterpret_cast<const WordType*>(xor_buffer_.data());
auto xor_out = reinterpret_cast<WordType*>(buffer);
auto num_words = BLOCK_SZ / sizeof(WordType);
for (auto i = 0; i < num_words; i++) {
xor_out[i] ^= xor_in[i];
}
return true;
}

View file

@ -458,6 +458,7 @@ bool ReadAhead::ReapIoCompletions(int pending_ios_to_complete) {
void ReadAhead::ProcessXorData(size_t& block_xor_index, size_t& xor_index,
std::vector<const CowOperation*>& xor_op_vec, void* buffer,
loff_t& buffer_offset) {
using WordType = std::conditional_t<sizeof(void*) == sizeof(uint64_t), uint64_t, uint32_t>;
loff_t xor_buf_offset = 0;
while (block_xor_index < blocks_.size()) {
@ -470,13 +471,14 @@ void ReadAhead::ProcessXorData(size_t& block_xor_index, size_t& xor_index,
// Check if this block is an XOR op
if (xor_op->new_block == new_block) {
// Pointer to the data read from base device
uint8_t* buffer = reinterpret_cast<uint8_t*>(bufptr);
auto buffer_words = reinterpret_cast<WordType*>(bufptr);
// Get the xor'ed data read from COW device
uint8_t* xor_data = reinterpret_cast<uint8_t*>((char*)bufsink_.GetPayloadBufPtr() +
xor_buf_offset);
auto xor_data_words = reinterpret_cast<WordType*>(
(char*)bufsink_.GetPayloadBufPtr() + xor_buf_offset);
auto num_words = BLOCK_SZ / sizeof(WordType);
for (size_t byte_offset = 0; byte_offset < BLOCK_SZ; byte_offset++) {
buffer[byte_offset] ^= xor_data[byte_offset];
for (auto i = 0; i < num_words; i++) {
buffer_words[i] ^= xor_data_words[i];
}
// Move to next XOR op