Merge "Create an interface for the cd entry hash table"

This commit is contained in:
Tianjie Xu 2020-03-10 23:51:18 +00:00 committed by Gerrit Code Review
commit 73ae00bbde
2 changed files with 130 additions and 74 deletions

View file

@ -106,55 +106,79 @@ static uint32_t ComputeHash(std::string_view name) {
return static_cast<uint32_t>(std::hash<std::string_view>{}(name));
}
/*
* Convert a ZipEntry to a hash table index, verifying that it's in a
* valid range.
*/
static int64_t EntryToIndex(const ZipStringOffset* hash_table, const uint32_t hash_table_size,
std::string_view name, const uint8_t* start) {
// Convert a ZipEntry to a hash table index, verifying that it's in a valid range.
std::pair<int32_t, uint64_t> CdEntryMapZip32::GetCdEntryOffset(std::string_view name,
const uint8_t* start) const {
const uint32_t hash = ComputeHash(name);
// NOTE: (hash_table_size - 1) is guaranteed to be non-negative.
uint32_t ent = hash & (hash_table_size - 1);
while (hash_table[ent].name_offset != 0) {
if (hash_table[ent].ToStringView(start) == name) {
return ent;
uint32_t ent = hash & (hash_table_size_ - 1);
while (hash_table_[ent].name_offset != 0) {
if (hash_table_[ent].ToStringView(start) == name) {
return {0, hash_table_[ent].name_offset};
}
ent = (ent + 1) & (hash_table_size - 1);
ent = (ent + 1) & (hash_table_size_ - 1);
}
ALOGV("Zip: Unable to find entry %.*s", static_cast<int>(name.size()), name.data());
return kEntryNotFound;
return {kEntryNotFound, 0};
}
/*
* Add a new entry to the hash table.
*/
static int32_t AddToHash(ZipStringOffset* hash_table, const uint32_t hash_table_size,
std::string_view name, const uint8_t* start) {
int32_t CdEntryMapZip32::AddToMap(std::string_view name, const uint8_t* start) {
const uint64_t hash = ComputeHash(name);
uint32_t ent = hash & (hash_table_size - 1);
uint32_t ent = hash & (hash_table_size_ - 1);
/*
* We over-allocated the table, so we're guaranteed to find an empty slot.
* Further, we guarantee that the hashtable size is not 0.
*/
while (hash_table[ent].name_offset != 0) {
if (hash_table[ent].ToStringView(start) == name) {
while (hash_table_[ent].name_offset != 0) {
if (hash_table_[ent].ToStringView(start) == name) {
// We've found a duplicate entry. We don't accept duplicates.
ALOGW("Zip: Found duplicate entry %.*s", static_cast<int>(name.size()), name.data());
return kDuplicateEntry;
}
ent = (ent + 1) & (hash_table_size - 1);
ent = (ent + 1) & (hash_table_size_ - 1);
}
// `name` has already been validated before entry.
const char* start_char = reinterpret_cast<const char*>(start);
hash_table[ent].name_offset = static_cast<uint32_t>(name.data() - start_char);
hash_table[ent].name_length = static_cast<uint16_t>(name.size());
hash_table_[ent].name_offset = static_cast<uint32_t>(name.data() - start_char);
hash_table_[ent].name_length = static_cast<uint16_t>(name.size());
return 0;
}
// Rewinds the iteration cursor so that the following call to Next() starts
// scanning from the first slot of the hash table.
void CdEntryMapZip32::ResetIteration() {
  current_position_ = uint32_t{0};
}
std::pair<std::string_view, uint64_t> CdEntryMapZip32::Next(const uint8_t* cd_start) {
while (current_position_ < hash_table_size_) {
const auto& entry = hash_table_[current_position_];
current_position_ += 1;
if (entry.name_offset != 0) {
return {entry.ToStringView(cd_start), entry.name_offset};
}
}
// We have reached the end of the hash table.
return {};
}
// Sizes the table at 1 + 4/3 of |num_entries|, rounded up to a power of two,
// and allocates it zero-filled with calloc, so every slot starts out with
// name_offset == 0 (the "empty" marker the probing loops test for).
// The allocation result is not checked here; hash_table_ may hold nullptr.
//
// NOTE: hash_table_'s initializer reads hash_table_size_, which relies on
// hash_table_size_ being declared before hash_table_ in the class.
CdEntryMapZip32::CdEntryMapZip32(uint16_t num_entries)
    : hash_table_size_(RoundUpPower2(1 + (num_entries * 4) / 3)),
      hash_table_(
          reinterpret_cast<ZipStringOffset*>(calloc(hash_table_size_, sizeof(ZipStringOffset))),
          free) {}
std::unique_ptr<CdEntryMapInterface> CdEntryMapZip32::Create(uint16_t num_entries) {
auto entry_map = new CdEntryMapZip32(num_entries);
CHECK(entry_map->hash_table_ != nullptr)
<< "Zip: unable to allocate the " << entry_map->hash_table_size_
<< " entry hash_table, entry size: " << sizeof(ZipStringOffset);
return std::unique_ptr<CdEntryMapInterface>(entry_map);
}
#if defined(__BIONIC__)
uint64_t GetOwnerTag(const ZipArchive* archive) {
return android_fdsan_create_owner_tag(ANDROID_FDSAN_OWNER_TYPE_ZIPARCHIVE,
@ -168,9 +192,7 @@ ZipArchive::ZipArchive(const int fd, bool assume_ownership)
directory_offset(0),
central_directory(),
directory_map(),
num_entries(0),
hash_table_size(0),
hash_table(nullptr) {
num_entries(0) {
#if defined(__BIONIC__)
if (assume_ownership) {
android_fdsan_exchange_owner_tag(fd, 0, GetOwnerTag(this));
@ -184,9 +206,7 @@ ZipArchive::ZipArchive(const void* address, size_t length)
directory_offset(0),
central_directory(),
directory_map(),
num_entries(0),
hash_table_size(0),
hash_table(nullptr) {}
num_entries(0) {}
ZipArchive::~ZipArchive() {
if (close_file && mapped_zip.GetFileDescriptor() >= 0) {
@ -196,8 +216,6 @@ ZipArchive::~ZipArchive() {
close(mapped_zip.GetFileDescriptor());
#endif
}
free(hash_table);
}
static int32_t MapCentralDirectory0(const char* debug_file_name, ZipArchive* archive,
@ -344,12 +362,8 @@ static int32_t ParseZipArchive(ZipArchive* archive) {
* low as 50% after we round off to a power of 2. There must be at
* least one unused entry to avoid an infinite loop during creation.
*/
archive->hash_table_size = RoundUpPower2(1 + (num_entries * 4) / 3);
archive->hash_table =
reinterpret_cast<ZipStringOffset*>(calloc(archive->hash_table_size, sizeof(ZipStringOffset)));
if (archive->hash_table == nullptr) {
ALOGW("Zip: unable to allocate the %u-entry hash_table, entry size: %zu",
archive->hash_table_size, sizeof(ZipStringOffset));
archive->cd_entry_map = CdEntryMapZip32::Create(num_entries);
if (archive->cd_entry_map == nullptr) {
return kAllocationFailed;
}
@ -401,9 +415,9 @@ static int32_t ParseZipArchive(ZipArchive* archive) {
// Add the CDE filename to the hash table.
std::string_view entry_name{reinterpret_cast<const char*>(file_name), file_name_length};
const int add_result = AddToHash(archive->hash_table, archive->hash_table_size, entry_name,
archive->central_directory.GetBasePtr());
if (add_result != 0) {
if (auto add_result =
archive->cd_entry_map->AddToMap(entry_name, archive->central_directory.GetBasePtr());
add_result != 0) {
ALOGW("Zip: Error adding entry to hash table %d", add_result);
return add_result;
}
@ -514,14 +528,13 @@ static int32_t ValidateDataDescriptor(MappedZipFile& mapped_zip, ZipEntry* entry
return 0;
}
static int32_t FindEntry(const ZipArchive* archive, const int32_t ent, ZipEntry* data) {
const uint16_t nameLen = archive->hash_table[ent].name_length;
static int32_t FindEntry(const ZipArchive* archive, std::string_view entryName,
const uint64_t nameOffset, ZipEntry* data) {
// Recover the start of the central directory entry from the filename
// pointer. The filename is the first entry past the fixed-size data,
// so we can just subtract back from that.
const uint8_t* base_ptr = archive->central_directory.GetBasePtr();
const uint8_t* ptr = base_ptr + archive->hash_table[ent].name_offset;
const uint8_t* ptr = base_ptr + nameOffset;
ptr -= sizeof(CentralDirectoryRecord);
// This is the base of our mmapped region, we have to sanity check that
@ -627,8 +640,11 @@ static int32_t FindEntry(const ZipArchive* archive, const int32_t ent, ZipEntry*
// Check that the local file header name matches the declared
// name in the central directory.
CHECK_LE(entryName.size(), UINT16_MAX);
auto nameLen = static_cast<uint16_t>(entryName.size());
if (lfh->file_name_length != nameLen) {
ALOGW("Zip: lfh name length did not match central directory");
ALOGW("Zip: lfh name length did not match central directory for %s: %" PRIu16 " %" PRIu16,
std::string(entryName).c_str(), lfh->file_name_length, nameLen);
return kInconsistentInformation;
}
const off64_t name_offset = local_header_offset + sizeof(LocalFileHeader);
@ -641,9 +657,7 @@ static int32_t FindEntry(const ZipArchive* archive, const int32_t ent, ZipEntry*
ALOGW("Zip: failed reading lfh name from offset %" PRId64, static_cast<int64_t>(name_offset));
return kIoError;
}
const std::string_view entry_name =
archive->hash_table[ent].ToStringView(archive->central_directory.GetBasePtr());
if (memcmp(entry_name.data(), name_buf.data(), nameLen) != 0) {
if (memcmp(entryName.data(), name_buf.data(), nameLen) != 0) {
ALOGW("Zip: lfh name did not match central directory");
return kInconsistentInformation;
}
@ -689,7 +703,7 @@ struct IterationHandle {
int32_t StartIteration(ZipArchiveHandle archive, void** cookie_ptr,
const std::string_view optional_prefix,
const std::string_view optional_suffix) {
if (archive == NULL || archive->hash_table == NULL) {
if (archive == nullptr || archive->cd_entry_map == nullptr) {
ALOGW("Zip: Invalid ZipArchiveHandle");
return kInvalidHandle;
}
@ -700,6 +714,7 @@ int32_t StartIteration(ZipArchiveHandle archive, void** cookie_ptr,
return kInvalidEntryName;
}
archive->cd_entry_map->ResetIteration();
*cookie_ptr = new IterationHandle(archive, optional_prefix, optional_suffix);
return 0;
}
@ -715,14 +730,14 @@ int32_t FindEntry(const ZipArchiveHandle archive, const std::string_view entryNa
return kInvalidEntryName;
}
const int64_t ent = EntryToIndex(archive->hash_table, archive->hash_table_size, entryName,
archive->central_directory.GetBasePtr());
if (ent < 0) {
const auto [result, offset] =
archive->cd_entry_map->GetCdEntryOffset(entryName, archive->central_directory.GetBasePtr());
if (result != 0) {
ALOGV("Zip: Could not find entry %.*s", static_cast<int>(entryName.size()), entryName.data());
return static_cast<int32_t>(ent); // kEntryNotFound is safe to truncate.
return static_cast<int32_t>(result); // kEntryNotFound is safe to truncate.
}
// We know there are at most hash_table_size entries, safe to truncate.
return FindEntry(archive, static_cast<uint32_t>(ent), data);
return FindEntry(archive, entryName, offset, data);
}
int32_t Next(void* cookie, ZipEntry* data, std::string* name) {
@ -736,35 +751,32 @@ int32_t Next(void* cookie, ZipEntry* data, std::string* name) {
int32_t Next(void* cookie, ZipEntry* data, std::string_view* name) {
IterationHandle* handle = reinterpret_cast<IterationHandle*>(cookie);
if (handle == NULL) {
if (handle == nullptr) {
ALOGW("Zip: Null ZipArchiveHandle");
return kInvalidHandle;
}
ZipArchive* archive = handle->archive;
if (archive == NULL || archive->hash_table == NULL) {
if (archive == nullptr || archive->cd_entry_map == nullptr) {
ALOGW("Zip: Invalid ZipArchiveHandle");
return kInvalidHandle;
}
const uint32_t currentOffset = handle->position;
const uint32_t hash_table_length = archive->hash_table_size;
const ZipStringOffset* hash_table = archive->hash_table;
for (uint32_t i = currentOffset; i < hash_table_length; ++i) {
const std::string_view entry_name =
hash_table[i].ToStringView(archive->central_directory.GetBasePtr());
if (hash_table[i].name_offset != 0 && (android::base::StartsWith(entry_name, handle->prefix) &&
android::base::EndsWith(entry_name, handle->suffix))) {
handle->position = (i + 1);
const int error = FindEntry(archive, i, data);
auto entry = archive->cd_entry_map->Next(archive->central_directory.GetBasePtr());
while (entry != std::pair<std::string_view, uint64_t>()) {
const auto [entry_name, offset] = entry;
if (android::base::StartsWith(entry_name, handle->prefix) &&
android::base::EndsWith(entry_name, handle->suffix)) {
const int error = FindEntry(archive, entry_name, offset, data);
if (!error && name) {
*name = entry_name;
}
return error;
}
entry = archive->cd_entry_map->Next(archive->central_directory.GetBasePtr());
}
handle->position = 0;
archive->cd_entry_map->ResetIteration();
return kIterationEnd;
}

View file

@ -23,6 +23,7 @@
#include <unistd.h>
#include <memory>
#include <utility>
#include <vector>
#include "android-base/macros.h"
@ -140,6 +141,28 @@ class CentralDirectory {
size_t length_;
};
// Interface of the central directory (cd) entry map. The map locates a
// particular cd entry based on its filename, and can also be walked entry by
// entry through ResetIteration() / Next().
class CdEntryMapInterface {
public:
virtual ~CdEntryMapInterface() = default;
// Adds an entry to the map. |name| must point into the filename field of a
// cd entry, and |start| must point to the beginning of the central
// directory. Returns 0 on success.
virtual int32_t AddToMap(std::string_view name, const uint8_t* start) = 0;
// For the zip entry |name|, finds the offset of its filename field in the
// central directory that begins at |cd_start|. Returns a pair of
// [status, offset]. The status is 0 on success.
virtual std::pair<int32_t, uint64_t> GetCdEntryOffset(std::string_view name,
const uint8_t* cd_start) const = 0;
// Resets the iterator to the beginning of the map.
virtual void ResetIteration() = 0;
// Returns the [name, cd offset] of the current element and advances the
// iterator to point to the next element. Returns an empty
// (default-constructed) pair if we have read past the boundary.
virtual std::pair<std::string_view, uint64_t> Next(const uint8_t* cd_start) = 0;
};
/**
* More space efficient string representation of strings in an mmaped zipped
* file than std::string_view. Using std::string_view as an entry in the
@ -160,6 +183,33 @@ struct ZipStringOffset {
}
};
// This implementation of CdEntryMap uses an array hash table. It uses less
// memory than std::map, and it is used as the default implementation for zip
// archives without the zip64 extension.
class CdEntryMapZip32 : public CdEntryMapInterface {
public:
// Factory: returns a map sized for |num_entries| entries. CHECK-fails if the
// hash table cannot be allocated.
static std::unique_ptr<CdEntryMapInterface> Create(uint16_t num_entries);
// See CdEntryMapInterface for the contract of the overrides below.
int32_t AddToMap(std::string_view name, const uint8_t* start) override;
std::pair<int32_t, uint64_t> GetCdEntryOffset(std::string_view name,
const uint8_t* cd_start) const override;
void ResetIteration() override;
std::pair<std::string_view, uint64_t> Next(const uint8_t* cd_start) override;
private:
// Private so that instances are only created through Create().
explicit CdEntryMapZip32(uint16_t num_entries);
// We know how many entries are in the zip archive, so we can have a
// fixed-size hash table. We define a load factor of 0.75 and over-allocate,
// so the requested number of slots can never be higher than
// ((4 * UINT16_MAX) / 3 + 1), which safely fits into a uint32_t.
uint32_t hash_table_size_{0};
// calloc-allocated slot array, released with free(). A slot whose
// name_offset is 0 is unused.
std::unique_ptr<ZipStringOffset[], decltype(&free)> hash_table_{nullptr, free};
// The slot index at which the next call to Next() resumes scanning;
// ResetIteration() sets it back to 0.
uint32_t current_position_{0};
};
struct ZipArchive {
// open Zip archive
mutable MappedZipFile mapped_zip;
@ -172,13 +222,7 @@ struct ZipArchive {
// number of entries in the Zip archive
uint16_t num_entries;
// We know how many entries are in the Zip archive, so we can have a
// fixed-size hash table. We define a load factor of 0.75 and over
// allocate so the maximum number entries can never be higher than
// ((4 * UINT16_MAX) / 3 + 1) which can safely fit into a uint32_t.
uint32_t hash_table_size;
ZipStringOffset* hash_table;
std::unique_ptr<CdEntryMapInterface> cd_entry_map;
ZipArchive(const int fd, bool assume_ownership);
ZipArchive(const void* address, size_t length);