From 0ec0eaa2140f3c7671442d610d6297316b314184 Mon Sep 17 00:00:00 2001 From: Tianjie Date: Thu, 26 Mar 2020 12:34:44 -0700 Subject: [PATCH] Support parsing of data descriptor The size fields in the data descriptor can be either 4 bytes or 8 bytes. This depends on if the size are read from the zip64 extended field in the local file header. This cl adds support to parse these cases. Also fix a misconception in that the uncompressed and compressed size doesn't need to exist together in the zip64 fields of the central directory. But they still need to co-exist in the fields of the local file header. Bug: 150900468 Test: unit tests pass, python tests pass Change-Id: Ia54f9bf56c85ff456ead90a136f7fddc5be5220c --- .../include/ziparchive/zip_archive.h | 4 + libziparchive/test_ziparchive_large.py | 28 ++++- libziparchive/testdata/zip64.zip | Bin 0 -> 200 bytes libziparchive/zip_archive.cc | 115 +++++++++--------- libziparchive/zip_archive_common.h | 19 ++- libziparchive/zip_archive_private.h | 18 +++ libziparchive/zip_archive_test.cc | 56 ++++++++- libziparchive/zip_writer.cc | 17 ++- 8 files changed, 179 insertions(+), 78 deletions(-) create mode 100644 libziparchive/testdata/zip64.zip diff --git a/libziparchive/include/ziparchive/zip_archive.h b/libziparchive/include/ziparchive/zip_archive.h index 435bfb6b0..4697bb752 100644 --- a/libziparchive/include/ziparchive/zip_archive.h +++ b/libziparchive/include/ziparchive/zip_archive.h @@ -77,6 +77,10 @@ struct ZipEntry { // footer. uint32_t uncompressed_length; + // If the value of uncompressed length and compressed length are stored in + // the zip64 extended info of the extra field. + bool zip64_format_size{false}; + // The offset to the start of data for this ZipEntry. off64_t offset; diff --git a/libziparchive/test_ziparchive_large.py b/libziparchive/test_ziparchive_large.py index c29c37e9d..6b82f6385 100644 --- a/libziparchive/test_ziparchive_large.py +++ b/libziparchive/test_ziparchive_large.py @@ -25,14 +25,18 @@ import zipfile import time class Zip64Test(unittest.TestCase): + @staticmethod + def _WriteFile(path, size_in_kib): + contents = os.path.basename(path)[0] * 1024 + with open(path, 'w') as f: + for it in range(0, size_in_kib): + f.write(contents) + @staticmethod def _AddEntriesToZip(output_zip, entries_dict=None): for name, size in entries_dict.items(): - contents = name[0] * 1024 file_path = tempfile.NamedTemporaryFile() - with open(file_path.name, 'w') as f: - for it in range(0, size): - f.write(contents) + Zip64Test._WriteFile(file_path.name, size) output_zip.write(file_path.name, arcname = name) def _getEntryNames(self, zip_name): @@ -93,6 +97,22 @@ class Zip64Test(unittest.TestCase): self._ExtractEntries(zip_path.name) + def test_forceDataDescriptor(self): + file_path = tempfile.NamedTemporaryFile(suffix='.txt') + # TODO create the entry > 4GiB. + self._WriteFile(file_path.name, 1024) + + zip_path = tempfile.NamedTemporaryFile(suffix='.zip') + with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as output_zip: + pass + # The fd option force writes a data descriptor + cmd = ['zip', '-fd', zip_path.name, file_path.name] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + proc.communicate() + read_names = self._getEntryNames(zip_path.name) + self.assertEquals([file_path.name[1:]], read_names) + self._ExtractEntries(zip_path.name) + if __name__ == '__main__': testsuite = unittest.TestLoader().discover( os.path.dirname(os.path.realpath(__file__))) diff --git a/libziparchive/testdata/zip64.zip b/libziparchive/testdata/zip64.zip new file mode 100644 index 0000000000000000000000000000000000000000..3f25a4c25443df6e3ad18413a4eeb8fda7db6137 GIT binary patch literal 200 zcmWIWW@gc40D)JV>jF6bHbwo10!9WA23= 0; i--) { if (scan_buffer[i] == 0x50) { uint32_t* sig_addr = reinterpret_cast(&scan_buffer[i]); - if (get_unaligned(sig_addr) == EocdRecord::kSignature) { + if (android::base::get_unaligned(sig_addr) == EocdRecord::kSignature) { ALOGV("+++ Found EOCD at buf+%d", i); break; } @@ -360,8 +358,9 @@ static ZipError ParseZip64ExtendedInfoInExtraField( // Data Size - 2 bytes uint16_t offset = 0; while (offset < extraFieldLength - 4) { - auto headerId = get_unaligned(extraFieldStart + offset); - auto dataSize = get_unaligned(extraFieldStart + offset + 2); + auto readPtr = const_cast(extraFieldStart + offset); + auto headerId = ConsumeUnaligned(&readPtr); + auto dataSize = ConsumeUnaligned(&readPtr); offset += 4; if (dataSize > extraFieldLength - offset) { @@ -376,54 +375,44 @@ static ZipError ParseZip64ExtendedInfoInExtraField( continue; } - uint16_t expectedDataSize = 0; - // We expect the extended field to include both uncompressed and compressed size. - if (zip32UncompressedSize == UINT32_MAX || zip32CompressedSize == UINT32_MAX) { - expectedDataSize += 16; + std::optional uncompressedFileSize; + std::optional compressedFileSize; + std::optional localHeaderOffset; + if (zip32UncompressedSize == UINT32_MAX) { + uncompressedFileSize = ConsumeUnaligned(&readPtr); + } + if (zip32CompressedSize == UINT32_MAX) { + compressedFileSize = ConsumeUnaligned(&readPtr); } if (zip32LocalFileHeaderOffset == UINT32_MAX) { - expectedDataSize += 8; + localHeaderOffset = ConsumeUnaligned(&readPtr); } - if (expectedDataSize == 0) { + // calculate how many bytes we read after the data size field. + size_t bytesRead = readPtr - (extraFieldStart + offset); + if (bytesRead == 0) { ALOGW("Zip: Data size should not be 0 in zip64 extended field"); return kInvalidFile; } - if (dataSize != expectedDataSize) { + if (dataSize != bytesRead) { auto localOffsetString = zip32LocalFileHeaderOffset.has_value() ? std::to_string(zip32LocalFileHeaderOffset.value()) : "missing"; - ALOGW("Zip: Invalid data size in zip64 extended field, expect %" PRIu16 ", get %" PRIu16 + ALOGW("Zip: Invalid data size in zip64 extended field, expect %zu , get %" PRIu16 ", uncompressed size %" PRIu32 ", compressed size %" PRIu32 ", local header offset %s", - expectedDataSize, dataSize, zip32UncompressedSize, zip32CompressedSize, + bytesRead, dataSize, zip32UncompressedSize, zip32CompressedSize, localOffsetString.c_str()); return kInvalidFile; } - std::optional uncompressedFileSize; - std::optional compressedFileSize; - std::optional localHeaderOffset; - if (zip32UncompressedSize == UINT32_MAX || zip32CompressedSize == UINT32_MAX) { - uncompressedFileSize = get_unaligned(extraFieldStart + offset); - compressedFileSize = get_unaligned(extraFieldStart + offset + 8); - offset += 16; - - // TODO(xunchang) Support handling file large than UINT32_MAX. It's theoretically possible - // for libz to (de)compressing file larger than UINT32_MAX. But we should use our own - // bytes counter to replace stream.total_out. - if (uncompressedFileSize.value() >= UINT32_MAX || compressedFileSize.value() >= UINT32_MAX) { - ALOGW( - "Zip: File size larger than UINT32_MAX isn't supported yet. uncompressed size %" PRIu64 - ", compressed size %" PRIu64, - uncompressedFileSize.value(), compressedFileSize.value()); - return kInvalidFile; - } - } - - if (zip32LocalFileHeaderOffset == UINT32_MAX) { - localHeaderOffset = get_unaligned(extraFieldStart + offset); - offset += 8; + // TODO(xunchang) Support handling file large than UINT32_MAX. It's theoretically possible + // for libz to (de)compressing file larger than UINT32_MAX. But we should use our own + // bytes counter to replace stream.total_out. + if ((uncompressedFileSize.has_value() && uncompressedFileSize.value() > UINT32_MAX) || + (compressedFileSize.has_value() && compressedFileSize.value() > UINT32_MAX)) { + ALOGW("Zip: File size larger than UINT32_MAX isn't supported yet"); + return kInvalidFile; } zip64Info->uncompressed_file_size = uncompressedFileSize; @@ -625,7 +614,8 @@ void CloseArchive(ZipArchiveHandle archive) { } static int32_t ValidateDataDescriptor(MappedZipFile& mapped_zip, ZipEntry* entry) { - uint8_t ddBuf[sizeof(DataDescriptor) + sizeof(DataDescriptor::kOptSignature)]; + // Maximum possible size for data descriptor: 2 * 4 + 2 * 8 = 24 bytes + uint8_t ddBuf[24]; off64_t offset = entry->offset; if (entry->method != kCompressStored) { offset += entry->compressed_length; @@ -638,18 +628,26 @@ static int32_t ValidateDataDescriptor(MappedZipFile& mapped_zip, ZipEntry* entry } const uint32_t ddSignature = *(reinterpret_cast(ddBuf)); - const uint16_t ddOffset = (ddSignature == DataDescriptor::kOptSignature) ? 4 : 0; - const DataDescriptor* descriptor = reinterpret_cast(ddBuf + ddOffset); + uint8_t* ddReadPtr = (ddSignature == DataDescriptor::kOptSignature) ? ddBuf + 4 : ddBuf; + DataDescriptor descriptor{}; + descriptor.crc32 = ConsumeUnaligned(&ddReadPtr); + if (entry->zip64_format_size) { + descriptor.compressed_size = ConsumeUnaligned(&ddReadPtr); + descriptor.uncompressed_size = ConsumeUnaligned(&ddReadPtr); + } else { + descriptor.compressed_size = ConsumeUnaligned(&ddReadPtr); + descriptor.uncompressed_size = ConsumeUnaligned(&ddReadPtr); + } // Validate that the values in the data descriptor match those in the central // directory. - if (entry->compressed_length != descriptor->compressed_size || - entry->uncompressed_length != descriptor->uncompressed_size || - entry->crc32 != descriptor->crc32) { + if (entry->compressed_length != descriptor.compressed_size || + entry->uncompressed_length != descriptor.uncompressed_size || + entry->crc32 != descriptor.crc32) { ALOGW("Zip: size/crc32 mismatch. expected {%" PRIu32 ", %" PRIu32 ", %" PRIx32 - "}, was {%" PRIu32 ", %" PRIu32 ", %" PRIx32 "}", + "}, was {%" PRIu64 ", %" PRIu64 ", %" PRIx32 "}", entry->compressed_length, entry->uncompressed_length, entry->crc32, - descriptor->compressed_size, descriptor->uncompressed_size, descriptor->crc32); + descriptor.compressed_size, descriptor.uncompressed_size, descriptor.crc32); return kInconsistentInformation; } @@ -706,18 +704,14 @@ static int32_t FindEntry(const ZipArchive* archive, std::string_view entryName, return status; } - if (cdr->uncompressed_size == UINT32_MAX || cdr->compressed_size == UINT32_MAX) { - CHECK(zip64_info.uncompressed_file_size.has_value()); - CHECK(zip64_info.compressed_file_size.has_value()); - // TODO(xunchang) remove the size limit and support entry length > UINT32_MAX. - data->uncompressed_length = static_cast(zip64_info.uncompressed_file_size.value()); - data->compressed_length = static_cast(zip64_info.compressed_file_size.value()); - } - - if (local_header_offset == UINT32_MAX) { - CHECK(zip64_info.local_header_offset.has_value()); - local_header_offset = zip64_info.local_header_offset.value(); - } + // TODO(xunchang) remove the size limit and support entry length > UINT32_MAX. + data->uncompressed_length = + static_cast(zip64_info.uncompressed_file_size.value_or(cdr->uncompressed_size)); + data->compressed_length = + static_cast(zip64_info.compressed_file_size.value_or(cdr->compressed_size)); + local_header_offset = zip64_info.local_header_offset.value_or(local_header_offset); + data->zip64_format_size = + cdr->uncompressed_size == UINT32_MAX || cdr->compressed_size == UINT32_MAX; } if (local_header_offset + static_cast(sizeof(LocalFileHeader)) >= cd_offset) { @@ -766,6 +760,13 @@ static int32_t FindEntry(const ZipArchive* archive, std::string_view entryName, uint64_t lfh_uncompressed_size = lfh->uncompressed_size; uint64_t lfh_compressed_size = lfh->compressed_size; if (lfh_uncompressed_size == UINT32_MAX || lfh_compressed_size == UINT32_MAX) { + if (lfh_uncompressed_size != UINT32_MAX || lfh_compressed_size != UINT32_MAX) { + ALOGW( + "Zip: The zip64 extended field in the local header MUST include BOTH original and " + "compressed file size fields."); + return kInvalidFile; + } + const off64_t lfh_extra_field_offset = name_offset + lfh->file_name_length; const uint16_t lfh_extra_field_size = lfh->extra_field_length; if (lfh_extra_field_offset > cd_offset - lfh_extra_field_size) { diff --git a/libziparchive/zip_archive_common.h b/libziparchive/zip_archive_common.h index a92d4d27c..d46185603 100644 --- a/libziparchive/zip_archive_common.h +++ b/libziparchive/zip_archive_common.h @@ -165,15 +165,24 @@ struct DataDescriptor { // CRC-32 checksum of the entry. uint32_t crc32; - // Compressed size of the entry. - uint32_t compressed_size; - // Uncompressed size of the entry. - uint32_t uncompressed_size; + + // For ZIP64 format archives, the compressed and uncompressed sizes are 8 + // bytes each. Also, the ZIP64 format MAY be used regardless of the size + // of a file. When extracting, if the zip64 extended information extra field + // is present for the file the compressed and uncompressed sizes will be 8 + // byte values. + + // Compressed size of the entry, the field can be either 4 bytes or 8 bytes + // in the zip file. + uint64_t compressed_size; + // Uncompressed size of the entry, the field can be either 4 bytes or 8 bytes + // in the zip file. + uint64_t uncompressed_size; private: DataDescriptor() = default; DISALLOW_COPY_AND_ASSIGN(DataDescriptor); -} __attribute__((packed)); +}; // The zip64 end of central directory locator helps to find the zip64 EOCD. struct Zip64EocdLocator { diff --git a/libziparchive/zip_archive_private.h b/libziparchive/zip_archive_private.h index 5f5232f94..4ed07aac3 100644 --- a/libziparchive/zip_archive_private.h +++ b/libziparchive/zip_archive_private.h @@ -28,6 +28,7 @@ #include "android-base/macros.h" #include "android-base/mapped_file.h" +#include "android-base/memory.h" #include "zip_cd_entry_map.h" #include "zip_error.h" @@ -104,3 +105,20 @@ struct ZipArchive { bool InitializeCentralDirectory(off64_t cd_start_offset, size_t cd_size); }; + +int32_t ExtractToWriter(ZipArchiveHandle handle, ZipEntry* entry, zip_archive::Writer* writer); + +// Reads the unaligned data of type |T| and auto increment the offset. +template +static T ConsumeUnaligned(uint8_t** address) { + auto ret = android::base::get_unaligned(*address); + *address += sizeof(T); + return ret; +} + +// Writes the unaligned data of type |T| and auto increment the offset. +template +void EmitUnaligned(uint8_t** address, T data) { + android::base::put_unaligned(*address, data); + *address += sizeof(T); +} diff --git a/libziparchive/zip_archive_test.cc b/libziparchive/zip_archive_test.cc index 69be3dfdc..356334058 100644 --- a/libziparchive/zip_archive_test.cc +++ b/libziparchive/zip_archive_test.cc @@ -979,10 +979,11 @@ class Zip64ParseTest : public ::testing::Test { std::vector extended_field; // Fake data to mimic the compressed bytes in the zipfile. std::vector compressed_bytes; + std::vector data_descriptor; size_t GetSize() const { return local_file_header.size() + file_name.size() + extended_field.size() + - compressed_bytes.size(); + compressed_bytes.size() + data_descriptor.size(); } void CopyToOutput(std::vector* output) const { @@ -990,6 +991,7 @@ class Zip64ParseTest : public ::testing::Test { std::copy(file_name.begin(), file_name.end(), std::back_inserter(*output)); std::copy(extended_field.begin(), extended_field.end(), std::back_inserter(*output)); std::copy(compressed_bytes.begin(), compressed_bytes.end(), std::back_inserter(*output)); + std::copy(data_descriptor.begin(), data_descriptor.end(), std::back_inserter(*output)); } }; @@ -1057,7 +1059,7 @@ class Zip64ParseTest : public ::testing::Test { // Add an entry to the zipfile, construct the corresponding local header and cd entry. void AddEntry(const std::string& name, const std::vector& content, bool uncompressed_size_in_extended, bool compressed_size_in_extended, - bool local_offset_in_extended) { + bool local_offset_in_extended, bool include_data_descriptor = false) { auto uncompressed_size = static_cast(content.size()); auto compressed_size = static_cast(content.size()); uint32_t local_file_header_offset = 0; @@ -1084,6 +1086,21 @@ class Zip64ParseTest : public ::testing::Test { ConstructLocalFileHeader(name, &local_entry.local_file_header, uncompressed_size, compressed_size); ConstructExtendedField(zip64_fields, &local_entry.extended_field); + if (include_data_descriptor) { + size_t descriptor_size = compressed_size_in_extended ? 24 : 16; + local_entry.data_descriptor.resize(descriptor_size); + uint8_t* write_ptr = local_entry.data_descriptor.data(); + EmitUnaligned(&write_ptr, DataDescriptor::kOptSignature); + EmitUnaligned(&write_ptr, 0 /* crc */); + if (compressed_size_in_extended) { + EmitUnaligned(&write_ptr, compressed_size_in_extended); + EmitUnaligned(&write_ptr, uncompressed_size_in_extended); + } else { + EmitUnaligned(&write_ptr, compressed_size_in_extended); + EmitUnaligned(&write_ptr, uncompressed_size_in_extended); + } + } + file_entries_.push_back(std::move(local_entry)); if (local_offset_in_extended) { @@ -1270,3 +1287,38 @@ TEST_F(Zip64ParseTest, zip64EocdWrongLocatorOffset) { 0, OpenArchiveFromMemory(zip_content_.data(), zip_content_.size(), "debug_zip64", &handle)); CloseArchive(handle); } + +TEST_F(Zip64ParseTest, extract) { + std::vector content(200, 'a'); + AddEntry("a.txt", content, true, true, true); + ConstructEocd(); + ConstructZipFile(); + + ZipArchiveHandle handle; + ASSERT_EQ( + 0, OpenArchiveFromMemory(zip_content_.data(), zip_content_.size(), "debug_zip64", &handle)); + ZipEntry entry; + ASSERT_EQ(0, FindEntry(handle, "a.txt", &entry)); + + VectorWriter writer; + ASSERT_EQ(0, ExtractToWriter(handle, &entry, &writer)); + ASSERT_EQ(content, writer.GetOutput()); +} + +TEST_F(Zip64ParseTest, extractWithDataDescriptor) { + std::vector content(300, 'b'); + AddEntry("a.txt", std::vector(200, 'a'), true, true, true); + AddEntry("b.txt", content, true, true, true, true /* data descriptor */); + ConstructEocd(); + ConstructZipFile(); + + ZipArchiveHandle handle; + ASSERT_EQ( + 0, OpenArchiveFromMemory(zip_content_.data(), zip_content_.size(), "debug_zip64", &handle)); + ZipEntry entry; + ASSERT_EQ(0, FindEntry(handle, "b.txt", &entry)); + + VectorWriter writer; + ASSERT_EQ(0, ExtractToWriter(handle, &entry, &writer)); + ASSERT_EQ(content, writer.GetOutput()); +} diff --git a/libziparchive/zip_writer.cc b/libziparchive/zip_writer.cc index 67279a6b7..25b1da4ca 100644 --- a/libziparchive/zip_writer.cc +++ b/libziparchive/zip_writer.cc @@ -475,19 +475,16 @@ int32_t ZipWriter::FinishEntry() { if (ShouldUseDataDescriptor()) { // Some versions of ZIP don't allow STORED data to have a trailing DataDescriptor. // If this file is not seekable, or if the data is compressed, write a DataDescriptor. - const uint32_t sig = DataDescriptor::kOptSignature; - if (fwrite(&sig, sizeof(sig), 1, file_) != 1) { + // We haven't supported zip64 format yet. Write both uncompressed size and compressed + // size as uint32_t. + std::vector dataDescriptor = { + DataDescriptor::kOptSignature, current_file_entry_.crc32, + current_file_entry_.compressed_size, current_file_entry_.uncompressed_size}; + if (fwrite(dataDescriptor.data(), dataDescriptor.size() * sizeof(uint32_t), 1, file_) != 1) { return HandleError(kIoError); } - DataDescriptor dd = {}; - dd.crc32 = current_file_entry_.crc32; - dd.compressed_size = current_file_entry_.compressed_size; - dd.uncompressed_size = current_file_entry_.uncompressed_size; - if (fwrite(&dd, sizeof(dd), 1, file_) != 1) { - return HandleError(kIoError); - } - current_offset_ += sizeof(DataDescriptor::kOptSignature) + sizeof(dd); + current_offset_ += sizeof(uint32_t) * dataDescriptor.size(); } else { // Seek back to the header and rewrite to include the size. if (fseeko(file_, current_file_entry_.local_file_header_offset, SEEK_SET) != 0) {