class FormatParser::ZIPParser::FileReader
A very barebones ZIP file reader
Constants
- C_UINT16LE
- C_UINT32LE
- C_UINT64LE
- Error
- InvalidCentralDirectory
- InvalidStructure
- LocalHeaderPending
- MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
To prevent too many tiny reads, read the maximum possible size of end of central directory record upfront (all the fixed fields + at most 0xFFFF bytes of the archive comment)
- MAX_LOCAL_HEADER_SIZE
To prevent too many tiny reads, read the maximum possible size of the local file header upfront. The maximum size is all the usual items, plus the maximum size of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)
- MissingEOCD
- ReadError
- SIZE_OF_USABLE_EOCD_RECORD
- UnsupportedFeature
Public Instance Methods
Parse an IO handle to a ZIP archive into an array of Entry objects.
@param io [#tell, #seek, #read, #size] an IO-ish object
@return [Array<ZipEntry>] an array of entries within the ZIP being parsed
# File lib/parsers/zip_parser/file_reader.rb, line 160
# Parse an IO handle to a ZIP archive into an array of entries, one per
# central directory record.
#
# @param io [#seek, #pos, #read, #size] an IO-ish object
# @return [Array<ZipEntry>] the entries described by the central directory
# @raise [InvalidCentralDirectory] if nothing can be read at the central directory offset
def read_zip_structure(io:)
  archive_size = io.size
  eocd_at = get_eocd_offset(io, archive_size)
  zip64_eocd_at = get_zip64_eocd_location(io, eocd_at)

  # When a Zip64 EOCD record was located it is used instead of the classic EOCD record.
  entry_count, cdir_location, cdir_size =
    if zip64_eocd_at
      num_files_and_central_directory_offset_zip64(io, zip64_eocd_at)
    else
      num_files_and_central_directory_offset(io, eocd_at)
    end

  log { format('Located the central directory start at %d', cdir_location) }
  seek(io, cdir_location)

  # In zip_tricks we read the entire central directory _and_ anything behind it.
  # Strictly speaking, we should be able to read `cdir_size` bytes and not a byte more.
  # BUT! in format_parser we avoid unbounded reads, as a matter of fact they are forbidden.
  # So we will again limit ourselves to cdir_size, and we will take a cushion of 1 KB.
  cdir_bytes = io.read(cdir_size + 1024)
  raise InvalidCentralDirectory if cdir_bytes.nil?

  cdir_io = StringIO.new(cdir_bytes)
  log do
    format(
      'Read %d bytes with central directory + EOCD record and locator',
      cdir_bytes.bytesize)
  end

  entry_count.times.map do |entry_n|
    # Absolute position of this entry within the archive, used for logging only
    absolute_offset = cdir_location + cdir_io.pos
    log do
      format(
        'Reading the central directory entry %d starting at offset %d',
        entry_n, absolute_offset)
    end
    read_cdir_entry(cdir_io)
  end
end
Private Instance Methods
# File lib/parsers/zip_parser/file_reader.rb, line 350
# Collect the byte offsets of every non-overlapping occurrence of
# `of_substring` within `in_string`, scanning left to right.
#
# Both strings are compared as raw bytes, so the returned offsets are always
# byte offsets regardless of the encoding the strings are tagged with.
#
# @param of_substring [String] the byte sequence to look for
# @param in_string [String] the buffer to search
# @return [Array<Integer>] byte offsets of the matches (empty if none, or if
#   `of_substring` is empty)
def all_indices_of_substr_in_str(of_substring, in_string)
  # Binary copies make String#index operate on bytes, which keeps it in step
  # with the bytesize-based advance below.
  needle = of_substring.b
  haystack = in_string.b

  # An empty needle matches at every position without ever advancing the
  # cursor, which would loop forever.
  return [] if needle.empty?

  found_at_indices = []
  cursor = 0
  while (found_at = haystack.index(needle, cursor))
    found_at_indices << found_at
    cursor = found_at + needle.bytesize
  end
  found_at_indices
end
# File lib/parsers/zip_parser/file_reader.rb, line 224
# Read a 4-byte value from `io` and verify that it equals the given signature.
#
# @param io [#read] the IO to read the signature from
# @param signature_magic_number [Integer] the expected 32-bit signature
# @return [nil]
# @raise [InvalidStructure] if the value read differs from the expected one
def assert_signature(io, signature_magic_number)
  readback = read_4b(io)
  return if readback == signature_magic_number

  # Zero-pad both values to 8 hex digits so that any 32-bit value is shown
  # with a consistent width.
  expected = format('0x%08x', signature_magic_number)
  actual = format('0x%08x', readback)
  raise InvalidStructure, "Expected signature #{expected}, but read #{actual}"
end
# File lib/parsers/zip_parser/file_reader.rb, line 328
# Locate the End Of Central Directory (EOCD) record by scanning the tail of the archive.
#
# @param file_io [#seek, #read] the archive
# @param zip_file_size [Integer] total size of the archive in bytes
# @return [Integer] absolute offset of the EOCD signature
# @raise [MissingEOCD] if the tail cannot be read or holds no plausible EOCD record
def get_eocd_offset(file_io, zip_file_size)
  # Start reading from the _comment_ of the zip file (from the very end).
  # The maximum size of the comment is 0xFFFF (what fits in 2 bytes).
  # Archives shorter than the maximum record size are scanned from offset 0.
  scan_from = [zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE, 0].max

  # Use a soft seek (we might not be able to get as far behind in the IO as we want)
  # and a soft read (we might not be able to read as many bytes as we want)
  file_io.seek(scan_from)
  tail = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
  raise MissingEOCD unless tail

  index_in_tail = locate_eocd_signature(tail)
  raise MissingEOCD unless index_in_tail

  eocd_offset = scan_from + index_in_tail
  log { format('Found EOCD signature at offset %d', eocd_offset) }
  eocd_offset
end
Find the offset of the Zip64 EOCD record by way of the Zip64 EOCD locator. The locator is found by seeking backwards from the EOCD record in the archive by fixed offsets.
# File lib/parsers/zip_parser/file_reader.rb, line 383
# Look for the Zip64 EOCD locator at a fixed distance before the EOCD record
# and, if it is there, return the 8-byte offset value stored in it.
#
# @param file_io [#seek, #read] the archive
# @param eocd_offset [Integer] absolute offset of the EOCD record
# @return [Integer, nil] the offset read from the locator, or nil when no
#   locator is found (negative offset, wrong signature or short read)
# @raise [UnsupportedFeature] if the locator names a disk other than 0
def get_zip64_eocd_location(file_io, eocd_offset)
  locator_size =
    4 + # The signature
    4 + # Which disk has the Zip64 end of central directory record
    8 + # Offset of the zip64 central directory record
    4   # Total number of disks
  locator_offset = eocd_offset - locator_size

  log do
    format(
      'Will look for the Zip64 EOCD locator signature at offset %d',
      locator_offset)
  end

  # If the offset is negative there is certainly no Zip64 EOCD locator here
  return if locator_offset < 0

  file_io.seek(locator_offset)
  assert_signature(file_io, 0x07064b50)
  log { format('Found Zip64 EOCD locator at offset %d', locator_offset) }

  disk_number = read_4b(file_io) # number of the disk
  raise UnsupportedFeature, 'The archive spans multiple disks' unless disk_number == 0

  read_8b(file_io)
rescue ReadError, InvalidStructure
  # A failed read or a signature mismatch is treated as "no locator present"
  nil
end
# File lib/parsers/zip_parser/file_reader.rb, line 360
# Find the first EOCD signature in the buffer whose declared comment size
# accounts exactly for the bytes that follow the fixed part of the record.
#
# @param in_str [String] the tail of the archive
# @return [Integer, nil] index of the matching signature within `in_str`, or nil
def locate_eocd_signature(in_str)
  eocd_magic = 0x06054b50
  fixed_part_size = 22 # size of the EOCD record without the comment
  candidates = all_indices_of_substr_in_str([eocd_magic].pack('V'), in_str)

  candidates.each do |candidate_at|
    candidate = in_str[candidate_at..-1]
    # If the record is smaller than the minimum - we will never recover anything
    break if candidate.bytesize < fixed_part_size

    fields = candidate.unpack('VvvvvVVv')
    bytes_after_fixed_part = candidate.bytesize - fixed_part_size
    # Check the only condition for the match: the first field is the signature
    # and the last field (comment size) equals the number of trailing bytes
    return candidate_at if fields.first == eocd_magic && bytes_after_fixed_part == fields.last
  end

  # Nothing matched - return nil deliberately instead of the value of the loop
  nil
end
Is provided as a stub to be overridden in a subclass if you need it. Will report during various stages of reading. The log message is contained in the return value of `yield` in the method (the log messages are lazy-evaluated).
# File lib/parsers/zip_parser/file_reader.rb, line 468
# Logging hook that does nothing here. The message is the return value of the
# given block, so the block is never evaluated unless an override yields.
#
# Example override body:
#   $stderr.puts(yield)
#
# @return [nil]
def log
  nil
end
# File lib/parsers/zip_parser/file_reader.rb, line 448
# Read the EOCD record and extract the entry count and the central directory
# location and size from it.
#
# @param file_io [#seek, #pos, #read] the archive
# @param eocd_offset [Integer] absolute offset of the EOCD record
# @return [Array(Integer, Integer, Integer)] number of files, central
#   directory offset, central directory size (in that order)
# @raise [ReadError] if the seek or the read falls short
# @raise [InvalidStructure] if the record does not start with the EOCD signature
def num_files_and_central_directory_offset(file_io, eocd_offset)
  seek(file_io, eocd_offset)

  # The size of the EOCD record is known upfront, so use a strict read
  record = StringIO.new(read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD))
  assert_signature(record, 0x06054b50)

  # Three 2-byte fields that are not needed: number of this disk, number of
  # the disk with the EOCD record, and the number of entries in the central
  # directory of this disk
  3.times { skip_ahead_2(record) }

  total_entries = read_2b(record) # number of entries in the central directory total
  cdir_size = read_4b(record)     # size of the central directory
  cdir_offset = read_4b(record)   # start of central directory offset
  [total_entries, cdir_offset, cdir_size]
end
RuboCop (Metrics/AbcSize): the Assignment Branch Condition size for num_files_and_central_directory_offset_zip64 is too high. [21.12/15]
# File lib/parsers/zip_parser/file_reader.rb, line 412
# Read the Zip64 EOCD record and extract the entry count and the central
# directory location and size from it.
#
# @param io [#seek, #pos, #read] the archive
# @param zip64_end_of_cdir_location [Integer] absolute offset of the Zip64 EOCD record
# @return [Array(Integer, Integer, Integer)] number of files, central
#   directory offset, central directory size (in that order)
# @raise [ReadError] if the seek or a read falls short
# @raise [InvalidStructure] if the record does not start with the Zip64 EOCD signature
# @raise [UnsupportedFeature] if the record describes a multi-disk archive
def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
  seek(io, zip64_end_of_cdir_location)
  assert_signature(io, 0x06064b50)

  declared_size = read_8b(io)

  # Only the fields parsed below are needed:
  # 2 + 2 (versions) + 4 + 4 (disk numbers) + 8 + 8 (file counts) + 8 + 8 (size, offset).
  # The declared size comes from the archive itself and can be arbitrarily
  # large, so it must never be used as a read length on its own - cap it.
  usable_size = 2 + 2 + 4 + 4 + 8 + 8 + 8 + 8
  bulk = read_n(io, [declared_size, usable_size].min) # Reading in bulk is cheaper
  zip64_eocdr = StringIO.new(bulk)

  skip_ahead_2(zip64_eocdr) # version made by
  skip_ahead_2(zip64_eocdr) # version needed to extract

  disk_n = read_4b(zip64_eocdr) # number of this disk
  disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
  if disk_n != disk_n_with_eocdr
    raise UnsupportedFeature, 'The archive spans multiple disks'
  end

  num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
  num_files_total = read_8b(zip64_eocdr) # files total in the central directory
  if num_files_this_disk != num_files_total
    raise UnsupportedFeature, 'The archive spans multiple disks'
  end

  log do
    format(
      'Zip64 EOCD record states there are %d files in the archive',
      num_files_total)
  end

  central_dir_size = read_8b(zip64_eocdr) # Size of the central directory
  central_dir_offset = read_8b(zip64_eocdr) # Where the central directory starts

  [num_files_total, central_dir_offset, central_dir_size]
end
# File lib/parsers/zip_parser/file_reader.rb, line 472
# Split the "extra fields" blob into a table keyed by the 2-byte extra field ID.
# Each field is laid out as a 2-byte ID, a 2-byte size and then that many
# bytes of contents. A later field with the same ID replaces an earlier one.
#
# @param extra_fields_str [String] the raw extra fields
# @return [Hash{Integer => String}] field contents by ID
# @raise [ReadError] if the blob ends in the middle of a field
def parse_out_extra_fields(extra_fields_str)
  reader = StringIO.new(extra_fields_str)
  {}.tap do |fields_by_id|
    until reader.eof?
      field_id = read_2b(reader)
      field_size = read_2b(reader)
      fields_by_id[field_id] = read_n(reader, field_size)
    end
  end
end
# File lib/parsers/zip_parser/file_reader.rb, line 253
# Read exactly 2 bytes from `io` and decode them with the C_UINT16LE unpack pattern.
#
# @param io [#read] the IO to read from
# @return [Integer] the decoded value
# @raise [ReadError] if fewer than 2 bytes could be read
def read_2b(io)
  raw = read_n(io, 2)
  raw.unpack(C_UINT16LE).first
end
# File lib/parsers/zip_parser/file_reader.rb, line 257
# Read exactly 4 bytes from `io` and decode them with the C_UINT32LE unpack pattern.
#
# @param io [#read] the IO to read from
# @return [Integer] the decoded value
# @raise [ReadError] if fewer than 4 bytes could be read
def read_4b(io)
  raw = read_n(io, 4)
  raw.unpack(C_UINT32LE).first
end
# File lib/parsers/zip_parser/file_reader.rb, line 261
# Read exactly 8 bytes from `io` and decode them with the C_UINT64LE unpack pattern.
#
# @param io [#read] the IO to read from
# @return [Integer] the decoded value
# @raise [ReadError] if fewer than 8 bytes could be read
def read_8b(io)
  raw = read_n(io, 8)
  raw.unpack(C_UINT64LE).first
end
# File lib/parsers/zip_parser/file_reader.rb, line 265
# Read one central directory file header from `io` into a new ZipEntry.
#
# @param io [#read] an IO positioned at the start of a central directory entry
# @return [ZipEntry] the populated entry
# @raise [InvalidStructure] if the entry does not start with the expected signature
# @raise [ReadError] if the entry (or its Zip64 extra field) is truncated
def read_cdir_entry(io)
  assert_signature(io, 0x02014b50)

  entry = ZipEntry.new
  entry.made_by = read_2b(io)
  entry.version_needed_to_extract = read_2b(io)
  entry.gp_flags = read_2b(io)
  entry.storage_mode = read_2b(io)
  entry.dos_time = read_2b(io)
  entry.dos_date = read_2b(io)
  entry.crc32 = read_4b(io)
  entry.compressed_size = read_4b(io)
  entry.uncompressed_size = read_4b(io)
  filename_size = read_2b(io)
  extra_size = read_2b(io)
  comment_len = read_2b(io)
  entry.disk_number_start = read_2b(io)
  entry.internal_attrs = read_2b(io)
  entry.external_attrs = read_4b(io)
  entry.local_file_header_offset = read_4b(io)

  # The three variable-length parts follow the fixed fields in this order
  entry.filename = read_n(io, filename_size)
  extras = read_n(io, extra_size)
  entry.comment = read_n(io, comment_len)

  # Of all the extra fields we really only need the Zip64 extra (ID 1)
  zip64_extra_contents = parse_out_extra_fields(extras)[1]
  return entry unless zip64_extra_contents

  # If the Zip64 extra is present, we let it override
  # the values fetched from the conventional header
  zip64_extra = StringIO.new(zip64_extra_contents)
  log do
    format(
      'Will read Zip64 extra data for %s, %d bytes',
      entry.filename, zip64_extra.size)
  end

  # Now here be dragons. The APPNOTE specifies that
  #
  # > The order of the fields in the ZIP64 extended
  # > information record is fixed, but the fields will
  # > only appear if the corresponding Local or Central
  # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
  #
  # It means that before we read this stuff we need to check if the previously-read
  # values are at overflow, and only _then_ proceed to read them. Bah.
  entry.uncompressed_size = read_8b(zip64_extra) if entry.uncompressed_size == 0xFFFFFFFF
  entry.compressed_size = read_8b(zip64_extra) if entry.compressed_size == 0xFFFFFFFF
  entry.local_file_header_offset = read_8b(zip64_extra) if entry.local_file_header_offset == 0xFFFFFFFF

  # Disk number comes last and we can skip it anyway, since we do
  # not support multi-disk archives
  entry
end
# File lib/parsers/zip_parser/file_reader.rb, line 244
# Strict read: return exactly `n_bytes` bytes from `io` or fail.
#
# @param io [#read] the IO to read from
# @param n_bytes [Integer] the number of bytes required
# @return [String] the bytes read
# @raise [ReadError] if the IO returned nil or a different number of bytes
def read_n(io, n_bytes)
  data = io.read(n_bytes)
  raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if data.nil?
  raise ReadError, "Expected to read #{n_bytes} bytes, read #{data.bytesize}" if data.bytesize != n_bytes

  data
end
# File lib/parsers/zip_parser/file_reader.rb, line 215
# Strict seek: move `io` to an absolute position and verify that it got there.
#
# @param io [#seek, #pos] the IO to reposition
# @param absolute_pos [Integer] the target position
# @return [nil]
# @raise [ReadError] if the IO reports a different position after seeking
def seek(io, absolute_pos)
  io.seek(absolute_pos)
  landed_at = io.pos
  return if absolute_pos == landed_at

  raise ReadError, "Expected to seek to #{absolute_pos} but only got to #{landed_at}"
end
# File lib/parsers/zip_parser/file_reader.rb, line 203
# Advance `io` past a 2-byte field without reading it.
#
# @param io [#seek, #pos] the IO to advance
# @return [nil]
def skip_ahead_2(io)
  field_width = 2
  skip_ahead_n(io, field_width)
end
# File lib/parsers/zip_parser/file_reader.rb, line 207
# Advance `io` past a 4-byte field without reading it.
#
# @param io [#seek, #pos] the IO to advance
# @return [nil]
def skip_ahead_4(io)
  field_width = 4
  skip_ahead_n(io, field_width)
end
# File lib/parsers/zip_parser/file_reader.rb, line 211
# Advance `io` past an 8-byte field without reading it.
#
# @param io [#seek, #pos] the IO to advance
# @return [nil]
def skip_ahead_8(io)
  field_width = 8
  skip_ahead_n(io, field_width)
end
# File lib/parsers/zip_parser/file_reader.rb, line 233
# Strict relative seek: advance `io` by `n` bytes and verify the distance moved.
#
# @param io [#seek, #pos] the IO to advance
# @param n [Integer] the number of bytes to skip
# @return [nil]
# @raise [ReadError] if the position did not advance by exactly `n`
def skip_ahead_n(io, n)
  started_at = io.pos
  io.seek(started_at + n)
  advanced_by = io.pos - started_at
  return if advanced_by == n

  raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{advanced_by} bytes ahead"
end