class FormatParser::ZIPParser::FileReader

A very barebones ZIP file reader

Constants

C_UINT16LE
C_UINT32LE
C_UINT64LE
Error
InvalidCentralDirectory
InvalidStructure
LocalHeaderPending
MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE

To prevent too many tiny reads, read the maximum possible size of end of central directory record upfront (all the fixed fields + at most 0xFFFF bytes of the archive comment)

MAX_LOCAL_HEADER_SIZE

To prevent too many tiny reads, read the maximum possible size of the local file header upfront. The maximum size is all the usual items, plus the maximum size of the filename (0xFFFF bytes) and the maximum size of the extras (0xFFFF bytes)

MissingEOCD
ReadError
SIZE_OF_USABLE_EOCD_RECORD
UnsupportedFeature

Public Instance Methods

read_zip_structure(io:) click to toggle source

Parse an IO handle to a ZIP archive into an array of Entry objects.

@param io [#tell, #seek, #read, #size] an IO-ish object

@return [Array<ZipEntry>] an array of entries within the ZIP being parsed

# File lib/parsers/zip_parser/file_reader.rb, line 160
# Parses the ZIP archive behind the given IO into an array of central
# directory entries.
#
# @param io [#seek, #read, #size, #pos] an IO-ish object
# @return [Array] one entry per central directory record, in the order
#   they are stored in the central directory
# @raise [InvalidCentralDirectory] when nothing can be read at the central
#   directory location
def read_zip_structure(io:)
  eocd_offset = get_eocd_offset(io, io.size)
  zip64_eocd_location = get_zip64_eocd_location(io, eocd_offset)

  # When a Zip64 EOCD locator was found its record takes precedence over
  # the conventional EOCD record.
  entry_count, cdir_start, cdir_bytes =
    if zip64_eocd_location
      num_files_and_central_directory_offset_zip64(io, zip64_eocd_location)
    else
      num_files_and_central_directory_offset(io, eocd_offset)
    end

  log { format('Located the central directory start at %d', cdir_start) }
  seek(io, cdir_start)

  # zip_tricks reads the entire central directory _and_ anything behind it.
  # Strictly speaking, `cdir_bytes` bytes should be enough. Unbounded reads
  # are forbidden in format_parser, so the read is limited to the declared
  # central directory size plus a cushion of 1 KB.
  raw_cdir = io.read(cdir_bytes + 1024)
  raise InvalidCentralDirectory if raw_cdir.nil?

  log do
    format(
      'Read %d bytes with central directory + EOCD record and locator',
      raw_cdir.bytesize)
  end

  cdir_io = StringIO.new(raw_cdir)
  entry_count.times.map do |index|
    log do
      format(
        'Reading the central directory entry %d starting at offset %d',
        index, cdir_start + cdir_io.pos)
    end
    read_cdir_entry(cdir_io)
  end
end

Private Instance Methods

all_indices_of_substr_in_str(of_substring, in_string) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 350
# Collects the start index of every non-overlapping occurrence of a
# substring, scanning from the start of the string.
#
# @param of_substring [String] the substring to look for
# @param in_string [String] the string to search in
# @return [Array<Integer>] indices in ascending order; empty when there is
#   no match or when the substring is empty
def all_indices_of_substr_in_str(of_substring, in_string)
  # An empty substring matches at the current position without moving the
  # cursor forward, which would make the scan below loop forever.
  return [] if of_substring.empty?

  found_at_indices = []
  search_from = 0
  while (match_at = in_string.index(of_substring, search_from))
    found_at_indices << match_at
    # Resume right after the match so that occurrences never overlap
    search_from = match_at + of_substring.bytesize
  end
  found_at_indices
end
assert_signature(io, signature_magic_number) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 224
# Reads a 4-byte value from the IO and verifies that it equals the
# expected signature.
#
# @param io [#read] the IO to read the signature from
# @param signature_magic_number [Integer] the expected signature value
# @return [nil]
# @raise [InvalidStructure] when the value read differs from the expected one
def assert_signature(io, signature_magic_number)
  readback = read_4b(io)
  return if readback == signature_magic_number

  # Zero-pad both values to 8 hex digits. A fixed '0x0' prefix is only
  # right for values that happen to have 7 significant hex digits.
  raise InvalidStructure,
        format('Expected signature 0x%08x, but read 0x%08x', signature_magic_number, readback)
end
get_eocd_offset(file_io, zip_file_size) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 328
# Finds the absolute offset of the end of central directory (EOCD) record
# by scanning the tail of the file.
#
# @param file_io [#seek, #read] the IO of the archive
# @param zip_file_size [Integer] total size of the archive in bytes
# @return [Integer] absolute offset of the EOCD signature
# @raise [MissingEOCD] when the tail cannot be read or holds no EOCD record
def get_eocd_offset(file_io, zip_file_size)
  # The scan window is the last MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE
  # bytes of the file, or the whole file when it is smaller than that.
  window_start = [zip_file_size - MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE, 0].max

  # Both the seek and the read are deliberately "soft": the IO may not let
  # us go back that far, or may return fewer bytes than requested.
  file_io.seek(window_start)
  tail = file_io.read(MAX_END_OF_CENTRAL_DIRECTORY_RECORD_SIZE)
  raise MissingEOCD unless tail

  index_in_tail = locate_eocd_signature(tail)
  raise MissingEOCD unless index_in_tail

  (window_start + index_in_tail).tap do |eocd_offset|
    log { format('Found EOCD signature at offset %d', eocd_offset) }
  end
end
get_zip64_eocd_location(file_io, eocd_offset) click to toggle source

Find the Zip64 EOCD locator segment offset. Do this by seeking backwards from the EOCD record in the archive by fixed offsets.

# File lib/parsers/zip_parser/file_reader.rb, line 383
# Looks for the Zip64 EOCD locator at a fixed distance in front of the EOCD
# record and, when it is there, returns the 8-byte value that follows the
# locator's disk number field.
#
# @param file_io [#seek, #read] the IO of the archive
# @param eocd_offset [Integer] absolute offset of the EOCD record
# @return [Integer, nil] the value read from the locator, or nil when there
#   is no locator (no room for it, wrong signature, or a short read)
# @raise [UnsupportedFeature] when the locator's disk number is not 0
def get_zip64_eocd_location(file_io, eocd_offset)
  # The locator has a fixed layout:
  #   4 bytes - signature
  #   4 bytes - which disk has the Zip64 end of central directory record
  #   8 bytes - offset of the Zip64 central directory record
  #   4 bytes - total number of disks
  locator_size = 4 + 4 + 8 + 4
  locator_offset = eocd_offset - locator_size

  log do
    format(
      'Will look for the Zip64 EOCD locator signature at offset %d',
      locator_offset)
  end

  # With a negative offset there is no room for a locator before the EOCD
  return if locator_offset < 0

  file_io.seek(locator_offset)
  assert_signature(file_io, 0x07064b50)

  log { format('Found Zip64 EOCD locator at offset %d', locator_offset) }

  locator_disk = read_4b(file_io)
  raise UnsupportedFeature, 'The archive spans multiple disks' unless locator_disk == 0

  read_8b(file_io)
rescue ReadError, InvalidStructure
  # A missing or malformed locator just means "no Zip64 EOCD here"
  nil
end
locate_eocd_signature(in_str) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 360
# Finds where the EOCD record starts inside a string taken from the end of
# the archive. A candidate is accepted only when the comment size it
# declares accounts exactly for the bytes that follow its fixed fields.
#
# @param in_str [String] bytes from the tail of the archive
# @return [Integer, nil] index of the EOCD signature in the string, or nil
def locate_eocd_signature(in_str)
  magic = 0x06054b50
  fixed_fields_size = 22 # size of the fields described by the unpack pattern below
  candidates = all_indices_of_substr_in_str([magic].pack('V'), in_str)

  candidates.each do |offset|
    tail = in_str[offset..-1]
    # Candidates come in ascending order, so every later one is shorter
    # still - nothing more can be recovered once one is too short.
    return nil if tail.bytesize < fixed_fields_size

    fields = tail.unpack('VvvvvVVv')
    next unless fields.first == magic

    # The last unpacked field is the declared comment size
    return offset if fields.last == tail.bytesize - fixed_fields_size
  end

  # Explicit nil: without it the method would return the result of `each`
  nil
end
log() click to toggle source

Is provided as a stub to be overridden in a subclass if you need it. Will report during various stages of reading. The log message is contained in the return value of `yield` in the method (the log messages are lazy-evaluated).

# File lib/parsers/zip_parser/file_reader.rb, line 468
# Logging hook that does nothing by default; override it in a subclass to
# receive progress messages. The message is the return value of the block,
# so it is only built when an override actually yields.
#
# @return [nil]
def log
  # Example override body:
  # $stderr.puts(yield)
  nil
end
num_files_and_central_directory_offset(file_io, eocd_offset) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 448
# Reads the conventional EOCD record and extracts the entry count plus the
# location and size of the central directory.
#
# @param file_io [#seek, #read, #pos] the IO of the archive
# @param eocd_offset [Integer] absolute offset of the EOCD record
# @return [Array(Integer, Integer, Integer)] number of entries, central
#   directory offset and central directory size, in that order
def num_files_and_central_directory_offset(file_io, eocd_offset)
  seek(file_io, eocd_offset)

  # The usable part of the record has a known size, so read it strictly
  # in one go and parse it from memory.
  record = StringIO.new(read_n(file_io, SIZE_OF_USABLE_EOCD_RECORD))
  assert_signature(record, 0x06054b50)

  # Three 2-byte fields that are not needed: number of this disk, number of
  # the disk with the EOCD record, number of entries on this disk.
  3.times { skip_ahead_2(record) }

  total_entries = read_2b(record)    # number of entries in the central directory total
  directory_size = read_4b(record)   # size of the central directory
  directory_offset = read_4b(record) # start of central directory offset
  [total_entries, directory_offset, directory_size]
end
num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location) click to toggle source

num_files_and_central_directory_offset_zip64 is too high. [21.12/15]

# File lib/parsers/zip_parser/file_reader.rb, line 412
# Reads the Zip64 EOCD record and extracts the entry count plus the
# location and size of the central directory.
#
# @param io [#seek, #read, #pos] the IO of the archive
# @param zip64_end_of_cdir_location [Integer] absolute offset of the Zip64
#   EOCD record
# @return [Array(Integer, Integer, Integer)] number of entries, central
#   directory offset and central directory size, in that order
# @raise [UnsupportedFeature] when the record's disk numbers or per-disk
#   and total entry counts disagree
def num_files_and_central_directory_offset_zip64(io, zip64_end_of_cdir_location)
  seek(io, zip64_end_of_cdir_location)

  assert_signature(io, 0x06064b50)

  # Everything parsed below lives in the fixed-size fields that follow the
  # size field: 2 + 2 + 4 + 4 + 8 + 8 + 8 + 8 bytes.
  fixed_fields_size = 44

  zip64_eocdr_size = read_8b(io)
  # The declared size comes straight from the file and can be anything up
  # to 2**64 - 1, so never use it as a read size unchecked. Read at most
  # the fixed fields; anything after them is not needed here. A declared
  # size below the minimum still produces a short buffer, and the reads
  # below then fail exactly as they would on a truncated record.
  bytes_to_read = [zip64_eocdr_size, fixed_fields_size].min
  zip64_eocdr = StringIO.new(read_n(io, bytes_to_read)) # Reading in bulk is cheaper
  skip_ahead_2(zip64_eocdr) # version made by
  skip_ahead_2(zip64_eocdr) # version needed to extract

  disk_n = read_4b(zip64_eocdr) # number of this disk
  disk_n_with_eocdr = read_4b(zip64_eocdr) # number of the disk with the EOCDR
  if disk_n != disk_n_with_eocdr
    raise UnsupportedFeature, 'The archive spans multiple disks'
  end

  num_files_this_disk = read_8b(zip64_eocdr) # number of files on this disk
  num_files_total     = read_8b(zip64_eocdr) # files total in the central directory

  if num_files_this_disk != num_files_total
    raise UnsupportedFeature, 'The archive spans multiple disks'
  end

  log do
    format(
      'Zip64 EOCD record states there are %d files in the archive',
      num_files_total)
  end

  central_dir_size    = read_8b(zip64_eocdr) # Size of the central directory
  central_dir_offset  = read_8b(zip64_eocdr) # Where the central directory starts

  [num_files_total, central_dir_offset, central_dir_size]
end
parse_out_extra_fields(extra_fields_str) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 472
# Splits a raw "extra fields" blob into a table keyed by field ID. The blob
# is read as a sequence of records: a 2-byte ID, a 2-byte payload size and
# then the payload itself. When an ID occurs more than once the last
# payload wins.
#
# @param extra_fields_str [String] the raw extra fields bytes
# @return [Hash{Integer => String}] payloads keyed by field ID
def parse_out_extra_fields(extra_fields_str)
  reader = StringIO.new(extra_fields_str)
  {}.tap do |payload_by_id|
    until reader.eof?
      field_id = read_2b(reader)
      payload_size = read_2b(reader)
      payload_by_id[field_id] = read_n(reader, payload_size)
    end
  end
end
read_2b(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 253
# Reads exactly 2 bytes and decodes them with the C_UINT16LE unpack pattern
# (presumably an unsigned 16-bit little-endian integer, judging by the
# constant's name).
#
# @param io [#read] the IO to read from
# @return [Object] the first value produced by the unpack pattern
def read_2b(io)
  raw = read_n(io, 2)
  raw.unpack(C_UINT16LE).first
end
read_4b(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 257
# Reads exactly 4 bytes and decodes them with the C_UINT32LE unpack pattern
# (presumably an unsigned 32-bit little-endian integer, judging by the
# constant's name).
#
# @param io [#read] the IO to read from
# @return [Object] the first value produced by the unpack pattern
def read_4b(io)
  raw = read_n(io, 4)
  raw.unpack(C_UINT32LE).first
end
read_8b(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 261
# Reads exactly 8 bytes and decodes them with the C_UINT64LE unpack pattern
# (presumably an unsigned 64-bit little-endian integer, judging by the
# constant's name).
#
# @param io [#read] the IO to read from
# @return [Object] the first value produced by the unpack pattern
def read_8b(io)
  raw = read_n(io, 8)
  raw.unpack(C_UINT64LE).first
end
read_cdir_entry(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 265
# Reads one central directory entry from the IO, starting at its signature.
#
# @param io [#read] an IO positioned at the start of a central directory entry
# @return [ZipEntry] the entry, with the sizes and the local header offset
#   replaced by the values from the Zip64 extra field where applicable
def read_cdir_entry(io)
  assert_signature(io, 0x02014b50)

  entry = ZipEntry.new
  entry.made_by = read_2b(io)
  entry.version_needed_to_extract = read_2b(io)
  entry.gp_flags = read_2b(io)
  entry.storage_mode = read_2b(io)
  entry.dos_time = read_2b(io)
  entry.dos_date = read_2b(io)
  entry.crc32 = read_4b(io)
  entry.compressed_size = read_4b(io)
  entry.uncompressed_size = read_4b(io)

  # The three variable-length parts come after the fixed fields, so their
  # sizes have to be kept around until those fields have been read.
  filename_size = read_2b(io)
  extra_size = read_2b(io)
  comment_size = read_2b(io)

  entry.disk_number_start = read_2b(io)
  entry.internal_attrs = read_2b(io)
  entry.external_attrs = read_4b(io)
  entry.local_file_header_offset = read_4b(io)

  entry.filename = read_n(io, filename_size)
  raw_extras = read_n(io, extra_size)
  entry.comment = read_n(io, comment_size)

  # Of all the extra fields only the Zip64 one (ID 1) is of interest
  zip64_payload = parse_out_extra_fields(raw_extras)[1]
  return entry unless zip64_payload

  zip64_io = StringIO.new(zip64_payload)
  log do
    format(
      'Will read Zip64 extra data for %s, %d bytes',
      entry.filename, zip64_io.size)
  end

  # Here be dragons. The APPNOTE specifies that
  #
  # > The order of the fields in the ZIP64 extended
  # > information record is fixed, but the fields will
  # > only appear if the corresponding Local or Central
  # > directory record field is set to 0xFFFF or 0xFFFFFFFF.
  #
  # So each 64-bit value may only be consumed when the matching value read
  # from the conventional header is at overflow, and strictly in this order.
  overflow32 = 0xFFFFFFFF
  entry.uncompressed_size = read_8b(zip64_io) if entry.uncompressed_size == overflow32
  entry.compressed_size = read_8b(zip64_io) if entry.compressed_size == overflow32
  entry.local_file_header_offset = read_8b(zip64_io) if entry.local_file_header_offset == overflow32
  # The disk number comes last and is not read, since multi-disk archives
  # are not supported.

  entry
end
read_n(io, n_bytes) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 244
# Strict read: returns exactly the requested number of bytes or raises.
#
# @param io [#read] the IO to read from
# @param n_bytes [Integer] how many bytes must be read
# @return [String] the bytes read
# @raise [ReadError] when the IO returns nil or fewer bytes than requested
def read_n(io, n_bytes)
  data = io.read(n_bytes)
  raise ReadError, "Expected to read #{n_bytes} bytes, but the IO was at the end" if data.nil?
  raise ReadError, "Expected to read #{n_bytes} bytes, read #{data.bytesize}" if data.bytesize != n_bytes

  data
end
seek(io, absolute_pos) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 215
# Strict seek: moves the IO to an absolute position and verifies that it
# actually got there.
#
# @param io [#seek, #pos] the IO to reposition
# @param absolute_pos [Integer] the position to seek to
# @return [nil]
# @raise [ReadError] when the IO reports a different position afterwards
def seek(io, absolute_pos)
  io.seek(absolute_pos)
  landed_at = io.pos
  return if landed_at == absolute_pos

  raise ReadError,
        "Expected to seek to #{absolute_pos} but only got to #{landed_at}"
end
skip_ahead_2(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 203
# Moves the IO forward past a 2-byte field without reading it.
#
# @param io [#seek, #pos] the IO to advance
# @return [Object] whatever skip_ahead_n returns
def skip_ahead_2(io)
  field_width = 2
  skip_ahead_n(io, field_width)
end
skip_ahead_4(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 207
# Moves the IO forward past a 4-byte field without reading it.
#
# @param io [#seek, #pos] the IO to advance
# @return [Object] whatever skip_ahead_n returns
def skip_ahead_4(io)
  field_width = 4
  skip_ahead_n(io, field_width)
end
skip_ahead_8(io) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 211
# Moves the IO forward past an 8-byte field without reading it.
#
# @param io [#seek, #pos] the IO to advance
# @return [Object] whatever skip_ahead_n returns
def skip_ahead_8(io)
  field_width = 8
  skip_ahead_n(io, field_width)
end
skip_ahead_n(io, n) click to toggle source
# File lib/parsers/zip_parser/file_reader.rb, line 233
# Moves the IO forward by the given number of bytes, relative to its
# current position, and verifies the distance actually covered.
#
# @param io [#seek, #pos] the IO to advance
# @param n [Integer] how many bytes to skip
# @return [nil]
# @raise [ReadError] when the position changed by a different amount
def skip_ahead_n(io, n)
  started_at = io.pos
  io.seek(started_at + n)
  advanced_by = io.pos - started_at
  return if advanced_by == n

  raise ReadError, "Expected to seek #{n} bytes ahead, but could only seek #{advanced_by} bytes ahead"
end