class FormatParser::JPEGParser

Constants

APP1_MARKER
EOI_MARKER
EXIF_MAGIC_STRING
JPEG_MIME_TYPE
JPEG_SOI_MARKER_HEAD
MUST_FIND_NEXT_MARKER_WITHIN_BYTES
SOF_MARKERS
SOS_MARKER

Public Class Methods

call(io) click to toggle source
# File lib/parsers/jpeg_parser.rb, line 21
# Convenience entry point: builds a fresh parser instance and runs it on +io+.
#
# @param io the input handed straight to the instance-level #call
# @return whatever the instance-level #call returns
def self.call(io)
  parser = new
  parser.call(io)
end
likely_match?(filename) click to toggle source
# File lib/parsers/jpeg_parser.rb, line 17
# Tells whether +filename+ carries a JPEG extension (".jpg" or ".jpeg",
# case-insensitive).
#
# The pattern is anchored with \z rather than $: in Ruby, $ matches at the end
# of _any_ line, so a name such as "photo.jpg\nmalware.exe" would otherwise be
# reported as a JPEG.
#
# @param filename [String] the file name to inspect
# @return [Integer, nil] the offset of the extension when it matches, nil otherwise
def self.likely_match?(filename)
  filename =~ /\.jpe?g\z/i
end

Public Instance Methods

call(io) click to toggle source
# File lib/parsers/jpeg_parser.rb, line 25
# Wraps +io+ in a FormatParser::IOConstraint, resets the per-parse state
# (dimensions and collected EXIF frames) and runs the scan.
#
# @param io the input to parse
# @return the result of #scan
def call(io)
  @buf = FormatParser::IOConstraint.new(io)
  @width = @height = nil
  @exif_data_frames = []
  scan
end

Private Instance Methods

read_char() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 35
# Reads a single byte from @buf and returns it as an unsigned 8-bit integer.
def read_char
  byte = safe_read(@buf, 1)
  byte.unpack1('C')
end
read_frame() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 174
# Reads the 2-byte length that starts a frame and then returns the frame body.
# The stored length counts its own two bytes, hence the subtraction.
def read_frame
  safe_read(@buf, read_short - 2)
end
read_next_marker() click to toggle source

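Scans forward for the next marker, i.e. a 0xFF byte followed by a byte that is not 0xFF. If the byte read is 0xFF, any following bytes that are also 0xFF are skipped (byte stuffing), and the first byte scanned that is not 0xFF is returned. The scan is also limited (see MUST_FIND_NEXT_MARKER_WITHIN_BYTES) so that we do not read for an inordinate amount of time should we encounter a file that has an SOI marker at the start and then no markers for a very long time (this happened with some PSDs); nil is returned when no marker is found within that limit.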

# File lib/parsers/jpeg_parser.rb, line 107
# Looks for a two-byte sequence whose first byte is 0xFF and whose second byte
# is anything but 0xFF, and returns that second byte.
#
# At most MUST_FIND_NEXT_MARKER_WITHIN_BYTES bytes are read in total (one up
# front plus one per iteration); nil is returned when no marker shows up.
def read_next_marker
  previous_byte = read_char
  1.upto(MUST_FIND_NEXT_MARKER_WITHIN_BYTES - 1) do
    candidate = read_char
    marker_found = previous_byte == 0xFF && candidate != 0xFF
    return candidate if marker_found

    # Slide the two-byte window one byte forward
    previous_byte = candidate
  end
  nil
end
read_short() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 39
# Reads two bytes from @buf and returns them as a big-endian unsigned
# 16-bit integer.
def read_short
  pair = safe_read(@buf, 2)
  pair.unpack1('n')
end
scan() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 43
# Walks the markers of a JPEG and builds a FormatParser::Image from the
# dimensions and EXIF frames collected on the way.
#
# Returns nil when the input does not start with the SOI marker, when no
# dimensions were found, or when InvalidStructure is raised while scanning.
def scan
  # Most JPEG images start with the 0xFF0xD8 SOI marker.
  # We _can_ search for that marker, but we will then
  # ambiguously capture things like JPEGs embedded in ID3
  # tags of MP3s - these _are_ JPEGs but we care much
  # more about the top-level "wrapper" file, not about
  # its bits and bobs
  leading_bytes = safe_read(@buf, 2)
  return unless leading_bytes == JPEG_SOI_MARKER_HEAD

  markers_start_at = @buf.pos
  @buf.seek(markers_start_at)

  while (marker = read_next_marker)
    case marker
    when *SOF_MARKERS then scan_start_of_frame
    when EOI_MARKER, SOS_MARKER
      # "End of image" or "Start of scan": from here on we are either
      # entering the image data that we don't need, or we have reached EOF.
      break
    when APP1_MARKER then scan_app1_frame
    else skip_frame
    end
  end

  Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_read_until_capture', @buf.pos)

  # A single file might contain multiple EXIF data frames. In a JPEG this would
  # manifest as multiple APP1 markers. The way different programs handle these
  # differs, for us it makes the most sense to simply "flatten" them top-down.
  # So we start with the first EXIF frame, and we then allow the APP1 markers
  # that come later in the file to override the properties they _do_ specify.
  flat_exif = FormatParser::EXIFParser::EXIFStack.new(@exif_data_frames)

  # Without both dimensions there is nothing to report
  return unless @width && @height

  # A rotated image is displayed with its stored dimensions swapped
  display_width, display_height =
    if flat_exif.rotated?
      [@height, @width]
    else
      [@width, @height]
    end

  FormatParser::Image.new(
    format: :jpg,
    width_px: @width,
    height_px: @height,
    display_width_px: display_width,
    display_height_px: display_height,
    orientation: flat_exif.orientation_sym,
    intrinsics: {exif: flat_exif},
    content_type: JPEG_MIME_TYPE
  )
rescue InvalidStructure
  nil # Due to the way JPEG is structured it is possible that some invalid inputs will get caught
end
scan_app1_frame() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 133
# Handles one APP1 frame. When the frame body starts with EXIF_MAGIC_STRING the
# remainder is parsed as a TIFF segment and the result is appended to
# @exif_data_frames. Whatever the outcome, @buf is repositioned to where the
# next marker is expected to begin, provided the frame length could be read.
#
# EXIFR::MalformedTIFF raised while parsing is swallowed; any other error
# (for instance a failed read of the frame length) propagates unchanged.
def scan_app1_frame
  # Read the entire EXIF frame at once to not overload the number of reads. If we don't,
  # EXIFR parses our file from the very beginning and does the same parsing we do, just
  # the second time around. What we care about, rather, is the EXIF data only. So we will
  # pry it out of the APP1 frame and parse it as the TIFF segment - which is what EXIFR
  # does under the hood.
  marker_length_at = @buf.pos
  app1_frame_content_length = read_short - 2

  # If there is certainly not enough data in this APP1 to begin with, bail out.
  # For the sake of the argument assume that a usable EXIF marker would contain
  # at least 2 bytes of data - not exact science, but it can help us
  # avoid reading _anything_ from the APP1 marker body if it's too small anyway
  return if app1_frame_content_length < (EXIF_MAGIC_STRING.bytesize + 2)

  # Peek whether the contents of the marker starts with Exif\0
  maybe_exif_magic_str = safe_read(@buf, EXIF_MAGIC_STRING.bytesize)

  # If we could not find the magic Exif\0 string at the start of the marker,
  # seek to the start of the next marker and return
  return unless maybe_exif_magic_str == EXIF_MAGIC_STRING

  # ...and only then read the marker contents and parse it as EXIF.
  # Use StringIO.new instead of #write - https://github.com/aws/aws-sdk-ruby/issues/785#issuecomment-95456838
  exif_buf = StringIO.new(safe_read(@buf, app1_frame_content_length - EXIF_MAGIC_STRING.bytesize))

  Measurometer.add_distribution_value('format_parser.JPEGParser.bytes_sent_to_exif_parser', exif_buf.size)

  @exif_data_frames << exif_from_tiff_io(exif_buf)
rescue EXIFR::MalformedTIFF
  # Not a JPEG or the Exif headers contain invalid data, or
  # an APP1 marker was detected in a file that is not a JPEG
ensure
  # Reposition the file pointer to where the next marker will begin,
  # regardless whether we did find usable EXIF or not.
  #
  # Both locals are still nil when reading the position or the frame length
  # itself raised. Adding nil here would raise a TypeError from within the
  # ensure clause and replace the original exception, so only seek when the
  # target position is actually known.
  if marker_length_at && app1_frame_content_length
    @buf.seek(marker_length_at + 2 + app1_frame_content_length)
  end

  # Make sure to explicitly clear the EXIF buffers since they can be large
  exif_buf.truncate(0) if exif_buf
end
scan_start_of_frame() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 118
# Reads a start-of-frame header and records the image dimensions in @width and
# @height.
#
# The declared frame length must equal (size * 3) + 8, where size is the byte
# read after the dimensions; otherwise InvalidStructure is raised and the
# dimensions are left untouched.
def scan_start_of_frame
  declared_length = read_short
  read_char # depth, unused
  frame_height = read_short
  frame_width  = read_short
  size         = read_char

  raise InvalidStructure unless declared_length == (size * 3) + 8

  @width = frame_width
  @height = frame_height
end
skip_frame() click to toggle source
# File lib/parsers/jpeg_parser.rb, line 179
# Reads the 2-byte length that starts a frame and skips over the frame body.
# The stored length counts its own two bytes, hence the subtraction.
def skip_frame
  safe_skip(@buf, read_short - 2)
end