module SmsBackupRenderer

Constants

HIGH_SURROGATES
VERSION

Public Class Methods

fix_surrogate_pairs(string) click to toggle source

Although the files claim to be UTF-8, SMS Backup & Restore produces files that incorrectly represent characters such as emoji using surrogate pairs, such that a single character is represented by two adjacent, separately-escaped characters which are supposed to be interpreted as a single Unicode surrogate pair. Nokogiri crashes when it encounters these, since it tries to interpret each part of the pair as a separate character. This method is a hacky workaround that simply searches the whole file for strings that look like escaped surrogate pairs and replaces them with the literal character they represent.

# File lib/sms_backup_renderer/parser.rb, line 16
# Replace escaped UTF-16 surrogate pairs (two adjacent five-digit numeric
# character references such as "&#55357;&#56832;") with the single character
# they encode.
#
# string - the String to repair; it is modified in place
#
# Returns the same String, whether or not anything was replaced.
def self.fix_surrogate_pairs(string)
  low_surrogates = 0xDC00..0xDFFF

  # Whole runs of adjacent entities are handled at once so that a pair which
  # follows an unrelated entity (e.g. "&#20013;&#55357;&#56832;") is still
  # recognised instead of being skipped by two-at-a-time matching.
  string.gsub!(/(?:&#\d{5};){2,}/) do |run|
    entities = run.scan(/&#\d{5};/)
    pieces = []
    until entities.empty?
      high_entity = entities.shift
      high = high_entity[2, 5].to_i
      low = entities.first && entities.first[2, 5].to_i
      if low && HIGH_SURROGATES.include?(high) && low_surrogates.include?(low)
        entities.shift
        code_point = ((high - 0xD800) << 10) + (low - 0xDC00) + 0x010000
        pieces << [code_point].pack('U*')
      else
        # Not the start of a valid pair: keep the entity text exactly as it was.
        pieces << high_entity
      end
    end
    pieces.join
  end
  string
end
generate_html_from_archive(input_file_path, output_dir_path) click to toggle source
# File lib/sms_backup_renderer.rb, line 10
# Render an SMS Backup & Restore XML archive as a set of HTML pages.
#
# input_file_path - path of the XML archive to read
# output_dir_path - directory that receives index.html and the 'data',
#                   'assets' and 'conversations' subdirectories
#
# Returns the result of writing the index page.
def self.generate_html_from_archive(input_file_path, output_dir_path)
  input_text = File.read(input_file_path)
  SmsBackupRenderer.fix_surrogate_pairs(input_text)

  data_dir_path = File.join(output_dir_path, 'data')
  FileUtils.mkdir_p(data_dir_path)

  # The repaired text is handed to the parser through a temporary file. The
  # read handle is closed by the block form of File.open, and the temporary
  # file is closed and deleted even when parsing raises.
  input_tempfile = Tempfile.new('sms_backup_renderer')
  begin
    File.write(input_tempfile.path, input_text)
    messages = File.open(input_tempfile.path) do |input_file|
      SmsBackupRenderer.parse(input_file, data_dir_path)
    end
  ensure
    input_tempfile.close!
  end

  # One conversation per distinct set of non-owner participants.
  message_groups = messages.group_by {|m| m.participants.reject(&:owner).map(&:normalized_address).sort}.values

  assets_dir_path = File.join(output_dir_path, 'assets')
  FileUtils.cp_r(File.join(File.dirname(__FILE__), 'sms_backup_renderer', 'assets'), output_dir_path)
  conversations_dir_path = File.join(output_dir_path, 'conversations')
  FileUtils.mkdir_p(conversations_dir_path)

  conversation_pages = message_groups.map do |group_messages|
    filename = ConversationPage.build_filename(group_messages.first.participants.reject(&:owner))
    path = File.join(conversations_dir_path, filename)
    SmsBackupRenderer::ConversationPage.new(path, assets_dir_path, group_messages)
  end

  conversation_pages.each(&:write)

  SmsBackupRenderer::IndexPage.new(
    File.join(output_dir_path, 'index.html'), assets_dir_path, conversation_pages).write
end
mms_address_contact_names(mms) click to toggle source

Build a hash of normalized addresses to contact names using information in an MMS XML record. The data in the archive does not provide any explicit mapping of addresses to contact names, but at least for me it seems like the tilde-separated address attribute and the comma-separated contact_name attribute are provided in the same order, so we can try to use those to build a mapping. Obviously, this is error-prone, but seems better than nothing.

mms - Nokogiri object representing the MMS element

Returns a Hash of String normalized addresses to String contact names.

# File lib/sms_backup_renderer/parser.rb, line 145
# Pair each normalized address of an MMS record with a contact name, relying
# on the 'address' and 'contact_name' attributes listing entries in the same
# order.
#
# mms - object for the MMS element; must respond to attr(name)
#
# Returns a Hash of normalized address => contact name.
def self.mms_address_contact_names(mms)
  normalized_addresses = parse_mms_combined_address(mms.attr('address'))
  names = mms.attr('contact_name').split(',').map { |raw_name| raw_name.strip }

  # Addresses beyond the number of names have no known contact and are left
  # out of the result.
  names_by_address = {}
  normalized_addresses.first(names.length).each_with_index do |address, index|
    names_by_address[address] = names[index]
  end
  names_by_address
end
mms_outgoing_type?(type) click to toggle source
# File lib/sms_backup_renderer/parser.rb, line 116
# Decide from an MMS m_type value whether the message was sent by the owner.
#
# type - the String m_type attribute value
#
# Returns true for '128' and false for '132'.
# Raises RuntimeError for any other value.
def self.mms_outgoing_type?(type)
  return true if '128' == type
  return false if '132' == type

  raise "Unrecognized MMS m_type #{type}"
end
mms_sender_addr_type?(type) click to toggle source
# File lib/sms_backup_renderer/parser.rb, line 127
# Tell whether an MMS addr 'type' value marks the sender of the message.
#
# type - the String type attribute value of an addr element
#
# Returns true only for '137', false for everything else.
def self.mms_sender_addr_type?(type)
  '137' == type
end
parse(input, data_dir_path) click to toggle source
# File lib/sms_backup_renderer/parser.rb, line 29
# Read an archive and build a Message for every sms and mms element in it.
#
# input         - the XML source handed to Nokogiri::XML::Reader
# data_dir_path - directory in which decoded image and video attachments are
#                 stored, named by the MD5 digest of their content
#
# Returns an Array of Message objects in document order.
# Raises RuntimeError when an MMS has an unrecognized ct_t, or when the
# sms/mms type helpers reject a type value.
def self.parse(input, data_dir_path)
  messages = []
  Nokogiri::XML::Reader(input).each do |node|
    next unless node.node_type == Nokogiri::XML::Reader::TYPE_ELEMENT
    case node.name
    when 'sms'
      sms = Nokogiri::XML(node.outer_xml).at('/sms')
      outgoing = sms_outgoing_type?(sms.attr('type'))
      messages << Message.new(
        date_time: Time.strptime(sms.attr('date'), '%Q'),
        parts: sms.attr('body') ? [TextPart.new(sms.attr('body'))] : [],
        outgoing: outgoing,
        participants: [Participant.new(
          address: sms.attr('address'),
          name: sms.attr('contact_name'),
          owner: false,
          sender: !outgoing)],
        subject: sms.attr('subject'))
    when 'mms'
      mms = Nokogiri::XML(node.outer_xml).at('/mms')
      unless ['null',
              'application/vnd.wap.multipart.related',
              'application/vnd.wap.multipart.mixed'].include?(mms.attr('ct_t'))
        raise "Unrecognized MMS ct_t #{mms.attr('ct_t')}"
      end

      parts = mms.xpath('parts/part').map do |part|
        case part.attr('ct')
        when 'application/smil'
          # should probably use this, but I think I can get by without it
          nil
        when 'text/plain'
          TextPart.new(part.attr('text'))
        when %r{\A(image|video)/(.+)\z}
          media_kind = Regexp.last_match(1)
          extension = Regexp.last_match(2)
          data = Base64.decode64(part.attr('data'))
          digest = Digest::MD5.hexdigest(data)
          path = File.join(data_dir_path, "#{digest}.#{extension}")
          # The decoded attachment is raw bytes, so write it in binary mode.
          File.binwrite(path, data)
          part_class = media_kind == 'image' ? ImagePart : VideoPart
          part_class.new(part.attr('ct'), path)
        else
          UnsupportedPart.new(part.to_xml)
        end
      end.compact

      non_owner_addresses = parse_mms_combined_address(mms.attr('address'))
      address_contact_names = mms_address_contact_names(mms)
      participants = mms.xpath('addrs/addr').map do |addr|
        normalized_address = Participant.normalize_address(addr.attr('address'))
        Participant.new(
          address: addr.attr('address'),
          name: address_contact_names[normalized_address],
          # Anyone missing from the combined 'address' attribute is treated
          # as the owner of the archive.
          owner: !non_owner_addresses.include?(normalized_address),
          sender: mms_sender_addr_type?(addr.attr('type')))
      end

      # Some messages include the sender as a recipient as well; we don't want Participants
      # for those recipients since it would interfere with proper conversation grouping.
      sender = participants.detect(&:sender)
      if sender
        participants.delete_if do |participant|
          !participant.sender && participant.normalized_address == sender.normalized_address
        end
      end

      messages << Message.new(
        date_time: Time.strptime(mms.attr('date'), '%Q'),
        outgoing: mms_outgoing_type?(mms.attr('m_type')),
        participants: participants,
        parts: parts)
    end
  end
  messages
end
parse_mms_combined_address(address_attribute) click to toggle source

The XML for MMSes contains an ‘address’ attribute containing a list of addresses separated by tildes. Although there are also separate ‘addr’ elements for each address, the combined attribute can be useful because it appears to exclude the owner of the archive’s address, and because the order can be correlated with the contact_name attribute.

address_attribute - the value of the ‘address’ attribute from the XML element for the MMS message

Returns an Array of String normalized addresses.

# File lib/sms_backup_renderer/parser.rb, line 164
# Split the tilde-separated 'address' attribute of an MMS element and
# normalize every entry.
#
# address_attribute - the String value of the 'address' attribute
#
# Returns an Array of normalized address Strings, in attribute order.
def self.parse_mms_combined_address(address_attribute)
  address_attribute.split('~').each_with_object([]) do |raw_address, normalized|
    normalized << Participant.normalize_address(raw_address)
  end
end
sms_outgoing_type?(type) click to toggle source
# File lib/sms_backup_renderer/parser.rb, line 105
# Decide from an SMS type value whether the message was sent by the owner.
#
# type - the String type attribute value
#
# Returns true for '2' and false for '1'.
# Raises RuntimeError for any other value.
def self.sms_outgoing_type?(type)
  return true if '2' == type
  return false if '1' == type

  raise "Unrecognized SMS type #{type}"
end