module Swordfish::DOCX::Parser

Public Instance Methods

_node_parse_list(node) click to toggle source

Parse a list

# File lib/swordfish/formats/docx/parser.rb, line 118
# Parse a list paragraph and fold it into the list build buffer (@buffer).
#
# In Office OpenXML, a list is not a distinct element type, but rather a
# specialized paragraph that references an abstract numbering scheme
# and includes an indentation level. As a result, the build buffer
# must be used to assemble the Swordfish::Node representation of the list,
# since the only way to tell the list has been fully parsed is to encounter
# a non-list element.
#
# node - the paragraph XML node; it is queried with #xpath and handed to
#        _node_parse_runs.
#
# The parsed result is accumulated in @buffer (and @buffer_initial_value);
# the return value is incidental and should not be relied upon.
def _node_parse_list(node)
  level_nodes = node.xpath(".//w:numPr/w:ilvl")

  # Handle paragraphs with no level, which represent multi-paragraph list items.
  # The existing inline children of the last list item are wrapped in a
  # paragraph first, so the item ends up holding sibling paragraphs.
  if level_nodes.length.zero?
    para = Swordfish::Node::Paragraph.new
    _node_parse_runs(node).each { |r| para.append(r) }
    @buffer.last_list_item(:recurse => true).wrap_children(Swordfish::Node::Inline, Swordfish::Node::Paragraph)
    @buffer.last_list_item(:recurse => true).append para
    return
  end

  # Get the list item's abstract numbering and level
  list_item = Swordfish::Node::ListItem.new
  _node_parse_runs(node).each { |r| list_item.append(r) }
  level = level_nodes[0]['w:val'].to_i
  numbering_scheme = node.xpath(".//w:numPr/w:numId")[0]['w:val'].to_i

  # If the build buffer is empty, this is a new list
  unless @buffer
    @buffer = Swordfish::Node::List.new
    # default to bullet in case of bad numbering reference
    @buffer.stylize @numbering.fetch(numbering_scheme, {}).fetch(level, "bullet").to_sym
    @buffer_initial_value = level # Lists may have an arbitrary initial level
  end

  # Compare the level of this list item to the bottommost node in
  # the build buffer to determine where in the hierarchy to add
  # this node (i.e., are we dealing with list nesting or not?)
  if @buffer.depth_of_final_node >= level || @buffer.children.empty?
    # Add sibling to existing list: walk down one nested list per level
    # below the buffer's initial level.
    target = @buffer
    (level - @buffer_initial_value).times do
      target = target.last_list_item.nested_list
    end
    target.append list_item
  else
    # Add new nested list: walk down to the parent list, then hang the new
    # list off that list's last item.
    # NOTE(review): a level that jumps by more than one relative to the
    # current depth would presumably walk onto a missing nested list here --
    # confirm whether such documents need handling.
    target = @buffer
    (level - @buffer_initial_value - 1).times do
      target = target.last_list_item.nested_list
    end
    list = Swordfish::Node::List.new
    list.append list_item
    # default to bullet in case of bad numbering reference, exactly as is
    # done for a brand-new top-level list above
    list.stylize @numbering.fetch(numbering_scheme, {}).fetch(level, "bullet").to_sym
    target.last_list_item.append list
  end
end
_node_parse_paragraph(node) click to toggle source

Parse a paragraph

# File lib/swordfish/formats/docx/parser.rb, line 107
# Build a Swordfish::Node::Paragraph from a paragraph XML node.
#
# node - the paragraph XML node; it is queried with #xpath and handed to
#        _node_parse_runs.
#
# Every run parsed from the node is appended to the paragraph. When the node
# names a paragraph style (w:pPr/w:pStyle) that has an entry in @styles, that
# entry is assigned as the paragraph's style; unknown style ids are ignored.
#
# Returns the populated paragraph.
def _node_parse_paragraph(node)
  result = Swordfish::Node::Paragraph.new
  _node_parse_runs(node).each do |run|
    result.append(run)
  end

  style_nodes = node.xpath("./w:pPr/w:pStyle")
  unless style_nodes.length.zero?
    key = style_nodes[0]['w:val'].to_sym
    known_style = @styles[key]
    result.style = known_style if known_style
  end

  result
end
_node_parse_runs(node, context = nil) click to toggle source

Parse one or more runs

# File lib/swordfish/formats/docx/parser.rb, line 10
# Parse the runs found in a node into a flat array of Swordfish nodes.
#
# node    - either an Array of XML nodes (used as-is), or an object whose
#           #children are iterated.
# context - optional key. When given, hyperlink relationship IDs are looked up
#           in @relationships[context] instead of directly in @relationships.
#
# Returns an Array that may contain Text, Linebreak, Image and Hyperlink nodes
# plus whatever objects @footnotes / @endnotes hold for referenced ids.
# Adjacent Text nodes with equal styles are merged before returning, except on
# the early-return path used by the recursive complex-field call (see below).
def _node_parse_runs(node, context = nil)
  # The 'run' is the basic unit of text in Office OpenXML. A paragraph, table cell, or other
  # block element may contain one or more runs, and each run has an associated set of styles.
  texts = []
  # A complex field is a special type of node spanning multiple runs, where most of the runs
  # designate a special control flow rather than normal text.
  # State held here: nil (not inside a field), true (field started, no instruction seen yet),
  # or the Hyperlink/Text node being built for the field.
  complex_field = nil

  nodes = node.is_a?(Array) ? node : node.children
  nodes.each_with_index do |run_xml, idx|
    # Only 'r' and 'hyperlink' elements are handled; any other element name is skipped.
    case run_xml.name
      when 'r'
        if run_xml.xpath('./w:br').length > 0
          # This run contains a linebreak. It may also contain other elements, so this isn't exclusive.
          texts << Swordfish::Node::Linebreak.new
        end

        # The branches below are mutually exclusive and order-sensitive: plain text is only
        # taken when no complex field is open, and an image run wins over complex-field handling.
        if run_xml.xpath('./w:t').length > 0 && complex_field.nil?
          # A true run node
          # Only examine the run if it includes text codes. The run may also include
          # things like comment nodes, which should be ignored.
          text = Swordfish::Node::Text.new
          # NOTE(review): only the first w:t child is read; any further w:t children
          # of the same run are not included -- confirm this is intended.
          text.content = run_xml.xpath('./w:t')[0].content
          get_styles_for_node(run_xml.xpath('./w:rPr')[0], text)
          texts << text
        elsif run_xml.xpath('.//*[name()="pic:pic"]').length > 0
          # An image run
          image = Swordfish::Node::Image.new
          # The inline rescue turns any error in this lookup chain (e.g. no a:blip found)
          # into a nil relationship id, in which case the image is silently dropped.
          relationship_id = run_xml.xpath('.//*[name()="pic:pic"]/*[name()="pic:blipFill"]/*[name()="a:blip"]')[0]['r:embed'] rescue nil
          if relationship_id
            # The image name is the last path segment of the relationship target.
            image.original_name = @relationships[relationship_id].split('/').last
            @swordfish_doc.images[image.original_name] = read_image(image.original_name)
            texts << image
          end
        elsif run_xml.xpath('./w:fldChar').length > 0 || complex_field
          # A complex field
          case
            when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'begin'
              # Start the complex field
              complex_field = true
            when run_xml.xpath('./w:instrText').length > 0
              # An instruction run, defining the complex field's behavior
              instruction = run_xml.xpath('./w:instrText')[0].content
              if instruction =~ /^\s*HYPERLINK/
                # A hyperlink
                complex_field = Swordfish::Node::Hyperlink.new
                # NOTE(review): if the instruction starts with HYPERLINK but has no quoted
                # target matching this pattern, match returns nil and .captures would raise.
                complex_field.href = instruction.match(/^\s*HYPERLINK (?:"" )?(?:\\l )?"([^"]+)"/).captures[0]
              else
                # Anything else
                complex_field = Swordfish::Node::Text.new
              end
            # NOTE(review): complex_field may still be `true` here if a text run follows
            # 'begin' without an instruction run in between; `true.children` would raise.
            when run_xml.xpath('./w:t').length > 0 && complex_field.children.length.zero?
              # The textual content
              # Re-parse from this run onward; the nested call returns early when it meets the
              # field's 'end' marker (see below). The resulting array is appended in one call.
              # Once children exist, later text runs of this field skip this branch.
              # NOTE(review): `context` is not forwarded to the nested call -- confirm intended.
              complex_field.append(_node_parse_runs(nodes.to_a[idx..-1]))
            when run_xml.xpath('./w:fldChar').length > 0 && run_xml.xpath('./w:fldChar')[0]['w:fldCharType'] == 'end'
              # End the complex field
              if complex_field
                texts << complex_field
                complex_field = nil
              else
                # Handle the case where _node_parse_runs gets called from within a complex field
                # This early return bypasses the style-merging pass at the bottom of the method.
                return texts
              end
          end
        elsif run_xml.xpath('./w:footnoteReference').length > 0
          # A footnote reference
          # Unknown ids (no entry in @footnotes) are skipped.
          id = run_xml.xpath('./w:footnoteReference')[0]['w:id'].to_i
          texts << @footnotes[id] if @footnotes[id]
        elsif run_xml.xpath('./w:endnoteReference').length > 0
          # An endnote reference
          # Unknown ids (no entry in @endnotes) are skipped.
          id = run_xml.xpath('./w:endnoteReference')[0]['w:id'].to_i
          texts << @endnotes[id] if @endnotes[id]
        end
      when 'hyperlink'
        # Hyperlink nodes are placed amongst other run nodes, but
        # they themselves also contain runs. Hyperlinks include
        # a relationship ID attribute defining their reference.
        link = Swordfish::Node::Hyperlink.new
        link.href = context ? @relationships[context][run_xml['r:id']] : @relationships[run_xml['r:id']]
        _node_parse_runs(run_xml).each {|r| link.append(r)}
        texts << link
    end
  end
  # Clean up runs by merging them if they have identical styles
  # Only directly adjacent Text nodes are merged; the earlier node's content is extended
  # and the later node is dropped.
  texts = texts.reduce([]) do |memo, run|
    if memo.length > 0 && memo.last.is_a?(Swordfish::Node::Text) && run.is_a?(Swordfish::Node::Text) && memo.last.style == run.style
      memo.last.content += run.content
    else
      memo << run
    end
    memo
  end

  texts
end
_node_parse_table(node) click to toggle source

Parse a table

# File lib/swordfish/formats/docx/parser.rb, line 173
# Build a Swordfish::Node::Table from a table XML node.
#
# node - the table XML node; its direct w:tr children are read via #xpath.
#
# Each row is converted with _node_parse_table_row and appended in document
# order. Returns the populated table.
def _node_parse_table(node)
  Swordfish::Node::Table.new.tap do |grid|
    node.xpath("./w:tr").each { |tr| grid.append(_node_parse_table_row(tr)) }
  end
end
_node_parse_table_cell(node) click to toggle source

Parse a table cell

# File lib/swordfish/formats/docx/parser.rb, line 191
# Parse a table cell XML node into one or more Swordfish::Node::TableCell objects.
#
# In a Swordfish::Node::Table object, the number of table cells must equal the
# total number of rows times the total number of columns; that is, even if
# two cells are merged together, there must be a Swordfish::Node::TableCell for
# each one. Merges are defined using the "merge_up" and "merge_left" properties.
#
# node - the cell XML node; queried with #xpath.
#
# Returns a single TableCell when the cell spans one column, otherwise an
# Array whose first element is the real cell and whose remaining elements are
# placeholder cells flagged merge_left.
def _node_parse_table_cell(node)
  cell = Swordfish::Node::TableCell.new

  # Get the inner content of the cell (direct paragraph children only)
  node.xpath("./w:p").each do |paragraph|
    cell.append _node_parse_paragraph(paragraph)
  end

  # Determine whether this cell spans multiple rows. In Office OpenXML,
  # a table cell is defined in every row, even if the cell is vertically-merged. The
  # representation of the merged cell within each row is given a vMerge property. The
  # topmost one has a vMerge value of "restart"; the continuation cells either omit the
  # value or state it explicitly as "continue" -- both forms mean the same thing, so
  # both must flag the cell as merged into the one above.
  v_merge = node.xpath("./w:tcPr/w:vMerge")
  unless v_merge.length.zero?
    merge_type = v_merge[0]['w:val']
    cell.merge_up = true if merge_type.nil? || merge_type == 'continue'
  end

  # Determine whether this cell spans multiple columns. Unlike with vertical merges,
  # a horizontally-merged Office OpenXML cell is only defined once, but is given a gridSpan
  # property defining the number of columns it spans. Since Swordfish requires a cell for each
  # column, loop to generate the additional cells, and set their merge_left values appropriately.
  # A missing or non-numeric span converts to 0 via to_i, which yields no extra cells.
  extra_cells = []
  grid_span = node.xpath("./w:tcPr/w:gridSpan")
  unless grid_span.length.zero?
    (grid_span[0]['w:val'].to_i - 1).times do
      filler = Swordfish::Node::TableCell.new
      filler.merge_left = true
      extra_cells << filler
    end
  end

  # Return the generated cell or cells
  extra_cells.empty? ? cell : [cell] + extra_cells
end
_node_parse_table_row(node) click to toggle source

Parse a table row

# File lib/swordfish/formats/docx/parser.rb, line 182
# Build a Swordfish::Node::TableRow from a table row XML node.
#
# node - the row XML node; its direct w:tc children are read via #xpath.
#
# Each cell is converted with _node_parse_table_cell and its result (which may
# be a single cell or an array of cells) is appended to the row in document
# order. Returns the populated row.
def _node_parse_table_row(node)
  cell_nodes = node.xpath('./w:tc')
  cell_nodes.each_with_object(Swordfish::Node::TableRow.new) do |tc, parsed_row|
    parsed_row.append(_node_parse_table_cell(tc))
  end
end