class Ghostwriter::Writer

Main Ghostwriter converter object.

Attributes

heading_marker[R]
ol_marker[R]
table_column[R]
table_corner[R]
table_row[R]
ul_marker[R]

Public Class Methods

new(link_base: '', heading_marker: '--', ul_marker: '-', ol_marker: '1', table_column: '|', table_row: '-', table_corner: '|') click to toggle source

Creates a new ghostwriter

@param [String] link_base the url to prefix relative links with

# File lib/ghostwriter/writer.rb, line 11
# Builds an immutable writer configured with the markers used in the text output.
#
# @param link_base [String] the url to prefix relative links with
# @param heading_marker [String] marker placed around heading text; repeated per level for h2-h6
# @param ul_marker [String] presumably the bullet for unordered list items (consumer not shown here)
# @param ol_marker [String] presumably the starting label for ordered list items (consumer not shown here)
# @param table_column [String] presumably the column separator for tables (consumer not shown here)
# @param table_row [String] presumably the row separator character for tables (consumer not shown here)
# @param table_corner [String] presumably the corner character for tables (consumer not shown here)
def initialize(link_base: '', heading_marker: '--', ul_marker: '-', ol_marker: '1',
               table_column: '|', table_row: '-', table_corner: '|')
   @link_base = link_base

   # Text markers, assigned in declaration order.
   @heading_marker, @ul_marker, @ol_marker = heading_marker, ul_marker, ol_marker

   # Table drawing characters.
   @table_column, @table_row, @table_corner = table_column, table_row, table_corner

   # Lock the instance so the configuration cannot be reassigned after construction.
   freeze
end

Public Instance Methods

textify(html) click to toggle source

Strips HTML down to plain text.

@param html [String] the HTML to be converted to text

@return [String] the converted text

# File lib/ghostwriter/writer.rb, line 29
# Converts an HTML string into plain text.
#
# Every run of whitespace in the input is collapsed to a single space before
# parsing, style and script elements are dropped, and the remaining markup is
# rewritten step by step into text. The order of the steps matters: each one
# replaces nodes in the same document the later steps search.
#
# @param html [String] the HTML to be converted to text
# @return [String] the converted text, as produced by normalize_lines
def textify(html)
   collapsed = html.gsub(/\s+/, ' ')
   doc       = Nokogiri::HTML(collapsed)

   doc.search('style, script').remove

   replace_anchors(doc)
   replace_images(doc)

   simple_replace(doc, '*[role="presentation"]', "\n")

   replace_headers(doc)
   replace_lists(doc)
   replace_tables(doc)

   # Remaining simple elements, handled in this exact order: tag => text appended after its content.
   { 'hr' => "\n----------\n\n", 'br' => "\n", 'p' => "\n\n" }.each do |tag, suffix|
      simple_replace(doc, tag, suffix)
   end

   normalize_lines(doc)
end

Private Instance Methods

normalize_lines(doc) click to toggle source
# File lib/ghostwriter/writer.rb, line 58
# Extracts the document's text and trims it line by line.
#
# The whole text is stripped, then each line is stripped individually, and the
# result is terminated with exactly one newline.
#
# @param doc [#text] the document whose text is extracted
# @return [String] the trimmed text ending in a newline
def normalize_lines(doc)
   rows    = doc.text.strip.split("\n")
   trimmed = rows.map(&:strip)

   trimmed.join("\n") << "\n"
end
replace_anchors(doc) click to toggle source
# File lib/ghostwriter/writer.rb, line 62
# Rewrites the content of every anchor so its link target appears in the text.
#
# When link_matches reports that the target and the anchor's content match,
# the content becomes the target alone; otherwise the target is appended in
# parentheses after the existing content.
#
# @param doc the parsed document; its anchors are modified in place
def replace_anchors(doc)
   doc.search('a').each do |anchor|
      target = get_link_target(anchor, get_link_base(doc))

      rewritten = if link_matches(target, anchor.inner_html)
                     target.to_s
                  else
                     "#{ anchor.inner_html } (#{ target })"
                  end

      anchor.inner_html = rewritten
   end
end
replace_headers(doc) click to toggle source
# File lib/ghostwriter/writer.rb, line 96
# Replaces heading elements with their content wrapped in the heading marker.
#
# header and h1 elements use the marker once on each side; h2 through h6 use
# the marker repeated 2 to 6 times. Each replacement ends with a newline, and
# runs of spaces in the whole replacement text are squeezed to one space.
#
# @param doc the parsed document; its heading nodes are replaced in place
def replace_headers(doc)
   doc.search('header, h1').each do |heading|
      text = "#{ @heading_marker } #{ heading.inner_html } #{ @heading_marker }\n"

      heading.replace(text.squeeze(' '))
   end

   (2..6).each do |level|
      doc.search("h#{ level }").each do |heading|
         text = "#{ @heading_marker * level } #{ heading.inner_html } #{ @heading_marker * level }\n"

         heading.replace(text.squeeze(' '))
      end
   end
end
replace_images(doc) click to toggle source
# File lib/ghostwriter/writer.rb, line 110
# Replaces images with a plain-text stand-in built from their alt text.
#
# Images marked role=presentation are removed outright. Any other image that
# has non-empty alt text is replaced by "alt (src)", where a data: URI source
# is shown as "embedded". Images without alt text are left untouched.
#
# An image with alt text but no src attribute is replaced by the alt text
# alone; previously a missing src raised NoMethodError.
#
# @param doc the parsed document; its image nodes are replaced in place
def replace_images(doc)
   doc.search('img[role=presentation]').remove

   doc.search('img').each do |img_node|
      alt = img_node['alt']
      # Nothing useful to show without alt text, so skip before touching src.
      next if alt.nil? || alt.empty?

      src = img_node['src']
      # src is nil when the attribute is absent, hence the safe navigation.
      src = 'embedded' if src&.start_with?('data:')

      img_node.replace(src.nil? ? alt : "#{ alt } (#{ src })")
   end
end
simple_replace(doc, tag, replacement) click to toggle source
# File lib/ghostwriter/writer.rb, line 52
# Replaces each node matching the selector with its own inner HTML followed
# by the given text.
#
# @param doc the parsed document to search; matching nodes are replaced in place
# @param tag [String] the selector passed to doc.search
# @param replacement [String] the text appended after each node's inner HTML
def simple_replace(doc, tag, replacement)
   doc.search(tag).each do |element|
      unwrapped = element.inner_html + replacement

      element.replace(unwrapped)
   end
end