class Ghostwriter::Writer
Main Ghostwriter
converter object.
Attributes
heading_marker[R]
link_base[R]
ol_marker[R]
table_column[R]
table_corner[R]
table_row[R]
ul_marker[R]
Public Class Methods
new(link_base: '', heading_marker: '--', ul_marker: '-', ol_marker: '1', table_column: '|', table_row: '-', table_corner: '|')
click to toggle source
Creates a new ghostwriter
@param [String] link_base
the URL to prefix relative links with
# File lib/ghostwriter/writer.rb, line 11
# Builds a writer from the given rendering options and freezes it, so the
# configuration cannot be changed after construction.
#
# @param link_base [String] the URL to prefix relative links with
# @param heading_marker [String] marker placed on both sides of heading text
# @param ul_marker [String] presumably the bullet for unordered lists (consumer not shown here)
# @param ol_marker [String] presumably the first marker for ordered lists (consumer not shown here)
# @param table_column [String] presumably the table column separator (consumer not shown here)
# @param table_row [String] presumably the table row separator (consumer not shown here)
# @param table_corner [String] presumably the table corner character (consumer not shown here)
def initialize(link_base: '', heading_marker: '--', ul_marker: '-', ol_marker: '1',
               table_column: '|', table_row: '-', table_corner: '|')
  # Link handling
  @link_base = link_base

  # Heading and list markers
  @heading_marker, @ul_marker, @ol_marker = heading_marker, ul_marker, ol_marker

  # Table drawing characters
  @table_column, @table_row, @table_corner = table_column, table_row, table_corner

  freeze
end
Public Instance Methods
textify(html)
click to toggle source
Strips HTML down to plain text.
@param html [String] the HTML to be converted to text
@return converted text
# File lib/ghostwriter/writer.rb, line 29
# Strips HTML down to plain text.
#
# Whitespace runs are collapsed to single spaces before parsing, style and
# script elements are dropped, and the remaining markup is rewritten into
# text step by step. The order of the steps is significant and is kept as is.
#
# @param html [String] the HTML to be converted to text
# @return [String] the converted text
def textify(html)
  collapsed = html.gsub(/\s+/, ' ')
  doc = Nokogiri::HTML(collapsed)
  doc.search('style, script').remove

  replace_anchors(doc)
  replace_images(doc)
  simple_replace(doc, '*[role="presentation"]', "\n")
  replace_headers(doc)
  replace_lists(doc)
  replace_tables(doc)

  # Remaining block-level tags simply gain a textual suffix, in this order.
  [
    ['hr', "\n----------\n\n"],
    ['br', "\n"],
    ['p', "\n\n"]
  ].each { |tag, suffix| simple_replace(doc, tag, suffix) }

  normalize_lines(doc)
end
Private Instance Methods
get_link_base(doc)
click to toggle source
# File lib/ghostwriter/writer.rb, line 78
# Determines the prefix to use for relative links in the document.
#
# @param doc [#search] the parsed document to inspect
# @return the href of the document's <base> element when it has one,
#   otherwise the configured @link_base
def get_link_base(doc)
  # <base> node is unique by W3C spec
  base_node = doc.search('base').first

  # A <base> element may carry no href at all (for example only a target);
  # fall back to the configured base instead of handing nil to callers.
  declared = base_node && base_node['href']
  declared || @link_base
end
get_link_target(link_node, base)
click to toggle source
# File lib/ghostwriter/writer.rb, line 85
# Resolves the target a link node points at.
#
# @param link_node [#[]] node whose 'href' attribute is read
# @param base the prefix for relative targets; nil is treated as no prefix
# @return [URI, String] the parsed URI when the href is absolute, the base
#   joined with the href when it is relative, or the raw href with a leading
#   tel:/mailto: scheme removed when it cannot be parsed as a URI
def get_link_target(link_node, base)
  # A missing href is treated like an empty one: URI(nil) raises
  # ArgumentError, which the rescue below does not cover.
  raw = link_node['href'].to_s
  href = URI(raw)

  if href.absolute?
    href
  else
    # Only nil is substituted so that any other base keeps its own `+`.
    (base || '') + href.to_s
  end
rescue URI::InvalidURIError
  raw.gsub(/^(tel|mailto):/, '').strip
end
link_matches(first, second)
click to toggle source
# File lib/ghostwriter/writer.rb, line 74
# Tells whether two links are the same once the http(s) scheme and one
# trailing slash are ignored.
#
# @param first [#to_s] a link; converted with to_s before comparison
# @param second [String] the link text to compare against
# @return [Boolean] true when both normalise to the same string
def link_matches(first, second)
  bare = ->(link) { link.gsub(%r{^https?://}, '').chomp('/') }

  bare.call(first.to_s) == bare.call(second)
end
normalize_lines(doc)
click to toggle source
# File lib/ghostwriter/writer.rb, line 58
# Extracts the document's text and tidies it line by line.
#
# Surrounding whitespace is removed from the whole text and from every
# individual line; the result always ends with exactly one newline.
#
# @param doc [#text] the document whose text is extracted
# @return [String] the cleaned-up text
def normalize_lines(doc)
  trimmed_lines = doc.text.strip.lines.map(&:strip)

  "#{ trimmed_lines.join("\n") }\n"
end
replace_anchors(doc)
click to toggle source
# File lib/ghostwriter/writer.rb, line 62
# Rewrites every link so that its target is visible in the text.
#
# A link whose text already equals its target (ignoring scheme and trailing
# slash) becomes just the target; any other link becomes "text (target)".
#
# @param doc [#search] the parsed document, modified in place
def replace_anchors(doc)
  # The base is the same for every link, so it is resolved once up front.
  base = get_link_base(doc)

  # Anchors without an href (e.g. <a name="...">) have no target to show;
  # selecting only a[href] keeps them out of target resolution entirely.
  doc.search('a[href]').each do |link_node|
    href = get_link_target(link_node, base)
    label = link_node.inner_html

    link_node.inner_html = if link_matches(href, label)
                             href.to_s
                           else
                             "#{ label } (#{ href })"
                           end
  end
end
replace_headers(doc)
click to toggle source
# File lib/ghostwriter/writer.rb, line 96
# Replaces heading elements with their content wrapped in heading markers.
#
# <header> and <h1> use the marker once; <h2> to <h6> repeat it as many
# times as the heading level. Runs of spaces in the result are squeezed
# into a single space.
#
# @param doc [#search] the parsed document, modified in place
def replace_headers(doc)
  decorate = lambda do |node, marker|
    node.replace("#{ marker } #{ node.inner_html } #{ marker }\n".squeeze(' '))
  end

  doc.search('header, h1').each { |node| decorate.call(node, @heading_marker) }

  (2..6).each do |level|
    doc.search("h#{ level }").each { |node| decorate.call(node, @heading_marker * level) }
  end
end
replace_images(doc)
click to toggle source
# File lib/ghostwriter/writer.rb, line 110
# Replaces images with a textual description built from their alt text.
#
# Images marked role=presentation are removed outright. Every other image
# with non-empty alt text becomes "alt (src)", where a data: URI source is
# shown as "embedded". Images without alt text are not replaced.
#
# @param doc [#search] the parsed document, modified in place
def replace_images(doc)
  doc.search('img[role=presentation]').remove

  doc.search('img').each do |img_node|
    alt = img_node['alt']
    next if alt.nil? || alt.empty?

    src = img_node['src']
    # An <img> may have no src attribute; guard before inspecting it.
    src = 'embedded' if src && src.start_with?('data:')

    # With no source there is nothing to put in parentheses, so only the
    # alt text is emitted.
    img_node.replace(src ? "#{ alt } (#{ src })" : alt)
  end
end
simple_replace(doc, tag, replacement)
click to toggle source
# File lib/ghostwriter/writer.rb, line 52
# Replaces every node matching the selector with its own inner HTML
# followed by the given text.
#
# @param doc [#search] the parsed document, modified in place
# @param tag [String] selector for the nodes to replace
# @param replacement [String] text appended after each node's inner HTML
def simple_replace(doc, tag, replacement)
  doc.search(tag).each do |element|
    content_with_suffix = element.inner_html + replacement
    element.replace(content_with_suffix)
  end
end