class Unmarkdown::Parser
Constants
- AUTOLINK_EMAIL_REGEX
- AUTOLINK_URL_REGEX
- BLOCK_ELEMENT_NAMES
Public Class Methods
new(html, options = {})
click to toggle source
# File lib/unmarkdown/parser.rb, line 9
# Store the input and the conversion settings for a later call to #parse.
#
# html    - the HTML source to convert (used as a String by #parse).
# options - Hash of conversion flags; the keys read elsewhere in this class
#           are :underline_headers, :fenced_code_blocks, :autolink and
#           :allow_scripts (default: {}).
def initialize(html, options = {})
  @html, @options = html, options
end
Public Instance Methods
parse()
click to toggle source
# File lib/unmarkdown/parser.rb, line 14
# Convert the HTML given to the constructor into Markdown.
#
# The HTML is wrapped in a <body> element when it does not already contain
# one, parsed with Nokogiri, and the children of <body> are rendered through
# parse_nodes. The list bookkeeping (@list, @list_position) is reset on every
# call.
#
# Returns the Markdown String with trailing whitespace removed and every run
# of three or more newlines collapsed to exactly one blank line.
def parse
  # If the HTML fragment starts with a comment, it is ignored. Add an
  # enclosing body tag to ensure everything is included.
  html = @html
  html = "<body>#{@html}</body>" unless html.include?('<body')

  # Setup document
  doc = Nokogiri::HTML(html)
  doc.encoding = 'UTF-8'

  # Reset bookkeeping
  @list = []
  @list_position = []

  # Parse the root node recursively
  root_node = doc.xpath('//body')
  markdown = parse_nodes(root_node.children)

  # Strip whitespace. Note that /\n{2}+/ is a nested repeat in Ruby
  # ((?:\n{2})+), so it only matched even-length runs and left three
  # newlines behind for odd runs; /\n{3,}/ collapses every longer run.
  markdown.rstrip.gsub(/\n{3,}/, "\n\n")
  # TODO: Strip trailing whitespace
end
Private Instance Methods
build_title(node)
click to toggle source
Build the title for links or images
# File lib/unmarkdown/parser.rb, line 150
# Build the optional title suffix used inside Markdown links and images.
#
# node - an object whose 'title' attribute is read with #[].
#
# Returns the title wrapped in double quotes and preceded by a space, or an
# empty String when the node has no title attribute.
def build_title(node)
  title = node['title']
  return '' unless title

  " \"#{title}\""
end
parse_content(node)
click to toggle source
Get the content from a node
# File lib/unmarkdown/parser.rb, line 141
# Get the content from a node.
#
# A node without children yields its own text content; otherwise the children
# are rendered through parse_nodes.
#
# node - an object responding to #children and #content.
#
# Returns the content String.
def parse_content(node)
  kids = node.children
  return node.content if kids.empty?

  parse_nodes(kids)
end
parse_nodes(nodes)
click to toggle source
Parse the children of a node
# File lib/unmarkdown/parser.rb, line 43
# Convert a collection of HTML nodes into a Markdown String.
#
# Every node is dispatched on its tag name. Content of nested nodes is
# rendered through parse_content, and tags without a Markdown equivalent are
# appended as raw HTML via #to_html. Two newlines are appended after any node
# whose name is listed in BLOCK_ELEMENT_NAMES.
#
# Reads @options (:underline_headers, :fenced_code_blocks, :autolink,
# :allow_scripts) and uses @list / @list_position as stacks describing the
# lists currently being rendered.
#
# nodes - a collection of nodes responding to #name, #children, #content,
#         #[] and #to_html; may be nil or empty.
#
# Returns the Markdown String ('' when nodes is nil or empty).
def parse_nodes(nodes)
  output = ''

  # Short-circuit if it's empty
  return output if !nodes || nodes.empty?

  # Loop through nodes
  nodes.each do |node|
    case node.name
    when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
      level = node.name[1].to_i
      if @options[:underline_headers] && level < 3
        # Underlined header: '=' for h1, '-' for h2, as long as the text
        content = parse_content(node)
        underline = level == 1 ? '=' : '-'
        output << content << "\n" << underline * content.length
      else
        output << "#{'#' * level} #{parse_content(node)}"
      end
    when 'blockquote'
      parse_content(node).split("\n").each do |line|
        output << "> #{line}\n"
      end
    when 'ul', 'ol'
      # A nested list starts on its own paragraph inside the parent item
      output << "\n\n" unless @list.empty?

      unordered = node.name == 'ul'
      if unordered
        @list << :unordered
      else
        @list << :ordered
        @list_position << 0
      end

      output << parse_nodes(node.children)

      @list.pop
      @list_position.pop unless unordered
    when 'li'
      # Indent four spaces per nesting level so Markdown treats the item as
      # part of a nested list (a single space is not enough).
      (@list.count - 1).times { output << '    ' }

      # An <li> outside of any list has no counter to increment; render it
      # as a bullet instead of failing on an empty @list_position.
      if @list.empty? || @list.last == :unordered
        output << "* #{parse_content(node)}"
      else
        num = (@list_position[-1] += 1)
        # Number and text must share a line to form one list item
        output << "#{num}. #{parse_content(node)}"
      end
    when 'pre'
      content = parse_content(node)

      if @options[:fenced_code_blocks]
        output << "```\n#{content}\n```"
      else
        # Indented code blocks require four leading spaces on every line
        content.split("\n").each do |line|
          output << "    #{line}\n"
        end
      end
    when 'hr'
      output << "---\n\n"
    when 'a'
      output << "[#{parse_content(node)}](#{node['href']}#{build_title(node)})"
    when 'i', 'em'
      output << "*#{parse_content(node)}*"
    when 'b', 'strong'
      output << "**#{parse_content(node)}**"
    when 'u'
      output << "_#{parse_content(node)}_"
    when 'mark'
      output << "==#{parse_content(node)}=="
    when 'code'
      output << "`#{parse_content(node)}`"
    when 'img'
      output << "![#{node['alt']}](#{node['src']}#{build_title(node)})"
    when 'text'
      content = parse_content(node)

      # Optionally look for links
      if @options[:autolink]
        content.gsub!(AUTOLINK_URL_REGEX, '<\1>')
        content.gsub!(AUTOLINK_EMAIL_REGEX, '<\1>')
      end

      output << content
    when 'script'
      # Scripts are dropped unless explicitly allowed
      next unless @options[:allow_scripts]

      output << node.to_html
    when 'p'
      output << parse_content(node)
    else
      # If it's an unsupported node or a node that just contains text, just
      # append it
      output << node.to_html
    end

    output << "\n\n" if BLOCK_ELEMENT_NAMES.include?(node.name)
  end

  output
end