class ContentUrls::HtmlParser

HtmlParser finds and rewrites URLs in HTML content.

Implementation note:

The methods in this class use Nokogiri to identify URLs. Nokogiri cleans up the HTML when it serializes the document, so expect some changes to rewritten content beyond the URLs themselves.

Public Class Methods

base(content) click to toggle source

Returns the base URL/target for all relative URLs in the HTML content.

@param [String] content the HTML content. @return [String] the URL/target found in the content.

# File lib/content_urls/parsers/html_parser.rb, line 38
# Returns the href of the first <base> element inside <head>.
#
# @param [String] content the HTML content.
# @return [String, nil] the stripped href value, or nil when content is nil,
#   cannot be parsed, has no <base href>, or the href is blank.
def self.base(content)
  document = begin
    Nokogiri::HTML(content) if content
  rescue StandardError
    nil  # best effort: content that cannot be parsed is treated as having no base
  end
  return nil unless document

  # Take only the first matching attribute: serializing the whole node set
  # would concatenate the hrefs of every <base> element in the document.
  href = document.at_xpath('//head/base/@href').to_s.strip
  href.empty? ? nil : href
end
rewrite_each_url(content) { |url| ... } click to toggle source

Rewrites each URL in the HTML content by calling the supplied block with each URL.

@param [String] content the HTML content.

@example Rewrite URLs in HTML code

html = '<html><a href="index.htm">Click me</a></html>'
html = ContentUrls::HtmlParser.rewrite_each_url(html) {|url| 'index.php'}
puts "Rewritten: #{html}"
# => "Rewritten: <html><a href="index.php">Click me</a></html>"
# File lib/content_urls/parsers/html_parser.rb, line 57
# Rewrites each URL found in the HTML content with the value returned by the
# supplied block.
#
# Each entry of @@parser_definition (defined elsewhere in this class) is
# applied in turn: its :xpath selects nodes, and its optional :attribute,
# :url_regex and :parser keys decide where the URL is read from.
#
# @param [String] content the HTML content.
# @yield [url] each URL found in the content.
# @yieldreturn [#to_s, nil] the replacement URL; nil (or an identical URL)
#   leaves the original untouched.
# @return [String, nil] the document as serialized by Nokogiri, or nil when
#   content is nil or cannot be parsed.
def self.rewrite_each_url(content, &block)
  doc = begin
    Nokogiri::HTML(content) if content
  rescue StandardError
    nil  # best effort: content that cannot be parsed yields no document
  end
  return nil unless doc

  # TODO: handle href attribute of base tag
  #  - should href URL be changed?
  #  - should relative URLs be modified using base?
  #  - how should rewritten relative URLs be handled?

  @@parser_definition.each do |_type, definition|
    attribute = definition[:attribute]

    doc.search(definition[:xpath]).each do |obj|
      # Use the tag attribute if one is configured, otherwise the serialized tag.
      value = definition.has_key?(:attribute) ? obj[attribute] : obj.to_s
      next if value.nil? || value.strip.empty?

      if definition.has_key?(:parser)
        # Delegate to the configured parser.
        # NOTE(review): the return value is discarded, so URLs rewritten by the
        # nested parser do not appear to be written back into the document.
        # Confirm against ContentUrls.rewrite_each_url before relying on it.
        ContentUrls.rewrite_each_url(value, definition[:parser]) { |url| yield url }

      elsif definition.has_key?(:attribute)

        if definition.has_key?(:url_regex)
          # The URL is the named group :url inside the attribute value.
          match = definition[:url_regex].match(value)
          next unless match

          found = match[:url]
          next if found.nil?  # the :url group did not participate: nothing to rewrite

          url = yield found
          next if url.nil? || url.to_s == found  # don't change URL

          # Replace only the :url group so any text the regex matched around
          # it (prefixes, quotes, separators) is preserved.
          obj[attribute] = value[0...match.begin(:url)] + url.to_s + value[match.end(:url)..-1]

        else
          # The whole attribute value is the URL.
          next if value =~ /^#/  # do not capture anchors within the content being parsed
          url = yield value
          next if url.nil? || url.to_s == value  # don't change URL
          obj.set_attribute(attribute, url.to_s)
        end

      else
        $stderr.puts "WARNING: unable to rewrite URL for #{value.to_s}"
      end
    end
  end

  doc.to_s
end
urls(content) click to toggle source

Returns the URLs found in the HTML content.

@param [String] content the HTML content. @return [Array] the unique URLs found in the content.

@example Parse HTML code for URLs

html = '<html><a href="index.htm">Click me</a></html>'
ContentUrls::HtmlParser.urls(html).each do |url|
  puts "Found URL: #{url}"
end
# => "Found URL: index.htm"
# File lib/content_urls/parsers/html_parser.rb, line 23
# Returns the URLs found in the HTML content.
#
# @param [String] content the HTML content.
# @return [Array] the unique URLs found, in order of first appearance; empty
#   when content is nil or cannot be parsed.
def self.urls(content)
  # rewrite_each_url parses the content itself and returns early when that
  # fails, so there is no need to parse the document a second time here.
  return [] unless content

  found = []
  # Return each URL unchanged: only the collection side effect matters here.
  rewrite_each_url(content) { |url| found << url; url }
  found.uniq
end