class ContentUrls::HtmlParser
HtmlParser
finds and rewrites URLs in HTML content.
Implementation note:
The methods in this class use Nokogiri to identify URLs. Nokogiri cleans up HTML code when rewriting, so expect some changes to the rewritten content.
Public Class Methods
base(content)
click to toggle source
Returns the base URL/target for all relative URLs in the HTML content.
@param [String] content the HTML content. @return [String, nil] the URL/target found in the content, or nil if the content has none or cannot be parsed.
# File lib/content_urls/parsers/html_parser.rb, line 38
#
# Returns the href of the document's <base> element, if there is one.
#
# @param [String] content the HTML content.
# @return [String, nil] the whitespace-stripped href of the first
#   //head/base element, or nil when content is nil, cannot be parsed,
#   or carries no non-empty base href.
def self.base(content)
  # Best-effort parse: nil content and parser errors both yield nil.
  doc = begin
    Nokogiri::HTML(content) if content
  rescue StandardError
    nil
  end
  return nil unless doc

  # Take only the first match. Stringifying the whole node set would run the
  # hrefs of several <base> elements together into one meaningless string.
  href_node = doc.search('//head/base/@href').first
  return nil unless href_node

  href = href_node.to_s.strip
  href.empty? ? nil : href
end
rewrite_each_url(content) { |url| ... }
click to toggle source
Rewrites each URL in the HTML content by calling the supplied block with each URL.
@param [String] content the HTML content. @return [String, nil] the rewritten HTML, or nil if the content is nil or cannot be parsed.
@example Rewrite URLs in HTML code
html = '<html><a href="index.htm">Click me</a></html>' html = ContentUrls::HtmlParser.rewrite_each_url(html) {|url| 'index.php'} puts "Rewritten: #{html}" # => "Rewritten: <html><a href="index.php">Click me</a></html>"
# File lib/content_urls/parsers/html_parser.rb, line 57
#
# Rewrites each URL in the HTML content by yielding it to the supplied block
# and substituting the block's return value.
#
# For every entry in @@parser_definition the document is searched with the
# entry's :xpath. The value examined is the node's :attribute when one is
# defined, otherwise the node's own string form. Depending on the entry:
#   * :parser     - the value is handed to ContentUrls.rewrite_each_url
#   * :url_regex  - the named capture +url+ inside the attribute is rewritten
#   * otherwise   - the whole attribute value is treated as the URL
#
# A URL is left untouched when the block returns nil or the same URL.
#
# @param [String] content the HTML content.
# @yield [url] each URL found in the content.
# @yieldreturn [#to_s, nil] the replacement URL, or nil to keep the original.
# @return [String, nil] the document serialized by Nokogiri, or nil when
#   content is nil or cannot be parsed.
def self.rewrite_each_url(content, &block)
  # Best-effort parse: nil content and parser errors both yield nil.
  doc = begin
    Nokogiri::HTML(content) if content
  rescue StandardError
    nil
  end
  return nil unless doc

  # TODO: handle href attribute of base tag
  #   - should href URL be changed?
  #   - should relative URLs be modified using base?
  #   - how should rewritten relative URLs be handled?
  @@parser_definition.each do |_type, definition|
    attribute = definition[:attribute]

    doc.search(definition[:xpath]).each do |obj|
      # Use the tag attribute if one is defined, otherwise the tag itself.
      value = definition.has_key?(:attribute) ? obj[attribute] : obj.to_s
      next if value.nil? || value.strip.empty?

      if definition.has_key?(:parser)
        # Delegate to the nested parser.
        # NOTE(review): the return value is discarded, so anything the nested
        # parser rewrites is not written back into the document - confirm
        # whether that is intended.
        ContentUrls.rewrite_each_url(value, definition[:parser]) { |url| yield url }
      elsif definition.has_key?(:url_regex)
        # The URL is embedded in the attribute; locate it via the named capture.
        match = definition[:url_regex].match(value)
        next unless match

        found = match[:url]
        next if found.nil? # capture group did not participate in the match

        url = yield found
        next if url.nil? || url.to_s == found # don't change URL

        # Replace only the captured URL so any other text the regex matched
        # around it is preserved.
        obj[attribute] = value[0, match.begin(:url)] + url.to_s + value[match.end(:url)..-1]
      elsif definition.has_key?(:attribute)
        # The attribute value is the URL itself.
        next if value.start_with?('#') # do not capture anchors within the content being parsed

        url = yield value
        # Skip nil and unchanged results. An empty result is also skipped,
        # which is what this method has always done for plain attributes.
        next if url.nil? || url.to_s.empty? || url.to_s == value

        obj.set_attribute(attribute, url.to_s)
      else
        $stderr.puts "WARNING: unable to rewrite URL for #{value.to_s}"
      end
    end
  end

  doc.to_s
end
urls(content)
click to toggle source
Returns the URLs found in the HTML content.
@param [String] content the HTML content. @return [Array] the unique URLs found in the content.
@example Parse HTML code for URLs
html = '<html><a href="index.htm">Click me</a></html>' ContentUrls::HtmlParser.urls(html).each do |url| puts "Found URL: #{url}" end # => "Found URL: index.htm"
# File lib/content_urls/parsers/html_parser.rb, line 23
#
# Returns the URLs found in the HTML content.
#
# Collection is done by running rewrite_each_url with a block that records
# each URL and hands it back unchanged.
#
# @param [String] content the HTML content.
# @return [Array] the unique URLs found in the content, in order of first
#   appearance; empty when content is nil or cannot be parsed.
def self.urls(content)
  found = []
  # No separate parse is needed up front: rewrite_each_url returns nil without
  # yielding when the content is nil or unparsable, which leaves the list empty.
  rewrite_each_url(content) do |url|
    found << url
    url
  end
  found.uniq
end