module Crawler::DocumentParser
Private Instance Methods
extract_assets()
click to toggle source
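Returns the unique static asset URLs found in the HTML document: the src of img and script elements, the poster of video elements, and the href of link elements.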
# File lib/crawler/document_parser.rb, line 26
# Gathers the static asset references of the parsed document.
#
# Reads the src attribute of every img and script element, the poster
# attribute of every video element and the href attribute of every link
# element. Elements lacking the attribute contribute nil, which is dropped.
# Returns the remaining values without duplicates, in first-seen order.
def extract_assets
  sources = content.css('img', 'script').map { |node| node['src'] }
  posters = content.css('video').map { |node| node['poster'] }
  hrefs   = content.css('link').map { |node| node['href'] }

  (sources | posters | hrefs).compact.uniq
end
extract_domain_specific_paths()
click to toggle source
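Returns the normalized paths of the links that belong to the crawled domain, i.e. links without a hostname or whose hostname matches the hostname of the crawled URI.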
# File lib/crawler/document_parser.rb, line 36
# Returns the paths of the links that belong to the crawled domain.
#
# Each entry of links is stripped of surrounding whitespace and parsed with
# Addressable. A link is kept when
#
# * it has no hostname and no scheme (a relative reference such as
#   "/about" or "contact.html"), or
# * its hostname equals the hostname of @uri.
#
# The path of every kept link is passed through normalize_path; nil results
# are dropped from the returned array.
#
# Links that carry a scheme but no hostname (mailto:, tel:, javascript:, ...)
# are skipped: their "path" is an opaque payload, not a path on this site.
# Links that Addressable rejects as invalid are skipped as well, so a single
# malformed href does not abort the extraction for the whole document.
def extract_domain_specific_paths
  links.map do |link|
    begin
      uri = Addressable::URI.parse(link.strip)
    rescue Addressable::URI::InvalidURIError
      next # malformed href: ignore this link and keep going
    end

    # Without a hostname the link is only site-relative if it also has no
    # scheme; with a hostname it must match the host being crawled.
    same_site = uri.hostname.nil? ? uri.scheme.nil? : uri.hostname == @uri.hostname

    normalize_path(uri.path) if same_site
  end.compact
end
extract_links()
click to toggle source
Returns the unique href values of the anchor elements in the HTML document, ignoring anchors without an href and bare "#" placeholders.
# File lib/crawler/document_parser.rb, line 20
# Collects the href attribute of every <a> element in the parsed document.
#
# Anchors without an href and anchors whose href is exactly '#' are left
# out. The remaining values are returned without duplicates, in document
# order.
def extract_links
  hrefs = content.css('a').map { |anchor| anchor['href'] }
  hrefs.reject { |href| href.nil? || href == '#' }.uniq
end
parse_content(uri)
click to toggle source
Parses the HTML from an HTTP response
# File lib/crawler/document_parser.rb, line 14
# Requests the given uri and parses whatever request returns as HTML.
#
# uri - passed through unchanged to request.
#
# Returns the document object built by Nokogiri::HTML.
def parse_content(uri)
  markup = request(uri)
  Nokogiri::HTML(markup)
end