module Crawler::DocumentParser

Private Instance Methods

extract_assets() click to toggle source

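Returns the unique static asset references found in the HTML document: the `src` of `img` and `script` elements, the `poster` of `video` elements, and the `href` of `link` elements.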

# File lib/crawler/document_parser.rb, line 26
# Gathers asset references from `content`.
#
# Reads the `src` attribute of img/script elements, the `poster` attribute
# of video elements and the `href` attribute of link elements, in that order.
# Elements lacking the attribute are dropped, and repeated values are
# reported once (first occurrence wins).
#
# @return [Array] the distinct attribute values that were found
def extract_assets
  attribute_by_selectors = [
    [%w[img script], 'src'],
    [%w[video], 'poster'],
    [%w[link], 'href']
  ]

  found = attribute_by_selectors.flat_map do |selectors, attribute|
    content.css(*selectors).map { |node| node[attribute] }
  end

  found.compact.uniq
end
extract_domain_specific_paths() click to toggle source

Returns the normalized paths of the links that either have no host or point to the same host as `@uri`.

# File lib/crawler/document_parser.rb, line 36
# Maps every entry of `links` to a normalized path, keeping only the links
# that stay on this document's host.
#
# A link is kept when its parsed URI has no hostname at all, or when that
# hostname equals `@uri.hostname`. Kept links are passed through
# `normalize_path`; everything else (and any nil result) is discarded.
#
# @return [Array] the normalized paths of the kept links
def extract_domain_specific_paths
  candidate_paths = links.map do |href|
    target = Addressable::URI.parse(href.strip)
    # Compare against @uri only when the link actually names a host.
    same_site = target.hostname.nil? || target.hostname == @uri.hostname

    normalize_path(target.path) if same_site
  end

  candidate_paths.compact
end
parse_content(uri) click to toggle source

Parses the HTML from an HTTP response: the result of `request(uri)` is handed to `Nokogiri::HTML`.

# File lib/crawler/document_parser.rb, line 14
# Calls `request` for the given URI and feeds whatever it returns to
# Nokogiri's HTML parser.
#
# @param uri the location handed to `request`
# @return the value produced by `Nokogiri::HTML`
def parse_content(uri)
  fetched = request(uri)
  Nokogiri::HTML(fetched)
end