class Crawlette::Page

Constants

MAILTO_REGEX

Attributes

uri[R]

Public Class Methods

new(html, uri) click to toggle source
# File lib/crawlette/page.rb, line 8
# Builds a page from its raw markup and the location it was fetched from.
#
# html - the markup of the page; kept as-is and parsed lazily on demand.
# uri  - the page's own address. Other methods read +host+ and +scheme+
#        from it (presumably a URI instance - confirm against callers).
def initialize(html, uri)
  @html, @uri = html, uri
end

Public Instance Methods

assets() click to toggle source
# File lib/crawlette/page.rb, line 17
# Collects the URLs of the assets referenced by the page.
#
# Three sources are gathered: every element carrying a +src+ attribute,
# stylesheet +link+ elements carrying an +href+, and +meta+ elements whose
# +name+ starts with "og:image" (their +content+ attribute).
#
# The raw values are passed through +sanitize_urls+ with
# +external_links: true+, and the result is memoised in +@assets+.
#
# Returns whatever +sanitize_urls+ returns for the collected values.
def assets
  @assets ||= begin
    urls = document.css('[src]').map { |node| node["src"] }
    urls += document.css('link[rel="stylesheet"][href]').map { |node| node["href"] }
    # NOTE(review): the prefix match also selects names such as
    # "og:image:width", whose content is not a URL - confirm this is intended.
    urls += document.css('meta[name^="og:image"]').map { |node| node["content"] }

    # Unlike the two selectors above, the meta selector does not require the
    # attribute being read, so a tag without +content+ yields nil. Drop those
    # before sanitising, which calls String methods on each entry.
    sanitize_urls(urls.compact, external_links: true)
  end
end

Private Instance Methods

document() click to toggle source
# File lib/crawlette/page.rb, line 29
# Returns the parsed form of the stored markup.
#
# The markup in +@html+ is parsed with Nokogiri::HTML.parse on first use and
# the result is cached in +@document+, so later calls reuse the same object.
def document
  return @document if @document

  @document = Nokogiri::HTML.parse(@html)
end
sanitize_urls(urls, external_links: false) click to toggle source
# File lib/crawlette/page.rb, line 33
# Normalises a list of raw URL strings found in the page.
#
# For each entry: nil values and entries matching MAILTO_REGEX are dropped,
# the fragment ("#...") is stripped, the remainder is percent-escaped and
# parsed, and a missing host or scheme is filled in from the page's own
# +@uri+. One trailing slash is removed from the resulting string.
#
# urls           - Array of String (nil entries are ignored).
# external_links - when false (the default) only URLs whose host is the
#                  page's host or one of its subdomains are kept; when true
#                  every URL is kept.
#
# Returns an Array of unique URL Strings.
# Raises URI::InvalidURIError if an entry cannot be parsed.
#
# NOTE(review): opaque URIs such as "data:" or "tel:" appear to raise when a
# host is assigned to them - confirm whether they should be skipped instead.
def sanitize_urls(urls, external_links: false)
  # URI.escape was removed in Ruby 3.0. It delegated to the RFC 2396 parser's
  # #escape, so calling that parser directly keeps the same escaping rules.
  escaper   = URI::RFC2396_Parser.new
  base_host = @uri.host

  urls.compact
    .reject { |url| url =~ MAILTO_REGEX }
    .map { |url| URI.parse(escaper.escape(url.sub(/#.*$/, ''))) }
    .map do |uri|
      uri.host   ||= base_host
      uri.scheme ||= @uri.scheme

      # Compare hosts literally: interpolating the host into a regex let "."
      # match any character and let "notexample.com" pass for "example.com".
      host     = uri.host
      internal = host && (host == base_host || host.end_with?(".#{base_host}"))

      uri.to_s.sub(/\/$/, '') if external_links || internal
    end.compact.uniq
end