class Crawlette::Page
Constants
- MAILTO_REGEX
Attributes
uri[R]
Public Class Methods
new(html, uri)
click to toggle source
# File lib/crawlette/page.rb, line 8
# Builds a page from its raw markup and the address it was fetched from.
#
# html - the page markup; it is handed to the HTML parser lazily, on first use.
# uri  - the page's own address; other methods of this class read its
#        +host+ and +scheme+, and it is exposed through the +uri+ reader.
def initialize(html, uri)
  @uri = uri
  @html = html
end
Public Instance Methods
assets()
click to toggle source
# File lib/crawlette/page.rb, line 17
# Returns the sanitized, de-duplicated URLs of the assets the page references:
# every element carrying a +src+ attribute, every stylesheet <link>, and every
# <meta> tag whose name starts with "og:image".
#
# Assets hosted elsewhere are kept (external_links: true). The result is
# memoized in @assets, so the document is only queried once.
def assets
  @assets ||= begin
    urls = document.css('[src]').map { |node| node["src"] }
    urls += document.css('link[rel="stylesheet"][href]').map { |node| node["href"] }
    urls += document.css('meta[name^="og:image"]').map { |node| node["content"] }

    # The meta selector, unlike the two above, does not require the attribute
    # that is read from the node, so a <meta> without +content+ contributes
    # nil. Drop those before sanitizing, which calls String methods on each URL.
    sanitize_urls(urls.compact, external_links: true)
  end
end
links()
click to toggle source
# File lib/crawlette/page.rb, line 13
# Returns the sanitized URLs taken from the +href+ of every <a href> element
# in the page. sanitize_urls is called with its default options, so external
# links are filtered out. The result is memoized in @links.
def links
  @links ||= begin
    hrefs = document.css('a[href]').map { |anchor| anchor["href"] }
    sanitize_urls(hrefs)
  end
end
Private Instance Methods
document()
click to toggle source
# File lib/crawlette/page.rb, line 29
# Returns the Nokogiri document parsed from the stored HTML. Parsing happens
# on the first call only; later calls reuse the object kept in @document.
def document
  return @document if @document

  @document = Nokogiri::HTML.parse(@html)
end
sanitize_urls(urls, external_links: false)
click to toggle source
# File lib/crawlette/page.rb, line 33
# Normalizes raw attribute values into a unique list of absolute URL strings.
#
# For every entry this:
# * skips nil values and anything matching MAILTO_REGEX,
# * strips the fragment ("#...") and percent-escapes unsafe characters,
# * fills in a missing host/scheme from the page's own URI,
# * removes a single trailing slash.
#
# Relative references are only completed with the page's scheme and host;
# they are not resolved against the page's path.
#
# urls           - Array of String (or nil) attribute values.
# external_links - when false (the default) only URLs on the page's host or
#                  one of its subdomains are kept; when true every URL is kept.
#
# Returns an Array of unique URL Strings, in first-seen order.
# Raises URI::InvalidURIError if an entry cannot be parsed even after escaping.
def sanitize_urls(urls, external_links: false)
  page_host = @uri.host.to_s.downcase

  urls.compact.reject { |url| url =~ MAILTO_REGEX }.map do |url|
    # URI.escape was removed in Ruby 3.0; the default parser still provides
    # the same escaping.
    uri = URI.parse(URI::DEFAULT_PARSER.escape(url.sub(/#.*$/, '')))
    uri.host ||= @uri.host
    uri.scheme ||= @uri.scheme

    unless external_links
      # Compare hosts literally rather than interpolating the page host into
      # a regexp: there "." matched any character and no label boundary was
      # required, so "evilexample.com" counted as internal for "example.com".
      host = uri.host.to_s.downcase
      same_site = !host.empty? && (host == page_host || host.end_with?(".#{page_host}"))
      next unless same_site
    end

    uri.to_s.sub(%r{/$}, '')
  end.compact.uniq
end