class Snapcrawl::Page
Constants
- EXTENSION_BLACKLIST
- PROTOCOL_BLACKLIST
Attributes
depth[R]
url[R]
Public Class Methods
new(url, depth: 0)
click to toggle source
# File lib/snapcrawl/page.rb, line 16

# Builds a page for +url+ at the given crawl +depth+.
#
# @param url [#protocolize] page address; the stored value is whatever
#   +url.protocolize+ returns (presumably the URL with a protocol ensured -
#   confirm against the extension that defines +protocolize+)
# @param depth [Integer] value exposed through the +depth+ reader (default 0)
def initialize(url, depth: 0)
  @url = url.protocolize
  @depth = depth
end
Public Instance Methods
links()
click to toggle source
# File lib/snapcrawl/page.rb, line 32

# Collects the normalized links found in the page's +<a>+ elements.
#
# @return [Array<String>, nil] the result of +normalize_links+ over every
#   +a+ node in the parsed response body, or nil when the page is not valid
def links
  return unless valid?

  document = Nokogiri::HTML(http_response.body)
  anchors = document.css('a')
  normalize_links(anchors)
end
pages()
click to toggle source
# File lib/snapcrawl/page.rb, line 38

# Wraps every link of this page in a new Page one level deeper.
#
# @return [Array<Page>, nil] one Page per entry of +links+, each created with
#   +depth + 1+, or nil when the page is not valid
def pages
  return unless valid?

  links.map do |href|
    Page.new(href, depth: depth + 1)
  end
end
path()
click to toggle source
# File lib/snapcrawl/page.rb, line 28

# Returns the +request_uri+ of the parsed page URL, computed once and cached
# in +@path+.
#
# @return [String] value of +Addressable::URI.parse(url).request_uri+
def path
  return @path if @path

  @path = Addressable::URI.parse(url).request_uri
end
save_screenshot(outfile)
click to toggle source
# File lib/snapcrawl/page.rb, line 43

# Saves a screenshot of the page to +outfile+.
#
# @param outfile [#to_s] destination, interpolated into a String before it is
#   handed to +Screenshot#save+
# @return [false, Object] false when the page is not valid; otherwise whatever
#   +Screenshot#save+ returns
def save_screenshot(outfile)
  return false unless valid?

  screenshot = Screenshot.new(url)
  screenshot.save("#{outfile}")
end
site()
click to toggle source
# File lib/snapcrawl/page.rb, line 24

# Returns the +site+ component of the parsed page URL, computed once and
# cached in +@site+.
#
# @return [String] value of +Addressable::URI.parse(url).site+
def site
  return @site if @site

  @site = Addressable::URI.parse(url).site
end
valid?()
click to toggle source
# File lib/snapcrawl/page.rb, line 20

# Tells whether the page could be fetched successfully.
#
# @return [Boolean, nil] +success?+ of the HTTP response, or nil when there is
#   no response at all
def valid?
  response = http_response
  response&.success?
end
Private Instance Methods
cache()
click to toggle source
# File lib/snapcrawl/page.rb, line 111

# Builds a new Lightly cache object configured with +Config.cache_life+.
# A fresh instance is created on every call.
#
# @return [Lightly]
def cache
  lifetime = Config.cache_life
  Lightly.new(life: lifetime)
end
http_response()
click to toggle source
# File lib/snapcrawl/page.rb, line 50

# Returns the HTTP response for this page, fetching it at most once.
#
# The result of +http_response!+ is memoized even when it is nil (a failed
# request). A plain +||=+ would treat nil as "not fetched yet" and repeat the
# request - and its error log - on every later call.
#
# @return [Object, nil] whatever +http_response!+ returned on the first call
def http_response
  return @http_response if defined?(@http_response)

  @http_response = http_response!
end
http_response!()
click to toggle source
# File lib/snapcrawl/page.rb, line 54

# Fetches the page without memoization.
#
# The request goes through +cache.get(url)+, whose block performs the actual
# +HTTParty.get+ with +httparty_options+. A non-successful response is logged
# as a warning and still returned. Any StandardError raised along the way is
# logged as an error and turned into nil.
#
# The +!name!+ tokens inside the log messages are passed through verbatim
# (presumably color/style markers interpreted by the logger - confirm).
#
# @return [Object, nil] the response, or nil when an exception was rescued
def http_response!
  fetched = cache.get(url) { HTTParty.get(url, httparty_options) }
  return fetched if fetched.success?

  $logger.warn "http error on !undpur!#{url}!txtrst!, code: !txtylw!#{fetched.code}!txtrst!, message: #{fetched.message.strip}"
  fetched
rescue => error
  $logger.error "http error on !undpur!#{url}!txtrst! - !txtred!#{error.class}!txtrst!: #{error.message}"
  nil
end
httparty_options()
click to toggle source
# File lib/snapcrawl/page.rb, line 69

# Options hash passed to +HTTParty.get+.
#
# @return [Hash] +{ verify: false }+ when +Config.skip_ssl_verification+ is
#   truthy, otherwise an empty hash
def httparty_options
  return {} unless Config.skip_ssl_verification

  { verify: false }
end
normalize_link(link)
click to toggle source
# File lib/snapcrawl/page.rb, line 84

# Turns one anchor node into an absolute, crawlable URL string.
#
# Steps, in order:
# 1. read the +href+ attribute, drop any +#fragment+ and trim whitespace;
# 2. reject empty links and links matching EXTENSION_BLACKLIST or
#    PROTOCOL_BLACKLIST;
# 3. resolve the link against this page's +url+;
# 4. reject links that do not contain this page's +site+.
#
# @param link [#attribute] node responding to +attribute('href')+
# @return [String, nil] the absolute URL, or nil when the link is rejected or
#   cannot be resolved (the failure is logged as a warning)
def normalize_link(link)
  href = link.attribute('href').to_s

  # Remove #hash. `.*` (not `.+`) so that a bare or trailing "#" is removed
  # too, instead of producing a self-link that ends in "#".
  # Trimming happens before the checks below so that padding such as
  # "file.pdf " cannot slip past the blacklists, and a blank href is
  # treated as empty rather than resolved to the page itself.
  href = href.gsub(/#.*$/, '').strip
  return nil if href.empty?

  # Remove links to specific extensions and protocols
  return nil if href =~ /\.(#{EXTENSION_BLACKLIST})(\?.*)?$/
  return nil if href =~ /^(#{PROTOCOL_BLACKLIST}):/

  # Convert relative links to absolute
  begin
    absolute = Addressable::URI.join(url, href).to_s.dup
  rescue => e
    $logger.warn "!txtred!#{e.class}!txtrst!: #{e.message} on #{path} (link: #{href})"
    return nil
  end

  # Keep only links in our base domain
  return nil unless absolute.include? site

  absolute
end
normalize_links(links)
click to toggle source
# File lib/snapcrawl/page.rb, line 73

# Normalizes a collection of anchor nodes into a list of unique URLs.
#
# @param links [Enumerable] anchor nodes, each passed to +normalize_link+
# @return [Array<String>] the normalized links in first-seen order, without
#   rejected (nil) entries and without duplicates
def normalize_links(links)
  normalized = links.map { |anchor| normalize_link(anchor) }
  normalized.compact.uniq
end