class Spidercrawl::Page
Parses the content with Nokogiri
Attributes
Public Class Methods
# File lib/spidercrawl/page.rb, line 10
# Build a page from a fetched url and the response attributes
# gathered by the fetcher. Missing options simply leave the
# corresponding ivar nil.
def initialize(url, options = {})
  @url = url
  # ivar name => options key it is populated from
  {
    code:          :response_code,
    headers:       :response_head,
    location:      :redirect_url,
    body:          :response_body,
    response_time: :response_time,
    crawled_time:  :crawled_time
  }.each do |ivar, key|
    instance_variable_set("@#{ivar}", options[key])
  end
end
Public Instance Methods
Return the base url of the page
# File lib/spidercrawl/page.rb, line 44
# Return the base url of the page, i.e. scheme plus host
# ("http://example.com"); memoized into @base_url.
def base_url
  @base_url = format('%s://%s', scheme, host)
end
Return html content as a string
# File lib/spidercrawl/page.rb, line 130
# Return the raw html content as a string ("" when the body is nil).
def content
  raw_body = @body
  raw_body.to_s
end
Return the content type of the page
# File lib/spidercrawl/page.rb, line 137
# Return the content type of the page as declared by the
# <meta http-equiv="Content-Type"> tag, or nil when the page has
# no such tag. (Previously a missing tag raised NoMethodError
# because `doc.at` returns nil.)
def content_type
  meta = doc.at("meta[@http-equiv='Content-Type']")
  meta['content'] if meta
end
Return css scripts of the page
# File lib/spidercrawl/page.rb, line 117
# Return the stylesheet nodes of the page (anything typed text/css);
# memoized into @css.
def css
  stylesheet_selector = "[@type='text/css']"
  @css = doc.search(stylesheet_selector)
end
Return the Nokogiri html document
# File lib/spidercrawl/page.rb, line 51
# Return the Nokogiri html document parsed from the response body.
# Note: re-parses @body on every call, so callers that mutate the
# returned document (e.g. #text) each get a fresh copy.
#
# Rescues StandardError rather than Exception: rescuing Exception
# would also swallow SignalException/SystemExit. Errors are reported
# on stderr and the method returns nil.
def doc
  @document = Nokogiri::HTML(@body)
rescue StandardError => e
  warn e.inspect
  warn e.backtrace.join("\n") if e.backtrace
end
Return the first email address found in the page (as a MatchData), or nil if none is present
# File lib/spidercrawl/page.rb, line 96
# Return the first email address found in the page body as a
# MatchData (nil when none is present).
# NOTE(review): despite the plural name, String#match only yields the
# first occurrence — String#scan would be needed for all of them.
def emails
  email_pattern = /[\w.!#\$%+-]+@[\w-]+(?:\.[\w-]+)+/
  @body.match(email_pattern)
end
Return the external links found in the page
# File lib/spidercrawl/page.rb, line 89
# Return the external links found in the page (links whose host
# differs from this page's host).
#
# Links whose URI cannot be parsed are skipped individually; the old
# trailing `rescue nil` returned nil for the whole list as soon as a
# single href was malformed.
def external_links
  @external_links = links.select do |link|
    begin
      URI.parse(link).host != host
    rescue URI::InvalidURIError
      false
    end
  end
end
Return the headers of the page
# File lib/spidercrawl/page.rb, line 61
# Return the response headers of the page.
# (Previously this printed the headers with `puts` and returned nil,
# so callers could never actually use the value the doc promised.)
def headers
  @headers
end
Return the url host of the page
# File lib/spidercrawl/page.rb, line 37
# Return the url host of the page (assumes @url is a URI — TODO
# confirm against the fetcher that builds these pages).
def host
  page_uri = @url
  page_uri.host
end
Return all images found in the page
# File lib/spidercrawl/page.rb, line 103
# Return the absolute urls of all images found in the page,
# de-duplicated and with empty src attributes dropped.
def images
  sources = doc.css('img').map { |img_node| img_node['src'].to_s }.uniq
  sources = sources.reject(&:empty?)
  @images = sources.map { |src| absolutify(src.strip) }
end
Return the internal links found in the page
# File lib/spidercrawl/page.rb, line 82
# Return the internal links found in the page (links whose host
# matches this page's host).
#
# Links whose URI cannot be parsed are skipped individually; the old
# trailing `rescue nil` returned nil for the whole list as soon as a
# single href was malformed.
def internal_links
  @internal_links = links.select do |link|
    begin
      URI.parse(link).host == host
    rescue URI::InvalidURIError
      false
    end
  end
end
Return all links found in the page, excluding empty links
# File lib/spidercrawl/page.rb, line 75
# Return all links found in the page as absolute urls, de-duplicated
# and with empty hrefs dropped; memoized into @links.
def links
  hrefs = doc.css('a').map { |anchor| anchor['href'].to_s }.uniq
  hrefs = hrefs.reject(&:empty?)
  @links = hrefs.map { |href| absolutify(href.strip) }
end
# File lib/spidercrawl/page.rb, line 124
# Return the content of the page's <meta name="description"> tags as
# an array of strings (empty array when none are present).
# (Previously an unimplemented stub that returned nil.)
def meta_descriptions
  @meta_descriptions = doc.css("meta[name='description']").map { |tag| tag['content'].to_s }
end
# File lib/spidercrawl/page.rb, line 121
# Return the content of the page's <meta name="keywords"> tags as an
# array of strings (empty array when none are present).
# (Previously an unimplemented stub that returned nil.)
def meta_keywords
  @meta_keywords = doc.css("meta[name='keywords']").map { |tag| tag['content'].to_s }
end
Return true if page not found
# File lib/spidercrawl/page.rb, line 160
# True when the server answered 404 Not Found.
def not_found?
  404 == @code
end
Return true if page is redirected
# File lib/spidercrawl/page.rb, line 174
# Return true if the page response is a redirection.
# The range now includes 308 Permanent Redirect (RFC 7538), which the
# original 300..307 range missed; 304 remains included for backward
# compatibility with the original behavior.
def redirect?
  (300..308).cover?(@code)
end
Return the response code
# File lib/spidercrawl/page.rb, line 153 def response_code @code end
Return the url scheme of the page (e.g. http, https, etc.)
# File lib/spidercrawl/page.rb, line 30
# Return the url scheme of the page (e.g. "http", "https"); assumes
# @url is a URI — TODO confirm against the fetcher.
def scheme
  page_uri = @url
  page_uri.scheme
end
Return true if page is fetched successfully
# File lib/spidercrawl/page.rb, line 167
# True when the page was fetched successfully (HTTP 200).
def success?
  200 == @code
end
Return plain text of the page without html tags
# File lib/spidercrawl/page.rb, line 144
# Return the plain text of the page body with scripts, styles and
# link tags removed; each line is whitespace-trimmed. Works on a
# fresh parse from #doc, so the cached document is not clobbered.
def text
  stripped = doc
  stripped.css('script, noscript, style, link').each(&:remove)
  body_lines = stripped.css('body').text.split("\n")
  @text = body_lines.map(&:strip).join("\n")
end
Return the title of the page
# File lib/spidercrawl/page.rb, line 68
# Return the title of the page ("" when no <title> tag is present);
# memoized into @title.
def title
  title_nodes = doc.css('head title')
  @title = title_nodes.inner_text
end
Return the url of the page
# File lib/spidercrawl/page.rb, line 23
# Return the url of the page as a string.
def url
  page_uri = @url
  page_uri.to_s
end
Return all words found in the page
# File lib/spidercrawl/page.rb, line 110
# Return all words found in the page text as an array of ASCII-letter
# runs (equivalent to splitting on non-letters and dropping empties).
def words
  @words = text.scan(/[a-zA-Z]+/)
end
Private Instance Methods
Return the absolute url
# File lib/spidercrawl/page.rb, line 182
# Return the absolute url for +page_url+: urls that already carry a
# scheme are returned escaped as-is, otherwise the page's base url is
# prepended.
#
# URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape
# provides the same legacy escaping behavior.
def absolutify(page_url)
  escaped = URI::DEFAULT_PARSER.escape(page_url)
  return escaped if page_url =~ /^\w*\:/i
  base_url + escaped
end