class Spidercrawl::Page

Parses the content with Nokogiri

Attributes

crawled_time[RW]
location[R]
response_time[R]

Public Class Methods

new(url, options = {}) click to toggle source
# File lib/spidercrawl/page.rb, line 10
# Builds a page from a URL and the result of fetching it.
#
# url     - stored as-is in @url (presumably a URI-like object — confirm with caller).
# options - Hash read for the keys :response_code, :response_head,
#           :redirect_url, :response_body, :response_time and :crawled_time.
#           Missing keys leave the matching attribute nil.
def initialize(url, options = {})
  @url = url
  @code, @headers, @location, @body, @response_time, @crawled_time =
    options.values_at(:response_code, :response_head, :redirect_url,
                      :response_body, :response_time, :crawled_time)
end

Public Instance Methods

base_url() click to toggle source

Return the base url of the page

# File lib/spidercrawl/page.rb, line 44
# Returns "<scheme>://<host>" for this page and stores it in @base_url.
# The value is rebuilt on every call; path, port and query are not included.
def base_url
  @base_url = format('%s://%s', scheme, host)
end
content() click to toggle source

Return html content as a string

# File lib/spidercrawl/page.rb, line 130
# Returns the stored response body converted with #to_s
# (an empty String when no body was stored).
def content
  raw_body = @body
  raw_body.to_s
end
content_type() click to toggle source

Return the content type declared in the page's Content-Type meta tag

# File lib/spidercrawl/page.rb, line 137
# Returns the value of the page's <meta http-equiv="Content-Type"> tag.
#
# Returns the tag's "content" attribute, or nil when the document could not
# be obtained from #doc or the page carries no such meta tag.
def content_type
  document = doc
  meta = document && document.at("meta[@http-equiv='Content-Type']")
  # Guard against a missing tag instead of calling [] on nil.
  meta && meta['content']
end
css() click to toggle source

Return the nodes of the page whose type attribute is "text/css"

# File lib/spidercrawl/page.rb, line 117
# Searches the parsed document for nodes whose type attribute is "text/css",
# stores the result in @css and returns it.
def css
  stylesheet_nodes = doc.search("[@type='text/css']")
  @css = stylesheet_nodes
end
doc() click to toggle source

Return the Nokogiri html document

# File lib/spidercrawl/page.rb, line 51
# Parses @body with Nokogiri::HTML and returns the resulting document.
#
# The body is parsed afresh on every call and the result stored in @document.
# Parse failures (StandardError) are printed to stdout together with their
# backtrace and nil is returned. Non-standard exceptions such as Interrupt or
# SystemExit are deliberately not rescued so the process can still be stopped.
def doc
  @document = Nokogiri::HTML(@body)
rescue StandardError => e
  puts e.inspect
  puts e.backtrace
  nil
end
emails() click to toggle source

Return the first email address found in the page (as a MatchData), or nil if there is none

# File lib/spidercrawl/page.rb, line 96
# Looks for an email address in the page body.
#
# Returns the MatchData for the FIRST address found, or nil when there is
# none. Only one match is reported despite the plural name.
# The body is coerced with #to_s so a page without a body yields nil instead
# of raising NoMethodError.
def emails
  @body.to_s.match(/[\w.!#\$%+-]+@[\w-]+(?:\.[\w-]+)+/)
end
headers() click to toggle source

Return the headers of the page

# File lib/spidercrawl/page.rb, line 61
# Returns the stored response headers (@headers), or nil when none were set.
# The value is returned to the caller rather than printed, so it can be used.
def headers
  @headers
end
host() click to toggle source

Return the url host of the page

# File lib/spidercrawl/page.rb, line 37
# Returns the host component reported by the page URL object (@url.host).
def host
  page_uri = @url
  page_uri.host
end
images() click to toggle source

Return all images found in the page

# File lib/spidercrawl/page.rb, line 103
# Collects the image URLs referenced by <img> tags in the page.
#
# Each src attribute is stripped of surrounding whitespace BEFORE blank values
# are dropped and duplicates removed, so " a.png" and "a.png" count as one
# image and a whitespace-only src is ignored. The remaining values are passed
# through #absolutify.
#
# Returns an Array of Strings and stores it in @images.
def images
  sources = doc.css('img').map { |img| img['src'].to_s.strip }
  @images = sources.reject(&:empty?).uniq.map { |src| absolutify(src) }
end
meta_descriptions() click to toggle source
# File lib/spidercrawl/page.rb, line 124
# TODO: not implemented yet — this method has no body and always returns nil.
def meta_descriptions
  nil
end
meta_keywords() click to toggle source
# File lib/spidercrawl/page.rb, line 121
# TODO: not implemented yet — this method has no body and always returns nil.
def meta_keywords
  nil
end
not_found?() click to toggle source

Return true if page not found

# File lib/spidercrawl/page.rb, line 160
# Returns true when the stored response code equals 404, false otherwise.
def not_found?
  status = @code
  status == 404
end
redirect?() click to toggle source

Return true if page is redirected

# File lib/spidercrawl/page.rb, line 174
# Returns true when the stored response code is in the 300..308 range.
#
# The upper bound includes 308 (Permanent Redirect), so pages answered with
# that status are recognised as redirects too. A nil or non-numeric code
# yields false.
def redirect?
  (300..308).cover?(@code)
end
response_code() click to toggle source

Return the response code

# File lib/spidercrawl/page.rb, line 153
# Returns the stored response code (@code); nil when none was set.
def response_code
  @code
end
scheme() click to toggle source

Return the url scheme of the page (e.g. http, https, etc.)

# File lib/spidercrawl/page.rb, line 30
# Returns the scheme component reported by the page URL object (@url.scheme),
# e.g. "http" or "https".
def scheme
  page_uri = @url
  page_uri.scheme
end
success?() click to toggle source

Return true if page is fetched successfully

# File lib/spidercrawl/page.rb, line 167
# Returns true only when the stored response code equals 200.
# Other 2xx codes are not treated as success here.
def success?
  status = @code
  status == 200
end
text() click to toggle source

Return plain text of the page without html tags

# File lib/spidercrawl/page.rb, line 144
# Returns the visible text of the page body.
#
# Obtains a document from #doc, removes every script, noscript, style and
# link node from it, then takes the text of <body>, strips each line and
# re-joins the lines with "\n". The result is stored in @text.
# Note: the node removal mutates the document object returned by #doc.
def text
  page = doc
  page.css('script, noscript, style, link').each(&:remove)
  stripped_lines = page.css('body').text.split("\n").map(&:strip)
  @text = stripped_lines.join("\n")
end
title() click to toggle source

Return the title of the page

# File lib/spidercrawl/page.rb, line 68
# Returns the inner text of the nodes matching 'head title' in the parsed
# document and stores it in @title.
def title
  title_nodes = doc.css('head title')
  @title = title_nodes.inner_text
end
url() click to toggle source

Return the url of the page

# File lib/spidercrawl/page.rb, line 23
# Returns the page URL converted to a String with #to_s.
def url
  page_uri = @url
  page_uri.to_s
end
words() click to toggle source

Return all words found in the page

# File lib/spidercrawl/page.rb, line 110
# Returns the words of the page text as an Array of Strings and stores it in
# @words. A word is a maximal run of ASCII letters (a-z, A-Z); digits,
# punctuation, underscores and whitespace all act as separators.
def words
  @words = text.scan(/[a-zA-Z]+/)
end

Private Instance Methods

absolutify(page_url) click to toggle source

Return the absolute url

# File lib/spidercrawl/page.rb, line 182
# Converts a link found in the page into an absolute, percent-escaped URL.
#
# page_url - String link as found in the page (absolute, root-relative,
#            path-relative or protocol-relative).
#
# A link that already starts with "<scheme>:" is only escaped. Any other link
# is resolved against the page's own URL (@url) with URI.join, so
# "pic.png" on http://host/dir/page.html becomes http://host/dir/pic.png and
# "//cdn/x.js" inherits the page's scheme. When that resolution is impossible
# (the page URL or the link is not a parseable URI) the escaped link is
# appended to #base_url instead.
#
# Returns the absolute URL as a String.
def absolutify(page_url)
  # URI.escape no longer exists on Ruby 3; the RFC 2396 parser provides the
  # same escaping on both old and new rubies.
  parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
  escaped = parser.escape(page_url)
  # \A anchors at the start of the string (^ would match after any newline).
  return escaped if page_url =~ /\A\w*:/

  begin
    URI.join(@url.to_s, escaped).to_s
  rescue URI::Error
    base_url + escaped
  end
end