class Seospider::Parser

Attributes

doc [R] (read-only)
html [R] (read-only)
result [R] (read-only)

Public Class Methods

new(attrs)
# File lib/seospider/parser.rb, line 10
# Fetches the page at attrs[:url], parses its body as HTML and builds the
# @result hash: response metadata is filled in here, the remaining keys
# start as empty strings and are then overwritten by #parse.
#
# attrs - Hash of options:
#         :url        - address handed to Client.get (also wrapped with URI()).
#         :user_agent - optional; falls back to 'Baiduspider'. It is given to
#                       WebRobots.new only; Client.get is called with the URL alone.
#
# Nothing is rescued here, so errors from URI(), Client.get or
# Nokogiri::HTML propagate to the caller.
def initialize(attrs)
  @url = attrs[:url]
  @uri = URI(@url)
  @user_agent = attrs[:user_agent] || 'Baiduspider'
  @webrobots = WebRobots.new(@user_agent)
  @debug = false

  # Wall-clock duration of the request only (parsing is not included).
  time_start = Time.now
  r = Client.get(@url)
  response_time = Time.now - time_start

  # to_s keeps a body-less response (nil body) from raising NoMethodError;
  # the non-bang encode returns a copy, so the response's own string is
  # never mutated (and a frozen body cannot raise).
  @html = r.body.to_s.encode('UTF-8', 'UTF-8', :invalid => :replace)
  @doc = Nokogiri::HTML(@html)

  @result = {
    :url => @url,
    :status => r.response.code.to_i,
    :location => r.headers['location'],
    :response_time => response_time,
    :canonical => '',
    :title => '',
    :meta_keywords => '',
    :meta_description => '',
    :meta_robots => '',
    :h1 => '',
    :h2 => '',
    :h3 => '',
    :links => '',
    :headers => r.headers.to_hash
  }

  parse
end

Public Instance Methods

_parse_canonical()
# File lib/seospider/parser.rb, line 96
# Returns the +href+ attribute of the first node that @doc.search finds for
# 'link[@rel="canonical"]', or nil when the document has no such node
# (instead of raising NoMethodError on nil).
def _parse_canonical
  node = @doc.search('link[@rel="canonical"]').first
  node && node['href']
end
_parse_meta_description()
# File lib/seospider/parser.rb, line 65
# Returns the +content+ attribute of the first node that @doc.search finds
# for 'meta[@name="description"]', or nil when the document has no such node
# (instead of raising NoMethodError on nil).
def _parse_meta_description
  node = @doc.search('meta[@name="description"]').first
  node && node['content']
end
_parse_meta_keywords()
# File lib/seospider/parser.rb, line 69
# Returns the +content+ attribute of the first node that @doc.search finds
# for 'meta[@name="keywords"]', or nil when the document has no such node
# (instead of raising NoMethodError on nil).
def _parse_meta_keywords
  node = @doc.search('meta[@name="keywords"]').first
  node && node['content']
end
_parse_meta_robots()
# File lib/seospider/parser.rb, line 73
# Returns the +content+ attribute of the first node that @doc.search finds
# for 'meta[@name="robots"]', or nil when the document has no such node
# (instead of raising NoMethodError on nil).
def _parse_meta_robots
  node = @doc.search('meta[@name="robots"]').first
  node && node['content']
end
_parse_title()
# File lib/seospider/parser.rb, line 61
# Returns the +content+ of the first node that @doc.search finds for
# 'title', or nil when the document has no such node (instead of raising
# NoMethodError on nil).
def _parse_title
  node = @doc.search('title').first
  node && node.content
end
parse()
# File lib/seospider/parser.rb, line 44
# Calls every method listed by #methods whose name starts with '_parse_'
# and stores each return value in @result under the method name with that
# prefix removed (e.g. _parse_title -> @result[:title]).
#
# An extractor that raises a StandardError yields nil for its key; the
# error is written with +warn+ only when @debug is truthy. Exceptions
# outside StandardError (Interrupt, SystemExit, NoMemoryError, ...) are
# deliberately not rescued so they are not silently swallowed.
#
# Returns the @result hash.
def parse
  methods.each do |m|
    name = m.to_s
    next unless name.start_with?('_parse_')

    begin
      value = send(m)
    rescue StandardError => e
      warn "#{e.class} - #{e.message} -- #{m}" if @debug
      value = nil
    end

    @result[name.sub('_parse_', '').to_sym] = value
  end

  @result
end