class Seospider::Parser
Attributes
doc[R]
html[R]
result[R]
Public Class Methods
new(attrs)
click to toggle source
# File lib/seospider/parser.rb, line 10
# Fetches the page at attrs[:url], keeps the body and its Nokogiri document,
# seeds @result with the response metadata and then runs #parse.
#
# attrs - Hash of options:
#         :url        - String URL to fetch (passed to URI() and Client.get).
#         :user_agent - String handed to WebRobots.new; defaults to
#                       'Baiduspider'.
#         :debug      - when truthy, #parse warns about extractors that
#                       raise; defaults to false.
#
# NOTE(review): Client.get is assumed to return an HTTParty-style response
# (responds to body, headers and response.code) -- confirm against Client.
def initialize(attrs)
  @url = attrs[:url]
  @uri = URI(@url)
  @user_agent = attrs[:user_agent] || 'Baiduspider'
  @webrobots = WebRobots.new(@user_agent)
  # Previously hard-coded to false, which made the debug warning in #parse
  # unreachable; callers can now opt in.
  @debug = attrs[:debug] || false

  # Monotonic clock: unlike Time.now it cannot jump when the system clock is
  # adjusted, so the measured duration is never negative or skewed.
  started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  r = Client.get(@url)
  response_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

  @html = r.body.encode!('UTF-8','UTF-8',:invalid => :replace)
  @doc = Nokogiri::HTML(@html)

  # The string placeholders below are overwritten by #parse for every key
  # that has a matching _parse_* method.
  @result = {:url => @url,
             :status => r.response.code.to_i,
             :location => r.headers['location'],
             :response_time => response_time,
             :canonical => '',
             :title => '',
             :meta_keywords => '',
             :meta_description => '',
             :meta_robots => '',
             :h1 => '',
             :h2 => '',
             :h3 => '',
             :links => '',
             :headers => r.headers.to_hash
            }
  parse
end
Public Instance Methods
_parse_canonical()
click to toggle source
# File lib/seospider/parser.rb, line 96
# Returns the href attribute of the first node matching
# link[@rel="canonical"], or nil when the document has no such node
# (previously this raised NoMethodError on nil).
def _parse_canonical
  node = @doc.search('link[@rel="canonical"]').first
  node && node['href']
end
_parse_links()
click to toggle source
# File lib/seospider/parser.rb, line 77
# Collects every <a> node of the document.
#
# Returns an Array of Hashes with the keys:
#   :href     - raw href attribute (nil when absent)
#   :text     - node content with surrounding whitespace stripped
#   :rel      - raw rel attribute (nil when absent)
#   :disallow - result of @webrobots.disallowed? for links resolving to the
#               same host as the page; nil for other hosts and for hrefs
#               that cannot be parsed as a URI
def _parse_links
  @doc.search('a').map do |a|
    href = a['href']
    disallow =
      begin
        # URI::DEFAULT_PARSER.escape is what URI.escape delegated to; URI.escape
        # itself was removed in Ruby 3.0.
        url = @uri.merge(URI::DEFAULT_PARSER.escape(href.to_s))
        @webrobots.disallowed?(url.to_s) if url.host == @uri.host
      rescue URI::Error
        # A single malformed href must not discard every other link.
        nil
      end
    {href: href, text: a.content.strip, rel: a['rel'], disallow: disallow}
  end
end
_parse_meta_description()
click to toggle source
# File lib/seospider/parser.rb, line 65
# Returns the content attribute of the first node matching
# meta[@name="description"], or nil when the document has no such node
# (previously this raised NoMethodError on nil).
def _parse_meta_description
  node = @doc.search('meta[@name="description"]').first
  node && node['content']
end
_parse_meta_keywords()
click to toggle source
# File lib/seospider/parser.rb, line 69
# Returns the content attribute of the first node matching
# meta[@name="keywords"], or nil when the document has no such node
# (previously this raised NoMethodError on nil).
def _parse_meta_keywords
  node = @doc.search('meta[@name="keywords"]').first
  node && node['content']
end
_parse_meta_robots()
click to toggle source
# File lib/seospider/parser.rb, line 73
# Returns the content attribute of the first node matching
# meta[@name="robots"], or nil when the document has no such node
# (previously this raised NoMethodError on nil).
def _parse_meta_robots
  node = @doc.search('meta[@name="robots"]').first
  node && node['content']
end
_parse_title()
click to toggle source
# File lib/seospider/parser.rb, line 61
# Returns the content of the first node matching 'title', or nil when the
# document has no such node (previously this raised NoMethodError on nil).
def _parse_title
  node = @doc.search('title').first
  node && node.content
end
parse()
click to toggle source
# File lib/seospider/parser.rb, line 44
# Calls every method of this object whose name starts with "_parse_" and
# stores its return value in @result under the method name with that prefix
# removed (e.g. _parse_title fills @result[:title]).
#
# An extractor that raises a StandardError contributes nil; the error is
# reported through warn only when @debug is truthy. Exceptions outside
# StandardError (Interrupt, SystemExit, NoMemoryError, ...) are no longer
# swallowed and propagate to the caller.
#
# Returns the @result Hash.
def parse
  methods.each do |m|
    name = m.to_s
    next unless name.start_with?('_parse_')

    value =
      begin
        send(m)
      rescue StandardError => e
        warn "#{e.class} - #{e.message} -- #{m}" if @debug
        nil
      end
    @result[name.sub(/\A_parse_/, '').to_sym] = value
  end
  @result
end