class RailsSpider::Mechanize
Attributes
logger[RW]
mechanize[RW]
Public Class Methods
new()
click to toggle source
Calls superclass method
RailsSpider::Fetcher::new
# File lib/rails_spider/fetchers/mechanize.rb, line 8
# Sets up the fetcher after running the superclass initializer.
#
# Creates a Mechanize agent, sets its open timeout to 20 (presumably
# seconds -- confirm against the Mechanize docs), and makes the parser
# registered for 'text/html' the default pluggable parser. Also creates a
# logger that writes to STDOUT.
def initialize
  super
  agent = ::Mechanize.new
  agent.open_timeout = 20
  # Reuse the HTML parser as the fallback for every other content type.
  parsers = agent.pluggable_parser
  parsers.default = parsers['text/html']
  @mechanize = agent
  @logger = Logger.new(STDOUT)
end
Public Instance Methods
body(url)
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 20
# Fetches +url+ via #page and returns the result of searching the fetched
# page for 'body'.
def body(url)
  fetched = page(url)
  fetched.search('body')
end
change_another_proxy(proxy_hash=nil, header_hash=nil)
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 34
# Points the mechanize agent at a proxy and optionally replaces its
# request headers.
#
# proxy_hash  - Hash with :ip and :port. When it is nil or lacks either
#               key, a random entry of @proxy is used instead.
# header_hash - request headers to assign to the agent; left untouched
#               when nil.
#
# Raises ArgumentError when no usable proxy_hash is given and @proxy is
# nil or empty. Previously this case failed with a NoMethodError on nil,
# because rand(0) yields a Float and indexing an empty pool returns nil.
#
# Returns header_hash when it was applied, nil otherwise.
def change_another_proxy(proxy_hash=nil, header_hash=nil)
  chosen =
    if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
      proxy_hash
    else
      # Array() tolerates a nil pool; sample returns nil for an empty one.
      Array(@proxy).sample
    end
  raise ArgumentError, 'no proxy given and the proxy pool is empty' unless chosen

  @mechanize.set_proxy chosen[:ip], chosen[:port]
  @mechanize.request_headers = header_hash unless header_hash.nil?
end
grab_update()
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 63
# Walks every entry of @newlinks and hands its 'url' to grab_list_link,
# logging a line before and after the pass.
#
# @city is overwritten whenever an entry carries a non-blank 'city';
# otherwise the value from an earlier entry stays in effect.
def grab_update
  logger.info "Start #{self.class} Spider grab_update."
  @newlinks.each do |entry|
    city = entry['city']
    @city = city unless city.blank?
    grab_list_link(entry['url'])
  end
  logger.info "End of #{self.class} Spider grab_update."
end
is_existed?(event_hash)
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 74
# Reports whether an event with the same :url as +event_hash+ is already
# stored.
#
# Returns false when event_hash has no :url or when event_class has no
# record with that url. Otherwise logs a warning naming the stored
# record's id and returns true.
def is_existed?(event_hash)
  return false unless event_hash[:url]

  event = event_class.where(url: event_hash[:url]).first
  return false unless event

  logger.warn "#{event_hash[:url]} has been exist in #{event.id}"
  true
end
is_grab?(url)
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 48
# Asks event_class whether any record with the given +url+ exists and
# returns the result of that exists? query.
def is_grab?(url)
  matching = event_class.where(url: url)
  matching.exists?
end
links(url)
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 24
# Fetches +url+ via #page and returns one string per link on the page:
# the link's resolved URI, or '' when resolving it raises
# Mechanize::UnsupportedSchemeError.
def links(url)
  anchors = page(url).links
  anchors.map do |anchor|
    begin
      anchor.resolved_uri.to_s
    rescue ::Mechanize::UnsupportedSchemeError
      # Keep one entry per link so positions still line up with the page.
      ''
    end
  end
end
page(url)
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 16
# Issues a GET for +url+ through the mechanize accessor and returns
# whatever that call returns.
def page(url)
  agent = mechanize
  agent.get(url)
end
run()
click to toggle source
# File lib/rails_spider/fetchers/mechanize.rb, line 52
# Walks every entry of @links and passes the entry's first key to
# grab_list_link, logging a line before and after the pass.
def run
  logger.info "Start #{self.class} Spider..."
  @links.each do |link|
    #@city = link.values.first
    list_url = link.keys.first
    grab_list_link(list_url)
  end
  logger.info "End of #{self.class} Spider..."
end
save_page(page)
click to toggle source
# File lib/rails_spider/fetchers/witar.rb, line 15
# Saves +page+ under html/<today's date>/<page URI without its scheme>.
#
# A leading http:// or https:// is stripped, along with one trailing
# slash. Previously only the text after the last 'http://' was kept, so
# https URLs leaked their scheme into the path and a URL embedding another
# URL lost its real host.
#
# Best effort: any StandardError raised while saving is logged (the
# message at error level, then a warning naming the page URI) and is not
# re-raised.
def save_page(page)
  relative_path = page.uri.to_s.sub(%r{\Ahttps?://}i, '').chomp('/')
  page.save_as("html/#{Date.today}/#{relative_path}")
rescue => e
  logger.error e.message
  logger.warn "cann't save page #{page.uri}"
end