class RailsSpider::Mechanize

Attributes

logger[RW]
mechanize[RW]

Public Class Methods

new()
Calls superclass method RailsSpider::Fetcher::new
# File lib/rails_spider/fetchers/mechanize.rb, line 8
# Runs the superclass initializer, then builds the Mechanize agent and the
# logger this fetcher exposes through its accessors.
def initialize
  super
  @mechanize = ::Mechanize.new.tap do |agent|
    agent.open_timeout = 20
    # Use whatever parser is registered for text/html as the pluggable
    # parser default.
    parsers = agent.pluggable_parser
    parsers.default = parsers['text/html']
  end
  @logger = Logger.new STDOUT
end

Public Instance Methods

body(url)
# File lib/rails_spider/fetchers/mechanize.rb, line 20
# Fetches +url+ via #page and returns the result of searching the fetched
# page for 'body'.
def body(url)
  fetched = page(url)
  fetched.search('body')
end
change_another_proxy(proxy_hash=nil, header_hash=nil)
# File lib/rails_spider/fetchers/mechanize.rb, line 34
# Switches the Mechanize agent to another proxy and, optionally, replaces its
# request headers.
#
# @param proxy_hash [Hash, nil] proxy to use; it is honoured only when both
#   :ip and :port are present, otherwise a random entry of @proxy is picked
# @param header_hash [Hash, nil] assigned to the agent's request_headers;
#   nil leaves the current headers untouched
# @raise [ArgumentError] when no usable proxy_hash is given and @proxy is nil
#   or empty
def change_another_proxy(proxy_hash=nil, header_hash=nil)
  chosen = proxy_hash if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]

  if chosen.nil?
    # With an empty pool, rand(0) yields a Float and the lookup returns nil,
    # which used to surface as an opaque NoMethodError on nil.
    if @proxy.nil? || @proxy.empty?
      raise ArgumentError, 'no proxy given and the proxy pool is empty'
    end
    # Index-based pick (rather than #sample) keeps this working for any
    # collection that responds to #size and #[].
    chosen = @proxy[rand(@proxy.size)]
  end

  @mechanize.set_proxy chosen[:ip], chosen[:port]

  @mechanize.request_headers = header_hash unless header_hash.nil?
end
grab_update()
# File lib/rails_spider/fetchers/mechanize.rb, line 63
# Walks @newlinks, passing each entry's 'url' to grab_list_link and logging
# the start and end of the pass.
#
# An entry's 'city' overwrites @city only when it is not blank, so @city
# carries over from the previous entry otherwise.
def grab_update
  logger.info "Start #{self.class} Spider grab_update."

  @newlinks.each do |link|
    city = link['city']
    # NOTE(review): blank? is not core Ruby; presumably ActiveSupport - confirm.
    @city = city unless city.blank?
    grab_list_link(link['url'])
  end

  logger.info "End of #{self.class} Spider grab_update."
end
is_existed?(event_hash)
# File lib/rails_spider/fetchers/mechanize.rb, line 74
# Reports whether an event with the same :url is already stored.
#
# Looks up event_class by event_hash[:url]; when a record is found, a warning
# naming the url and the record id is logged.
#
# @param event_hash [Hash] read only for its :url key
# @return [Boolean] true when a matching record exists; false when :url is
#   missing or no record matches
def is_existed?(event_hash)
  url = event_hash[:url]
  return false unless url

  event = event_class.where(url: url).first
  return false unless event

  logger.warn "#{url} has been exist in #{event.id}"
  true
end
is_grab?(url)
# File lib/rails_spider/fetchers/mechanize.rb, line 48
# Asks event_class whether any record with the given url exists.
#
# @param url [Object] value matched against the url field
# @return whatever exists? returns for the url-filtered query
def is_grab?(url)
  matches = event_class.where(url: url)
  matches.exists?
end
page(url)
# File lib/rails_spider/fetchers/mechanize.rb, line 16
# Issues a GET for +url+ through the mechanize agent and returns whatever the
# agent returns.
def page(url)
  agent = mechanize
  agent.get(url)
end
run()
# File lib/rails_spider/fetchers/mechanize.rb, line 52
# Walks @links, handing the first key of each entry to grab_list_link and
# logging the start and end of the pass.
def run
  logger.info "Start #{self.class} Spider..."

  @links.each do |link|
    # Only the first key is used; the entry's value is not read here.
    list_url = link.keys.first
    grab_list_link(list_url)
  end

  logger.info "End of #{self.class} Spider..."
end
save_page(page)
# File lib/rails_spider/fetchers/witar.rb, line 15
# Saves +page+ beneath html/<today's date>/, using the page URI without its
# scheme and without a trailing slash as the relative path.
#
# Any StandardError raised while building the path or saving is logged
# (message as error, then a warning naming the URI) instead of propagating.
#
# @param page [#uri, #save_as] the fetched page to persist
# @return the result of page.save_as, or of logger.warn when saving failed
def save_page(page)
  # Strip only a leading http:// or https://. Splitting on 'http://' left
  # https URLs with their scheme in the path and, for URLs embedding another
  # 'http://', kept only the text after the last occurrence.
  relative_path = page.uri.to_s.sub(%r{\Ahttps?://}i, '').chomp('/')
  page.save_as("html/#{Date.today}/#{relative_path}")
rescue => e
  logger.error e.message
  logger.warn "cann't save page #{page.uri}"
end