class RailsSpider::Fetcher
Public Class Methods
new()
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 4
#
# Builds a new fetcher. The only state set here is @page, which starts
# out as an empty string.
def initialize
  @page = ''
end
Public Instance Methods
change_another_proxy(proxy_hash=nil, header_hash=nil)
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 59
#
# Points @mechanize at a proxy and optionally replaces its request headers.
#
# proxy_hash  - Hash with :ip and :port. When it is nil, or either key is
#               missing, a random entry of the @proxy pool is used instead.
# header_hash - value assigned to @mechanize.request_headers; left untouched
#               when nil.
#
# Raises RuntimeError when no explicit proxy is given and the @proxy pool is
# nil or empty (previously this surfaced as an opaque NoMethodError on nil).
def change_another_proxy(proxy_hash=nil, header_hash=nil)
  if proxy_hash && proxy_hash[:ip] && proxy_hash[:port]
    chosen = proxy_hash
  else
    chosen = Array(@proxy).sample
    raise 'No proxy available: the proxy pool is empty' if chosen.nil?
  end

  @mechanize.set_proxy chosen[:ip], chosen[:port]
  @mechanize.request_headers = header_hash unless header_hash.nil?
end
create_event(event_hash)
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 99
#
# Builds an Event from event_hash and saves it.
#
# Nothing is created (and nil is returned) when event_hash is blank or when
# is_existed?(event_hash) reports a duplicate. Otherwise:
# * status is set to -1 when :place is blank,
# * kind_id / subkind_id are resolved from :kind / :subkind when present,
# * int_id is set to the current maximum int_id plus one (1 when none exists),
# * the event is saved and the outcome is logged.
#
# EventTag rows for :tags are created only after the event saved without
# errors, so a rejected event no longer leaves tag links behind.
#
# event_hash - Hash of event attributes; the keys read here are :place,
#              :kind, :subkind and :tags (expected to respond to each).
def create_event(event_hash)
  if event_hash.blank?
    logger.warn "Cann't create event by blank data"
    return
  end

  if is_existed?(event_hash)
    logger.warn "Paramter:#{event_hash} has been existed cann't to create"
    return
  end

  event = Event.new(event_hash)
  event.status = -1 if event_hash[:place].blank?
  event.kind_id = Kind.find_or_create_by(name: event_hash[:kind]).id unless event_hash[:kind].blank?
  event.subkind_id = set_subkind_id(event_hash[:subkind]) unless event_hash[:subkind].blank?

  # Ask for the maximum once instead of issuing the same query twice.
  highest_int_id = Event.max(:int_id)
  event.int_id = highest_int_id.blank? ? 1 : highest_int_id + 1
  event.save

  if event.errors.blank?
    tag_names = event_hash[:tags]
    if tag_names
      tag_names.each do |tag_name|
        EventTag.create(event_id: event.id, tag_id: Tag.find_or_create_by(name: tag_name).id)
      end
    end
    logger.info 'Save event success'
  else
    logger.info event.errors.full_messages.join(' / ')
  end
end
event_class()
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 8
#
# Returns the class named by EventSpider.config.event_class.
#
# The constant lookup is done once and cached in @event_class; the original
# assigned the ivar on every call without ever reusing it.
#
# NOTE(review): the config lives under EventSpider while this class is in
# RailsSpider -- confirm that namespace is intentional.
def event_class
  @event_class ||= EventSpider.config.event_class.constantize
end
grab_update()
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 88
#
# Walks every entry of @newlinks: when the entry carries a non-blank 'city'
# it becomes the current @city, then the entry's 'url' is handed to
# grab_list_link. Start and end of the pass are logged.
def grab_update
  spider_name = self.class
  logger.info "Start #{spider_name} Spider grab_update."

  @newlinks.each do |entry|
    city = entry['city']
    @city = city unless city.blank?
    grab_list_link(entry['url'])
  end

  logger.info "End of #{spider_name} Spider grab_update."
end
is_existed?(event_hash)
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 128
#
# Tells whether an event with the same :url as event_hash is already stored
# in event_class. Logs a warning naming the stored record when it is.
#
# Returns true when a matching record is found, false otherwise (including
# when event_hash has no :url).
def is_existed?(event_hash)
  # Disabled checks kept from the original:
  #if event_hash[:event_id] && event_class.where(event_id: event_hash[:event_id]).first
  #  return true
  #end
  # TODO title and city are the same
  #if event_hash[:title] && event_class.where(title: event_hash[:title]).first
  #  return true
  #end
  url = event_hash[:url]
  return false unless url

  stored = event_class.where(url: url).first
  return false unless stored

  logger.warn "#{url} has been exist in #{stored.id}"
  true
end
is_grab?(url)
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 73
#
# Reports whether event_class already holds a record with the given url.
# A false result means the url has not been grabbed yet.
def is_grab?(url)
  matching = event_class.where(url: url)
  matching.exists?
end
keep_on?()
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 143
#
# Always returns true.
# NOTE(review): presumably a hook for subclasses to override in order to
# stop a crawl early -- confirm against the concrete fetchers.
def keep_on?
  true
end
page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 12
#
# Fetches url through @mechanize, switching proxy before every attempt.
#
# url         - String address to fetch.
# proxy_hash  - passed through to change_another_proxy.
# header_hash - passed through to change_another_proxy.
# repeat      - number of retries after the first failed attempt (default 5),
#               i.e. at most repeat + 1 attempts in total.
#
# Returns the page from @mechanize.get on success. When every attempt fails,
# records a FailUrl (with a source label derived from the url) and returns nil.
def page_by_url(url, proxy_hash=nil, header_hash=nil, repeat=5)
  logger.info "Grab the page #{url}"
  retries = 0

  # An explicit begin block is required: `retry` must restart only the fetch,
  # not the whole method (which would reset the retry counter).
  begin
    change_another_proxy(proxy_hash, header_hash)
    logger.info "Changed to a new proxy: #{@mechanize.proxy_addr}:#{@mechanize.proxy_port} for #{url}"
    page = @mechanize.get(url)
    logger.info "Has been get the page #{url}"
    page
  rescue => e
    logger.error e.message
    Array(e.backtrace).each { |line| error_log.error line }
    error_log.error "\n"

    if retries < repeat
      # Count first so the log reads "1 times" on the first retry, not "0".
      retries += 1
      logger.info "Retry to get page for #{retries} times"
      retry
    end

    # Url fragment => source label; the first matching fragment wins.
    known_sources = {
      'douban' => 'douban',
      'weibo' => 'weibo',
      'rockbundartmuseum' => 'waitan',
      'citymoments' => 'citymoment'
    }
    _fragment, source = known_sources.find { |fragment, _label| url.include?(fragment) }
    source ||= 'else'

    FailUrl.create(url: url, source: source, flag: "spider")
    logger.warn "Cann't grab url #{url}"
    nil
  end
end
run()
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 77
#
# Runs a full pass over @links: the first key of each entry is passed to
# grab_list_link. Start and end of the pass are logged.
def run
  spider_name = self.class
  logger.info "Start #{spider_name} Spider..."

  @links.each do |entry|
    #@city = entry.values.first
    list_url = entry.keys.first
    grab_list_link(list_url)
  end

  logger.info "End of #{spider_name} Spider..."
end
save_page(page)
click to toggle source
# File lib/rails_spider/fetchers/base.rb, line 50
#
# Saves page under html/<today>/<url without scheme and trailing slash>.
#
# Only a leading http:// or https:// is removed. The original split on
# 'http://', which left https urls untouched and cut off everything before
# an 'http://' embedded later in the url.
#
# Any error while saving is logged and swallowed (best-effort save).
#
# page - object responding to uri and save_as(path).
def save_page(page)
  relative_path = page.uri.to_s.sub(%r{\Ahttps?://}i, '').chomp('/')
  page.save_as("html/#{Date.today}/#{relative_path}")
rescue => e
  logger.error e.message
  logger.warn "cann't save page #{page.uri}"
end