class Spidercrawl::ParallelRequest
Makes parallel requests to the targeted website using Typhoeus and its Hydra request queue.
Attributes
urls[RW]
Public Class Methods
new(urls, options = {})
click to toggle source
# File lib/spidercrawl/request.rb, line 103
#
# Creates a parallel request for the given url(s).
#
# urls    - the collection of urls to request; stored untouched in @urls
#           and iterated by #fetch.
# options - optional settings hash:
#           :threads - stored in @threads; #fetch uses it as the hydra's
#                      :max_concurrency.
#           :timeout - stored in @timeout; #fetch passes it as the
#                      :timeout of every request.
#
# Keys that are absent from options leave the matching ivar set to nil.
def initialize(urls, options = {})
  @urls = urls
  @threads, @timeout = options.values_at(:threads, :timeout)
end
Public Instance Methods
fetch()
click to toggle source
Fetch page(s) from the given url(s)
# File lib/spidercrawl/request.rb, line 112
#
# Fetch page(s) from the given url(s).
#
# Queues one Typhoeus request per entry in @urls on a single hydra, runs
# the hydra, and collects one Page per completed request. Pages are
# appended from inside the completion callbacks, so the result follows the
# order in which callbacks fire rather than the order of @urls.
#
# Returns an Array of Page objects.
def fetch
  # Forward :max_concurrency only when a thread count was configured, so an
  # unset option falls back to Typhoeus' own default instead of handing the
  # hydra an explicit nil.
  # NOTE(review): Typhoeus appears to coerce this option with Integer(),
  # which raises on nil -- confirm against the Typhoeus version in use.
  hydra_options = @threads ? { :max_concurrency => @threads } : {}
  hydra = Typhoeus::Hydra.new(hydra_options)
  pages = []

  @urls.each do |url|
    request = build_request(url)
    request.on_complete { |response| pages << page_from_response(url, response) }
    hydra.queue(request)
  end

  hydra.run
  pages
end

# Builds the Typhoeus request for a single url. Redirects are not followed
# by the HTTP client (:followlocation => false); #page_from_response
# records the redirect target on the Page instead.
def build_request(url)
  Typhoeus::Request.new(url,
                        :timeout => @timeout,
                        :followlocation => false,
                        :headers => {
                          "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                          "Cache-Control" => "no-cache",
                          "Pragma" => "no-cache",
                          "User-Agent" => UserAgents.random
                        })
end

# Logs the outcome of a completed request to stdout and converts the
# response into a Page.
#
# url      - the url that was requested.
# response - the completed response handed to the on_complete callback.
#
# Returns a Page. Only successful responses carry head, body and
# crawled_time; responses with a code in 300..307 carry head, body and the
# Location header as redirect_url; everything else carries just the
# response code and time.
def page_from_response(url, response)
  uri = URI(url)
  puts "fetching #{url}".green.on_black

  code = response.code
  # response.time is multiplied by 1000 before being stored on the Page.
  elapsed = response.time * 1000

  if response.success?
    Page.new(uri, response_code: code,
                  response_head: response.headers,
                  response_body: response.body,
                  response_time: elapsed,
                  crawled_time: (Time.now.to_f * 1000).to_i)
  elsif (300..307).include?(code)
    # NOTE(review): 308 is outside this range and is reported through the
    # generic failure branch below -- confirm whether that is intended.
    location = response.headers['Location']
    puts "### #{code} ### redirect to #{location}".white.on_black
    Page.new(uri, response_code: code,
                  response_head: response.headers,
                  response_body: response.body,
                  response_time: elapsed,
                  redirect_url: location)
  elsif code == 404
    puts "### #{code} ### not found #{url}".magenta.on_black
    Page.new(uri, response_code: code, response_time: elapsed)
  else
    puts "### #{code} ### failed #{url}".magenta.on_black
    puts "### Time: #{response.time} ### #{response.return_message}".magenta.on_black
    Page.new(uri, response_code: code, response_time: elapsed)
  end
end

private :build_request, :page_from_response