class Spidercrawl::ParallelRequest

Makes parallel requests to the target website using Typhoeus and its Hydra request queue.

Attributes

urls[RW]

Public Class Methods

new(urls, options = {}) click to toggle source
# File lib/spidercrawl/request.rb, line 103
# Stores the URLs to request together with the concurrency and timeout
# settings read from +options+.
#
# urls    - the collection of URLs later iterated by the fetch step.
# options - Hash of settings; only the :threads and :timeout keys are read.
#           A key that is absent yields whatever the hash returns for a
#           missing key (nil for a plain Hash).
def initialize(urls, options = {})
  @threads, @timeout = options.values_at(:threads, :timeout)
  @urls = urls
end

Public Instance Methods

fetch() click to toggle source

Fetches the page(s) at the given URL(s) and returns them as an array of Page objects.

# File lib/spidercrawl/request.rb, line 112
# Fetch page(s) from the given url(s).
#
# Queues one Typhoeus request per entry in @urls on a Hydra whose
# :max_concurrency is @threads, runs the queue, and wraps every completed
# response in a Page. Redirects are not followed (:followlocation => false);
# for a 3xx response the Location header is stored on the Page as
# :redirect_url so the caller can decide what to do with it.
#
# Each response is classified as one of:
#   * success (response.success?) - code, headers, body, time, crawl timestamp
#   * redirect (300..308)         - code, headers, body, time, redirect_url
#   * not found (404)             - code and time only
#   * anything else               - code and time only; the return message
#                                   is logged
#
# Returns an Array of Page objects, appended in the order the completion
# callbacks fire.
def fetch
  hydra = Typhoeus::Hydra.new(:max_concurrency => @threads)
  pages = []

  @urls.each do |url|
    request = Typhoeus::Request.new(url, :timeout => @timeout, :followlocation => false, :headers => {"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Cache-Control" => "no-cache", "Pragma" => "no-cache", "User-Agent" => UserAgents.random})
    request.on_complete do |response|
      uri = URI(url)
      code = response.code
      # NOTE(review): presumably converts seconds to milliseconds - confirm
      # against the unit of Typhoeus' response time.
      elapsed = response.time * 1000

      puts "fetching #{url}".green.on_black

      # `page` is local to the callback so completions never share state.
      page =
        if response.success?
          Page.new(uri, response_code: code,
                        response_head: response.headers,
                        response_body: response.body,
                        response_time: elapsed,
                        crawled_time: (Time.now.to_f * 1000).to_i)
        elsif (300..308).include?(code)
          # The upper bound is 308 so that "308 Permanent Redirect" is
          # recorded as a redirect instead of falling through to "failed".
          location = response.headers['Location']
          puts "### #{code} ### redirect to #{location}".white.on_black
          Page.new(uri, response_code: code,
                        response_head: response.headers,
                        response_body: response.body,
                        response_time: elapsed,
                        redirect_url:  location)
        elsif code == 404
          puts "### #{code} ### not found #{url}".magenta.on_black
          Page.new(uri, response_code: code, response_time: elapsed)
        else
          puts "### #{code} ### failed #{url}".magenta.on_black
          puts "### Time: #{response.time} ### #{response.return_message}".magenta.on_black
          Page.new(uri, response_code: code, response_time: elapsed)
        end

      pages << page
    end
    hydra.queue(request)
  end

  hydra.run
  pages
end