class Spidercrawl::Request

Makes HTTP requests to the targeted website, using either net/http or libcurl.

Attributes

uri[RW]

The parsed URI of the target url.

Public Class Methods

new(url, options = {})
# File lib/spidercrawl/request.rb, line 14
def initialize(url, options = {})
  @uri = URI.parse(url)
  @threads = options[:threads]
  @timeout = options[:timeout]

  # Net::HTTP.new ignores a block, so the timeouts are set directly on the instance
  @http = Net::HTTP.new(@uri.host, @uri.port)
  @http.open_timeout = @timeout # in seconds
  @http.read_timeout = @timeout # in seconds

  @c = Curl::Easy.new(@uri.to_s) do |curl|
    curl.headers['User-Agent'] = UserAgents.random
  end
end
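
A minimal usage sketch, assuming the gem's entry point is required as 'spidercrawl' and that :timeout and :threads are the only option keys consumed by the constructor:

require 'spidercrawl'

request = Spidercrawl::Request.new('http://example.com/', timeout: 10, threads: 4)
page    = request.fetch   # or request.curl to go through libcurl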

Public Instance Methods

curl()

Fetch a page from the given url using libcurl. Returns a Page built from the response; response_time is given in milliseconds.

# File lib/spidercrawl/request.rb, line 32
def curl
  puts "fetching #{@uri.to_s}".green.on_black
  start_time = Time.now
  begin
    c = @c
    c.url = @uri.to_s
    c.perform
    end_time = Time.now
    case c.response_code
    when 200 then
      page = Page.new(@uri, response_code: c.response_code,
                            response_head: c.header_str,
                            response_body: c.body_str,
                            response_time: ((end_time-start_time)*1000).round,
                            crawled_time: (Time.now.to_f*1000).to_i)
    when 300..307 then
      page = Page.new(@uri, response_code: c.response_code,
                            response_head: c.header_str,
                            response_body: c.body_str,
                            response_time: ((end_time-start_time)*1000).round,
                            redirect_url:  c.redirect_url)
    when 404 then
      page = Page.new(@uri, response_code: c.response_code,
                            response_time: ((end_time-start_time)*1000).round)
    end
  rescue StandardError => e
    puts e.inspect
    puts e.backtrace
  end
end
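
A sketch of inspecting the Page returned by curl, assuming Page exposes response_code, response_time and redirect_url as readers matching the options passed to Page.new above:

request = Spidercrawl::Request.new('http://example.com/', timeout: 10)
page    = request.curl
case page && page.response_code
when 200
  puts "fetched in #{page.response_time} ms"
when 300..307
  puts "redirected to #{page.redirect_url}"
end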
fetch()

Fetch a page from the given url using net/http. Returns a Page built from the response; note that response_time here is given in seconds, not milliseconds.

# File lib/spidercrawl/request.rb, line 66
def fetch
  puts "fetching #{@uri.to_s}".green.on_black
  start_time = Time.now
  begin
    request = Net::HTTP::Get.new(@uri.request_uri)
    request["User-Agent"] = UserAgents.random
    response = @http.request(request)
    end_time = Time.now
    case response
    when Net::HTTPSuccess then
      page = Page.new(@uri, response_code: response.code.to_i,
                            response_head: response.instance_variable_get("@header"),
                            response_body: response.body,
                            response_time: (end_time-start_time).to_f,
                            crawled_time: (Time.now.to_f*1000).to_i)
    when Net::HTTPRedirection then
      page = Page.new(@uri, response_code: response.code.to_i,
                            response_head: response.instance_variable_get("@header"),
                            response_body: response.body,
                            response_time: (end_time-start_time).to_f,
                            redirect_url:  response['location'])
    when Net::HTTPNotFound then
      page = Page.new(@uri, response_code: response.code.to_i,
                            response_time: (end_time-start_time).to_f)
    end
  rescue StandardError => e
    puts e.inspect
    puts e.backtrace
  end
end
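
A sketch of following a single redirect reported by fetch, again assuming Page exposes redirect_url as a reader; the redirect is followed by building a fresh Request for the new location:

page = Spidercrawl::Request.new('http://example.com/old', timeout: 10).fetch
page = Spidercrawl::Request.new(page.redirect_url, timeout: 10).fetch if page && page.redirect_url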