class Spidercrawl::SpiderWorker

Crawls pages starting from a seed URL, following internal links and invoking user-registered callbacks around each fetch.

Attributes

page[R]

Public Class Methods

new(url, options = {})
# File lib/spidercrawl/spider_worker.rb, line 11
def initialize(url, options = {})
  @url = url
  @delay = options[:delay] || 0       # politeness delay between requests (default 0 seconds)
  @threads = options[:threads] || 10  # worker threads (default 10)
  @timeout = options[:timeout] || 20  # request timeout (default 20 seconds)
  @allow_redirections = options[:allow_redirections]
  @max_pages = options[:max_pages]
  @pattern = options[:pattern]        # optional Regexp; only matching URLs are crawled
  @setup = nil                        # before_fetch block
  @teardown = nil                     # after_fetch block
  @redirect = nil                     # on_redirect block
  @success = nil                      # on_success block
  @failure = nil                      # on_failure block
end
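
A construction sketch (the require path and the option values are illustrative assumptions, not taken from the gem):

require 'spidercrawl'

worker = Spidercrawl::SpiderWorker.new(
  'http://example.com/',
  :delay     => 1,                          # wait 1 second between requests
  :threads   => 5,                          # 5 worker threads
  :timeout   => 10,                         # give up on a request after 10 seconds
  :pattern   => %r{\Ahttp://example\.com/}, # only crawl URLs matching this Regexp
  :max_pages => 100
)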

Public Instance Methods

after_fetch(&block)

Registers a block to run after each fetched page; the block receives the crawled Page.

# File lib/spidercrawl/spider_worker.rb, line 166
def after_fetch(&block)
  @teardown = block if block
end
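
For example, an after_fetch block can log each crawled page (a sketch; the Page accessors are assumed from the setup_page keywords shown below):

worker.after_fetch do |page|
  puts "crawled #{page.url} in #{page.response_time} ms"
end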
before_fetch(&block)

Registers a block to run before each fetch; the block receives the URL and may return a response, in which case the worker builds the page from it instead of fetching over the network.

# File lib/spidercrawl/spider_worker.rb, line 159
def before_fetch(&block)
  @setup = block if block
end
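
As crawl shows, a truthy return value from this block is handed to setup_page in place of an HTTP fetch, so before_fetch can double as a cache lookup. A minimal sketch, assuming a hypothetical cache hash of url => response:

cache = {} # url => previously fetched response object (hypothetical store)

worker.before_fetch do |url|
  cache[url] # non-nil skips the HTTP fetch; nil fetches the URL normally
end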
crawl()
# File lib/spidercrawl/spider_worker.rb, line 27
def crawl
  link_queue = Queue.new
  pages, visited_links = [], []
  link_queue << @url

  spider_worker = Request.new(@url, :threads => @threads, :timeout => @timeout)

  begin
    url = link_queue.pop
    next if visited_links.include?(url) || (@pattern && url !~ @pattern)

    # Let the before_fetch block supply a response (e.g. from a cache);
    # fall back to an HTTP fetch when it returns nil.
    start_time = Time.now
    response = @setup.yield url unless @setup.nil?
    end_time = Time.now

    spider_worker.uri = URI.parse(url)
    page = response ? setup_page(URI.parse(url), response, ((end_time - start_time) * 1000).to_i) : spider_worker.curl
    visited_links << url

    if page.success? || page.redirect?
      # Follow the redirect chain until we reach a final page or a location already visited.
      while page.redirect?
        puts ("### redirect to #{page.location}" + (visited_links.include?(page.location) ? " which we have already visited!" : "")).white.on_black
        break if visited_links.include?(page.location)

        start_time = Time.now
        response = @redirect.yield page.location unless @redirect.nil?
        end_time = Time.now

        spider_worker.uri = URI.parse(page.location)
        page = response ? setup_page(URI.parse(page.location), response, ((end_time - start_time) * 1000).to_i) : spider_worker.curl
        visited_links << page.url
      end
      unless visited_links.include?(page.location)
        pages << page unless page.content == ""
        # Queue up unvisited internal links, honoring the URL pattern if one is set.
        page.internal_links.each do |link|
          next if visited_links.include?(link)
          link_queue << link if @pattern.nil? || link =~ @pattern
        end unless page.internal_links.nil?
        @teardown.yield page unless @teardown.nil?
        sleep @delay
      end
    elsif page.not_found?
      puts "page not found"
    end
  end until link_queue.empty?
  pages
end
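
Putting it together (a sketch under the assumptions above):

pages = worker.crawl
pages.each { |page| puts page.url }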
on_failure(&block)

Registers a block to handle failed fetches.

# File lib/spidercrawl/spider_worker.rb, line 187
def on_failure(&block)
  @failure = block if block
end
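
A registration sketch; note that the source listed on this page stores @failure but never yields it, so the block's argument is an assumption:

worker.on_failure do |page|
  warn "failed to fetch #{page.url}"
end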
on_redirect(&block)

Registers a block to run when a fetch is redirected; the block receives the redirect location and may return a response for it.

# File lib/spidercrawl/spider_worker.rb, line 173
def on_redirect(&block)
  @redirect = block if block
end
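
Like before_fetch, a truthy return value is used in place of fetching the new location; returning nil lets the worker fetch it itself. A sketch:

worker.on_redirect do |location|
  puts "following redirect to #{location}"
  nil # let the worker fetch the new location
end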
on_success(&block)

Registers a block to handle successful fetches.

# File lib/spidercrawl/spider_worker.rb, line 180
def on_success(&block)
  @success = block if block
end
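
As with on_failure, the listed source stores @success without yielding it, so this registration sketch assumes the block would receive the fetched page:

worker.on_success do |page|
  puts "fetched #{page.url} (#{page.response_code})"
end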
parallel_crawl()
# File lib/spidercrawl/spider_worker.rb, line 77
def parallel_crawl
  link_queue = Queue.new
  pages, visited_links = [], []
  link_queue << @url

  spider_workers = ParallelRequest.new([@url], :threads => @threads, :timeout => @timeout)

  begin
    # Drain the queue: pages answered by the before_fetch block are built
    # immediately; everything else is batched for a parallel fetch.
    urls = []
    while !link_queue.empty?
      url = link_queue.pop
      next if visited_links.include?(url) || (@pattern && url !~ @pattern)
      visited_links << url

      start_time = Time.now
      response = @setup.yield url unless @setup.nil?
      end_time = Time.now

      if response
        pages << (page = setup_page(URI.parse(url), response, ((end_time - start_time) * 1000).to_i))
        @teardown.yield page unless @teardown.nil?

        # Queue up unvisited internal links for crawling.
        page.internal_links.each do |link|
          next if visited_links.include?(link)
          link_queue << link if @pattern.nil? || link =~ @pattern
        end unless page.internal_links.nil?
      else # queue up the url for the parallel fetch
        urls << url
        puts "queue: #{url}"
      end
    end

    spider_workers.urls = urls
    responses = spider_workers.fetch

    responses.each do |page|
      if (503..504).include?(page.response_code)
        link_queue << page.url # server busy or gateway timeout: retry later
      elsif page.success? || page.redirect?
        response = nil
        if page.redirect?
          puts ("### redirect to #{page.location}" + (visited_links.include?(page.location) ? " which we have already visited!" : "")).white.on_black
          unless visited_links.include?(page.location) || (@pattern && page.location !~ @pattern)
            start_time = Time.now
            response = @redirect.yield page.location unless @redirect.nil?
            end_time = Time.now

            if response
              page = setup_page(URI.parse(page.location), response, ((end_time - start_time) * 1000).to_i)
              visited_links << page.url
            else
              puts "queue: #{page.location}"
              link_queue << page.location
            end
          else
            puts "discard: #{page.location}"
          end
        end
        if page.success? || response
          pages << page unless page.content == ""
          # Queue up unvisited internal links for crawling.
          page.internal_links.each do |link|
            next if visited_links.include?(link)
            link_queue << link if @pattern.nil? || link =~ @pattern
          end unless page.internal_links.nil?
          page.crawled_time = (Time.now.to_f * 1000).to_i
          @teardown.yield page unless @teardown.nil?
        end
      elsif page.not_found?
        puts "page not found"
      end
    end
  end until link_queue.empty?
  pages
end
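
parallel_crawl batches the queued URLs and fetches them concurrently through ParallelRequest, re-queuing any page that answered 503 or 504. Usage mirrors crawl (a sketch):

worker = Spidercrawl::SpiderWorker.new('http://example.com/', :threads => 10)
pages  = worker.parallel_crawl
puts "crawled #{pages.size} pages"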

Private Instance Methods

setup_page(uri, response, response_time)

Builds a Page from the given response and timing information.

# File lib/spidercrawl/spider_worker.rb, line 195
def setup_page(uri, response, response_time)
  Page.new(uri, response_code: response.code.to_i,
                response_head: response.instance_variable_get("@header"),
                response_body: response.body,
                response_time: response_time,
                crawled_time: (Time.now.to_f * 1000).to_i)
end
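
Because setup_page reads response.code, response.body, and the @header instance variable, any response returned from before_fetch or on_redirect must quack like a Net::HTTPResponse. A hypothetical minimal stand-in:

# Hypothetical stand-in satisfying setup_page's expectations.
CachedResponse = Struct.new(:code, :body) do
  def initialize(code, body, header = {})
    super(code, body)
    @header = header # read via instance_variable_get("@header")
  end
end

response = CachedResponse.new('200', '<html>...</html>', { 'content-type' => ['text/html'] })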