class Crawlette::Crawler

Constants

BadUrlError
MAX_THREADS

Public Class Methods

new(url, sitemap = {}) click to toggle source
# File lib/crawlette/crawler.rb, line 10
# Builds a crawler rooted at +url+.
#
# url     - String handed to URI.parse; the parsed result must carry both
#           a scheme and a host.
# sitemap - Hash that accumulates crawl results (a new empty Hash by default).
#
# Raises BadUrlError when the parsed URL lacks a scheme or a host.
def initialize(url, sitemap = {})
  root = URI.parse(url)
  @uri = root
  @sitemap = sitemap
  # The crawl queue starts with just the root page.
  @pending_uris = [root]
  return if root.scheme && root.host

  raise BadUrlError, "Invalid url: You must provide a full qualified url"
end

Public Instance Methods

crawl() click to toggle source

Crawl a web page and generate a sitemap that must also contain:

  • Links between pages.

  • Which static assets each page depends on.

Example:

Crawlette::Crawler.new('http://example.com/').crawl # => {

'http://example.com/' => {
  'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js', 'http://example.com/stylesheet1.css'],
  'links' => ['http://example.com/watch-a-demo', 'http://example.com/features'],
},
'http://example.com/watch-a-demo' => {
  'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js', 'http://example.com/stylesheet2.css'],
  'links' => ['http://example.com/whatever1', 'http://example.com/whatever2'],
},
'http://example.com/features' => {
  'assets' => ['http://example.com/image3.png', 'http://example.com/script3.js', 'http://example.com/stylesheet3.css'],
  'links' => ['http://example.com/features/api', 'http://example.com/features/pricing'],
},
'http://example.com/features/api' => {
  ...
},
'http://example.com/features/pricing' => {
  ...
},

}

# File lib/crawlette/crawler.rb, line 49
# Crawls every pending URI in batches of at most MAX_THREADS concurrent
# threads until the queue is drained.
#
# Each batch is joined before the next one starts, so links pushed onto the
# queue by process_uri are picked up by a later pass.
#
# Returns the sitemap Hash.
def crawl
  until @pending_uris.empty?
    # Drop URIs that already have a sitemap entry and collapse duplicates
    # (keyed by to_s, like the sitemap itself). Without this, the same URL
    # appearing twice in one batch is fetched by two threads at once, because
    # the "already crawled?" check in process_uri races with the fetch.
    @pending_uris.reject! { |uri| @sitemap[uri.to_s] }
    @pending_uris.uniq!(&:to_s)

    workers = @pending_uris.pop(MAX_THREADS).map do |uri|
      Thread.new { process_uri(uri) }
    end
    workers.each(&:join)
  end
  @sitemap
end

Private Instance Methods

process_uri(uri) click to toggle source
# File lib/crawlette/crawler.rb, line 65
# Fetches +uri+ unless the sitemap already has an entry for it, records the
# page's links and assets under uri.to_s, and queues every link for crawling.
#
# uri - URI of the page to fetch (passed to Net::HTTP.get).
#
# Returns the sitemap entry for +uri+ (the existing one when already present),
# or nil when fetching or parsing raised a StandardError; the failure is
# reported on stdout and no sitemap entry is stored.
def process_uri(uri)
  @sitemap[uri.to_s] ||= begin
    puts "... Fetching #{uri}"
    page = Page.new(Net::HTTP.get(uri), uri)
    # Parse all links before touching the queue: if any link fails to parse,
    # nothing from this page has been queued yet.
    more_uris = page.links.map { |url| URI.parse(url) }
    @pending_uris.push(*more_uris)
    { 'links' => page.links, 'assets' => page.assets }
  end
rescue => e
  # Report the URI that actually failed, not the crawl's root (@uri).
  puts "ERROR! Cannot fetch #{uri}: #{e.message}"
end