class Crawlette::Crawler
Constants
- BadUrlError
- MAX_THREADS
Public Class Methods
new(url, sitemap = {})
click to toggle source
# File lib/crawlette/crawler.rb, line 10
# Builds a crawler rooted at +url+.
#
# url     - String handed to URI.parse; becomes the first pending URI.
# sitemap - Hash the crawl results are stored in (a new empty Hash by default).
#
# Raises BadUrlError when the parsed URL has no host or no scheme.
def initialize(url, sitemap = {})
  @uri = URI.parse(url)
  @pending_uris = [@uri]
  @sitemap = sitemap

  # A usable start URL needs both parts, e.g. "http://example.com".
  return if @uri.host && @uri.scheme

  raise BadUrlError, "Invalid url: You must provide a full qualified url"
end
Public Instance Methods
crawl()
click to toggle source
Crawl a web page and generate a sitemap that must also contain:
-
Links between pages.
-
Which static assets each page depends on.
Example:
Crawlette::Crawler.new('http://example.com/').crawl # => {
'http://example.com/' => { 'assets' => ['http://example.com/image1.png', 'http://example.com/script1.js', 'http://example.com/stylesheet1.css'], 'links' => ['http://example.com/watch-a-demo', 'http://example.com/features'], }, 'http://example.com/watch-a-demo' => { 'assets' => ['http://example.com/image2.png', 'http://example.com/script2.js', 'http://example.com/stylesheet2.css'], 'links' => ['http://example.com/whatever1', 'http://example.com/whatever2'], }, 'http://example.com/features' => { 'assets' => ['http://example.com/image3.png', 'http://example.com/script3.js', 'http://example.com/stylesheet3.css'], 'links' => ['http://example.com/features/api', 'http://example.com/features/pricing'], }, 'http://example.com/features/api' => { ... }, 'http://example.com/features/pricing' => { ... },
}
# File lib/crawlette/crawler.rb, line 49
# Drains the pending-URI list in batches and returns the sitemap.
#
# Each pass pops up to MAX_THREADS URIs off the end of @pending_uris, hands
# every one of them to process_uri on its own thread, and waits for the whole
# batch to finish before starting the next pass.
#
# Returns the @sitemap Hash.
def crawl
  until @pending_uris.empty?
    batch = @pending_uris.pop(MAX_THREADS)
    workers = batch.map do |uri|
      Thread.new { process_uri(uri) }
    end
    workers.each(&:join)
  end

  @sitemap
end
Private Instance Methods
process_uri(uri)
click to toggle source
# File lib/crawlette/crawler.rb, line 65
# Fetches one page and records its links and assets in the sitemap.
#
# Nothing is fetched when @sitemap already holds a truthy entry for the URI.
# Otherwise the response body is wrapped in a Page, every link the page
# reports is parsed and appended to @pending_uris, and the sitemap entry is
# stored under uri.to_s.
#
# uri - URI of the page to fetch.
#
# Any StandardError raised along the way is reported on stdout and swallowed;
# in that case no sitemap entry is written for the URI.
def process_uri(uri)
  @sitemap[uri.to_s] ||= begin
    puts "... Fetching #{uri}"
    page = Page.new(Net::HTTP.get(uri), uri)
    more_uris = page.links.map { |url| URI.parse(url) }
    @pending_uris.push(*more_uris)
    { 'links' => page.links, 'assets' => page.assets }
  end
rescue => e
  # Name the URI that actually failed, not the root URI the crawl started from.
  puts "ERROR! Cannot fetch #{uri}: #{e.message}"
end