class Snapcrawl::Crawler
Attributes
url[R]
Public Class Methods
new(url)
click to toggle source
# File lib/snapcrawl/crawler.rb, line 9
# Sets up a crawler for +url+.
#
# Emits two debug log lines (the starting URL and a printable copy of the
# settings) and then stores +url+ in @url.
#
# url - the URL the crawl starts from.
def initialize(url)
  $logger.debug "initializing crawler with !txtgrn!#{url}"

  # The override is applied to a shallow copy, not to the object returned by
  # Config.settings.
  # NOTE(review): the doubled percent sign presumably stops the logger from
  # treating %{url} as a format placeholder -- confirm against the logger.
  display_settings = Config.settings.dup.tap do |settings|
    settings['name_template'] = '%%{url}'
  end
  $logger.debug "config #{display_settings}"

  @url = url
end
Public Instance Methods
crawl()
click to toggle source
# File lib/snapcrawl/crawler.rb, line 19
# Runs the crawl: verifies dependencies, queues a Page for the starting
# URL, then keeps calling +process_todo+ until the queue is empty.
def crawl
  Dependencies.verify
  todo[url] = Page.new url

  # process_todo removes one entry per call and may add more, so the queue
  # is re-checked before every iteration.
  process_todo until todo.empty?
end
Private Instance Methods
done()
click to toggle source
# File lib/snapcrawl/crawler.rb, line 94
# Returns the list held in @done, creating it as an empty Array on first
# use. The same Array object is returned on every later call.
def done
  return @done if @done

  @done = []
end
file_age(file)
click to toggle source
# File lib/snapcrawl/crawler.rb, line 86
# Returns how long ago +file+ was last modified, in whole seconds.
#
# file - path of the file to inspect.
#
# Returns an Integer (the fractional part is dropped by +to_i+).
def file_age(file)
  modified_at = File.mtime(file)
  elapsed = Time.now - modified_at
  elapsed.to_i
end
file_fresh?(file)
click to toggle source
# File lib/snapcrawl/crawler.rb, line 82
# Tells whether +file+ can be reused instead of being regenerated.
#
# file - path of the file to check.
#
# Returns true only when Config.cache_life is greater than zero, the file
# exists, and its age (see +file_age+) is below Config.cache_life;
# otherwise false.
def file_fresh?(file)
  return false unless Config.cache_life > 0
  return false unless File.exist?(file)

  file_age(file) < Config.cache_life
end
process_page(page)
click to toggle source
# File lib/snapcrawl/crawler.rb, line 56
# Handles a single page: logs it, validates it, and captures a screenshot
# unless a fresh one is already on disk.
#
# page - object responding to +url+, +depth+, +path+ and +valid?+.
#
# Returns false when the page is invalid, true otherwise. Note that true is
# returned after +save_screenshot+ regardless of what that call returns.
def process_page(page)
  # Output path is "<snaps_dir>/<name_template>.png" with the %{url}
  # placeholder filled in from the slugged page URL.
  template = "#{Config.snaps_dir}/#{Config.name_template}.png"
  outfile = format(template, url: page.url.to_slug)

  $logger.info "processing !undpur!#{page.url}!txtrst!, depth: #{page.depth}"

  unless page.valid?
    $logger.debug "page #{page.path} is invalid, aborting process"
    return false
  end

  if file_fresh?(outfile)
    $logger.info "screenshot for #{page.path} already exists"
    return true
  end

  $logger.info "!bldgrn!capturing screenshot for #{page.path}"
  save_screenshot(page, outfile)
  true
end
process_todo()
click to toggle source
# File lib/snapcrawl/crawler.rb, line 27
# Takes the next entry off the queue and processes it.
#
# The entry's URL is appended to +done+ before the page is processed. When
# +process_page+ succeeds and the page's depth is below Config.depth, the
# page's sub pages are handed to +register_sub_pages+.
#
# Returns the result of +register_sub_pages+, or nil when it was not called.
def process_todo
  $logger.debug "processing queue: !txtgrn!#{todo.count} remaining"

  current_url, page = todo.shift
  done << current_url

  return unless process_page(page)
  return unless page.depth < Config.depth

  register_sub_pages(page.pages)
end
register_sub_pages(pages)
click to toggle source
# File lib/snapcrawl/crawler.rb, line 38
# Adds to the queue every page in +pages+ that has not been seen before and
# that passes the configured URL filters.
#
# pages - enumerable of page objects responding to +url+ and +path+.
#
# A page is skipped when:
# * its URL is already a key in +todo+ or an element of +done+,
# * Config.url_whitelist is set and the page path does not match it, or
# * Config.url_blacklist is set and the page path matches it.
#
# Returns +pages+ (the result of +each+).
def register_sub_pages(pages)
  pages.each do |sub_page|
    # +todo+ is keyed by URL (see the assignment at the bottom of this loop)
    # and +done+ is filled with URLs by process_todo, so the lookup must use
    # the URL. Looking up the page object itself never matched, which let
    # finished pages be queued again and queued entries be overwritten.
    sub_url = sub_page.url
    next if todo.key?(sub_url) || done.include?(sub_url)

    # Both filters are interpolated into a regular expression and tested
    # against the page path.
    if Config.url_whitelist && sub_page.path !~ /#{Config.url_whitelist}/
      $logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: whitelist"
      next
    end

    if Config.url_blacklist && sub_page.path =~ /#{Config.url_blacklist}/
      $logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: blacklist"
      next
    end

    todo[sub_url] = sub_page
  end
end
save_screenshot(page, outfile)
click to toggle source
# File lib/snapcrawl/crawler.rb, line 76
# Asks +page+ to write its screenshot to +outfile+, logging instead of
# raising when that fails.
#
# page    - object responding to +save_screenshot+ and +path+.
# outfile - destination path passed through to the page.
#
# Returns whatever +page.save_screenshot+ returns; if a StandardError is
# raised, it is reported through $logger.error and that call's result is
# returned instead.
def save_screenshot(page, outfile)
  begin
    page.save_screenshot(outfile)
  rescue StandardError => error
    $logger.error "screenshot error on !undpur!#{page.path}!txtrst! - !txtred!#{error.class}!txtrst!: #{error.message}"
  end
end
todo()
click to toggle source
# File lib/snapcrawl/crawler.rb, line 90
# Returns the queue held in @todo, creating it as an empty Hash on first
# use. The same Hash object is returned on every later call.
def todo
  return @todo if @todo

  @todo = {}
end