class Snapcrawl::Crawler

Attributes

url[R] — the URL that was passed to new (read-only)

Public Class Methods

new(url) click to toggle source
# File lib/snapcrawl/crawler.rb, line 9
# Builds a crawler rooted at +url+.
#
# Emits two debug log lines: the URL being used and a printable copy of
# the configuration. The copy has its 'name_template' entry replaced by
# the literal '%%{url}'; the settings object returned by Config.settings
# itself is left untouched because only a duplicate is modified.
#
# url - the value stored in @url and exposed through the +url+ reader.
def initialize(url)
  $logger.debug "initializing crawler with !txtgrn!#{url}"

  printable_settings = Config.settings.dup.tap do |settings|
    settings['name_template'] = '%%{url}'
  end
  $logger.debug "config #{printable_settings}"

  @url = url
end

Public Instance Methods

crawl() click to toggle source
# File lib/snapcrawl/crawler.rb, line 19
# Runs the crawl: verifies dependencies, seeds the queue with a Page for
# the crawler's own URL, then keeps calling process_todo until the queue
# is empty.
def crawl
  Dependencies.verify

  root_page = Page.new(url)
  todo[url] = root_page

  process_todo until todo.empty?
end

Private Instance Methods

done() click to toggle source
# File lib/snapcrawl/crawler.rb, line 94
# Lazily created list of entries that have already been handled.
# The same array instance is returned on every call.
def done
  return @done if @done

  @done = []
end
file_age(file) click to toggle source
# File lib/snapcrawl/crawler.rb, line 86
# Returns how long ago +file+ was last modified, in whole seconds.
#
# file - path handed to File.stat.
def file_age(file)
  now = Time.now
  modified_at = File.stat(file).mtime
  elapsed = now - modified_at
  elapsed.to_i
end
file_fresh?(file) click to toggle source
# File lib/snapcrawl/crawler.rb, line 82
# Tells whether +file+ can be reused instead of being regenerated.
#
# True only when Config.cache_life is positive, the file exists, and its
# age (see file_age) is below Config.cache_life; false otherwise.
def file_fresh?(file)
  return false unless Config.cache_life > 0
  return false unless File.exist?(file)

  file_age(file) < Config.cache_life
end
process_page(page) click to toggle source
# File lib/snapcrawl/crawler.rb, line 56
# Handles a single page: logs it, validates it, and makes sure a
# screenshot file exists for it.
#
# The output path is built from Config.snaps_dir and Config.name_template
# with the page URL's slug substituted for %{url}. When file_fresh?
# reports the file as fresh, no new capture is made.
#
# page - object responding to url, path, depth and valid?.
#
# Returns false when the page is invalid, true otherwise.
def process_page(page)
  path_template = "#{Config.snaps_dir}/#{Config.name_template}.png"
  slug = page.url.to_slug
  outfile = path_template % { url: slug }

  $logger.info "processing !undpur!#{page.url}!txtrst!, depth: #{page.depth}"

  unless page.valid?
    $logger.debug "page #{page.path} is invalid, aborting process"
    return false
  end

  if file_fresh?(outfile)
    $logger.info "screenshot for #{page.path} already exists"
    return true
  end

  $logger.info "!bldgrn!capturing screenshot for #{page.path}"
  save_screenshot(page, outfile)
  true
end
process_todo() click to toggle source
# File lib/snapcrawl/crawler.rb, line 27
# Takes the next entry off the queue, records its URL in +done+, and
# processes the page. Sub pages are registered only when the page was
# processed successfully and its depth is below Config.depth.
def process_todo
  $logger.debug "processing queue: !txtgrn!#{todo.count} remaining"

  current_url, current_page = todo.shift
  done.push(current_url)

  return unless process_page(current_page)
  return unless current_page.depth < Config.depth

  register_sub_pages(current_page.pages)
end
register_sub_pages(pages) click to toggle source
# File lib/snapcrawl/crawler.rb, line 38
# Adds newly discovered pages to the crawl queue.
#
# A page is skipped when:
# * its URL is already queued in +todo+ or already listed in +done+,
# * Config.url_whitelist is set and the page path does not match it,
# * Config.url_blacklist is set and the page path matches it.
#
# pages - collection of page objects responding to +url+ and +path+.
def register_sub_pages(pages)
  pages.each do |sub_page|
    # todo is keyed by URL and done stores URLs, so the duplicate check
    # must compare the page's URL rather than the page object itself;
    # otherwise it never matches and pages are queued again.
    next if todo.key?(sub_page.url) || done.include?(sub_page.url)

    if Config.url_whitelist && sub_page.path !~ /#{Config.url_whitelist}/
      $logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: whitelist"
      next
    end

    if Config.url_blacklist && sub_page.path =~ /#{Config.url_blacklist}/
      $logger.debug "ignoring !undpur!#{sub_page.url}!txtrst!, reason: blacklist"
      next
    end

    todo[sub_page.url] = sub_page
  end
end
save_screenshot(page, outfile) click to toggle source
# File lib/snapcrawl/crawler.rb, line 76
# Asks +page+ to write its screenshot to +outfile+.
#
# Any StandardError raised while saving is caught and reported through
# the logger instead of propagating, so one failing page does not stop
# the caller.
def save_screenshot(page, outfile)
  begin
    page.save_screenshot(outfile)
  rescue StandardError => error
    $logger.error "screenshot error on !undpur!#{page.path}!txtrst! - !txtred!#{error.class}!txtrst!: #{error.message}"
  end
end
todo() click to toggle source
# File lib/snapcrawl/crawler.rb, line 90
# Lazily created hash acting as the crawl queue.
# The same hash instance is returned on every call.
def todo
  return @todo if @todo

  @todo = {}
end