class Staticizer::Crawler

Attributes

output_dir[RW]
url_queue[R]

Public Class Methods

new(initial_page, opts = {})
# File lib/staticizer/crawler.rb, line 12
def initialize(initial_page, opts = {})
  if initial_page.nil?
    raise ArgumentError, "Initial page required"
  end

  @opts = opts.dup
  @url_queue = []
  @processed_urls = []
  @output_dir = @opts[:output_dir] || File.expand_path("crawl/")
  @log = @opts[:logger] || Logger.new(STDOUT)
  @log.level = @opts[:log_level] || Logger::INFO

  if @opts[:aws]
    bucket_name = @opts[:aws].delete(:bucket_name)
    Aws.config.update(@opts[:aws])
    @s3_bucket = Aws::S3::Resource.new.bucket(bucket_name)
  end

  if @opts[:valid_domains].nil?
    uri = URI.parse(initial_page)
    @opts[:valid_domains] = [uri.host]
  end

  if @opts[:process_body]
    @process_body = @opts[:process_body]
  end

  add_url(initial_page)
end
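
A minimal usage sketch, assuming the gem is loaded with require 'staticizer'; the option names (:output_dir, :valid_domains, :log_level) are the ones read by this constructor, while the host and paths below are placeholders:

  require 'staticizer'

  crawler = Staticizer::Crawler.new(
    "https://example.com/",
    :output_dir    => "/tmp/example_crawl",               # defaults to ./crawl
    :valid_domains => ["example.com", "www.example.com"], # defaults to the initial page's host
    :log_level     => Logger::WARN
  )
  crawler.crawl
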

Public Instance Methods

add_url(url, info = {})
# File lib/staticizer/crawler.rb, line 106
def add_url(url, info = {})
  if @opts[:filter_url]
    url = @opts[:filter_url].call(url, info)
    return if url.nil?
  else
    # Escape each domain so a "." in a host only matches a literal dot
    regex = "(#{@opts[:valid_domains].map {|d| Regexp.escape(d) }.join(")|(")})"
    return if url !~ %r{^https?://#{regex}}
  end

  url = url.sub(/#.*$/,'') # strip off any fragments
  return if @url_queue.index {|u| u[0] == url } || @processed_urls.include?(url)
  @url_queue << [url, info]
end
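
When :filter_url is supplied it replaces the valid-domains check entirely: the callback receives each candidate URL and its info hash and must return the (possibly rewritten) URL to enqueue, or nil to drop it. A hypothetical filter that restricts the crawl to a /blog subtree:

  only_blog = lambda do |url, info|
    return nil unless url =~ %r{\Ahttps?://example\.com/blog}
    url.sub(%r{/\z}, '') # normalize away trailing slashes
  end
  crawler = Staticizer::Crawler.new("https://example.com/blog", :filter_url => only_blog)
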
add_urls(urls, info = {})
# File lib/staticizer/crawler.rb, line 89
def add_urls(urls, info = {})
  urls.compact.uniq.each {|url| add_url(url, info.dup) }
end
crawl()
# File lib/staticizer/crawler.rb, line 50
def crawl
  @log.info("Starting crawl")
  while(@url_queue.length > 0)
    url, info = @url_queue.shift
    @processed_urls << url
    process_url(url, info)
  end
  @log.info("Finished crawl")
end
extract_css_urls(css, base_uri)
# File lib/staticizer/crawler.rb, line 85
def extract_css_urls(css, base_uri)
  css.scan(/url\(\s*['"]?(.+?)['"]?\s*\)/).map {|src| make_absolute(base_uri, src[0]) }
end
extract_hrefs(doc, base_uri)
# File lib/staticizer/crawler.rb, line 60
def extract_hrefs(doc, base_uri)
  doc.xpath("//a/@href").map {|href| make_absolute(base_uri, href) }
end
extract_images(doc, base_uri)
# File lib/staticizer/crawler.rb, line 64
def extract_images(doc, base_uri)
  doc.xpath("//img/@src").map {|src| make_absolute(base_uri, src) }
end
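
The extractors here all follow the same pattern - select attribute nodes out of a parsed Nokogiri document, then absolutize each one against the page's URI - and that pattern repeats for scripts and videos below. An illustrative call against a hand-built fragment (the host is a placeholder):

  doc  = Nokogiri::HTML("<a href='/about'>About</a> <img src='logo.png'>")
  base = URI.parse("https://example.com/index.html")
  crawler.extract_hrefs(doc, base)  # => ["https://example.com/about"]
  crawler.extract_images(doc, base) # => ["https://example.com/logo.png"]
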
extract_scripts(doc, base_uri)
# File lib/staticizer/crawler.rb, line 81
def extract_scripts(doc, base_uri)
  doc.xpath("//script/@src").map {|src| make_absolute(base_uri, src) }
end
extract_videos(doc, base_uri)
# File lib/staticizer/crawler.rb, line 72
def extract_videos(doc, base_uri)
  doc.xpath("//video").map do |video|
    sources = video.xpath("//source/@src").map {|src| make_absolute(base_uri, src)}
    poster = video.attributes["poster"].to_s
    make_absolute(base_uri, poster)
    [poster, sources]
  end.flatten.uniq.compact
end
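
With the relative XPath above, only sources nested inside each video tag are collected. An illustrative call, again with placeholder URLs:

  doc = Nokogiri::HTML(<<~HTML)
    <video poster="/stills/intro.jpg">
      <source src="/media/intro.mp4">
    </video>
  HTML
  crawler.extract_videos(doc, URI.parse("https://example.com/"))
  # => ["https://example.com/stills/intro.jpg", "https://example.com/media/intro.mp4"]
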
log_level()
# File lib/staticizer/crawler.rb, line 42
def log_level
  @log.level
end
log_level=(level)
# File lib/staticizer/crawler.rb, line 46
def log_level=(level)
  @log.level = level
end
make_absolute(base_uri, href)
# File lib/staticizer/crawler.rb, line 93
def make_absolute(base_uri, href)
  # Accept either a URI or a String as the base
  dup_uri = URI.parse(base_uri.to_s)
  dup_uri.query = nil
  if href.to_s =~ %r{^https?://}i
    # Already absolute - just encode any stray spaces
    href.to_s.gsub(" ", "+")
  else
    URI::join(dup_uri.to_s, href).to_s
  end
rescue StandardError => e
  @log.error "Could not make absolute #{base_uri} - #{href}"
  nil
end
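
Relative references are resolved with URI::join against the base (minus its query string); already-absolute URLs pass through with spaces encoded:

  base = URI.parse("https://example.com/a/b.html")
  crawler.make_absolute(base, "../c.png")                   # => "https://example.com/c.png"
  crawler.make_absolute(base, "https://example.com/my doc") # => "https://example.com/my+doc"
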
process_body(body, uri, opts)
# File lib/staticizer/crawler.rb, line 222
def process_body(body, uri, opts)
  if @process_body
    body = @process_body.call(body, uri, opts)
  end
  body
end
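
The :process_body option captured in the constructor lets you rewrite every body before it is written to disk or S3. A hypothetical hook that strips the site's absolute URL prefix so saved pages link relatively:

  rewriter = lambda do |body, uri, opts|
    body.gsub("https://example.com", "")
  end
  crawler = Staticizer::Crawler.new("https://example.com/", :process_body => rewriter)
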
process_redirect(url, destination_url)

If we hit a redirect, save the redirect target as a meta-refresh page so the static copy still forwards visitors. TODO: for AWS S3 hosting we could instead create a redirect.

# File lib/staticizer/crawler.rb, line 217
def process_redirect(url, destination_url)
  body = "<html><head><META http-equiv='refresh' content='0;URL=\"#{destination_url}\"'></head><body>You are being redirected to <a href='#{destination_url}'>#{destination_url}</a>.</body></html>"
  save_page(body, url)
end
process_success(response, parsed_uri)
# File lib/staticizer/crawler.rb, line 192
def process_success(response, parsed_uri)
  url = parsed_uri.to_s
  if @opts[:filter_process]
    return if @opts[:filter_process].call(response, parsed_uri)
  end
  case response['content-type']
  when /css/
    save_page(response, parsed_uri)
    add_urls(extract_css_urls(response.body, url), {:type_hint => "css_url"})
  when /html/
    save_page(response, parsed_uri)
    doc = Nokogiri::HTML(response.body)
    add_urls(extract_links(doc, url), {:type_hint => "link"})
    add_urls(extract_scripts(doc, url), {:type_hint => "script"})
    add_urls(extract_images(doc, url), {:type_hint => "image"})
    add_urls(extract_css_urls(response.body, url), {:type_hint => "css_url"})
    add_urls(extract_videos(doc, parsed_uri), {:type_hint => "video"})
    add_urls(extract_hrefs(doc, url), {:type_hint => "href"}) unless @opts[:single_page]
  else
    save_page(response, parsed_uri)
  end
end
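
The :filter_process callback runs before a successful response is saved or parsed; returning a truthy value skips the page entirely. A hypothetical size cutoff:

  too_big = lambda do |response, parsed_uri|
    response['content-length'].to_i > 5_000_000 # skip anything over ~5 MB
  end
  crawler = Staticizer::Crawler.new("https://example.com/", :filter_process => too_big)
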
process_url(url, info)

Fetch a URL, reusing an open HTTP connection where possible, and dispatch the response to the success, redirect, or error handling below

# File lib/staticizer/crawler.rb, line 230
def process_url(url, info)
  @http_connections ||= {}
  parsed_uri = URI(url)

  @log.debug "Fetching #{parsed_uri}"
  
  # Attempt to use an already open Net::HTTP connection
  key = parsed_uri.host + parsed_uri.port.to_s
  connection = @http_connections[key]
  if connection.nil?
    connection = Net::HTTP.new(parsed_uri.host, parsed_uri.port)
    connection.use_ssl = true if parsed_uri.scheme.downcase == "https"
    @http_connections[key] = connection
  end

  request = Net::HTTP::Get.new(parsed_uri.request_uri)
  begin
    connection.request(request) do |response|
      case response
      when Net::HTTPSuccess
        process_success(response, parsed_uri)
      when Net::HTTPRedirection
        redirect_url = response['location']
        @log.debug "Processing redirect to #{redirect_url}"
        process_redirect(parsed_uri, redirect_url)
        add_url(redirect_url)
      else
        @log.error "Error #{response.code}:#{response.message} fetching url #{url}"
      end
    end
  rescue OpenSSL::SSL::SSLError => e
    @log.error "SSL Error #{e.message} fetching url #{url}"
  rescue Errno::ECONNRESET => e
    @log.error "Error #{e.class}:#{e.message} fetching url #{url}"
  end
end
save_page(response, uri)
# File lib/staticizer/crawler.rb, line 120
def save_page(response, uri)
  return if @opts[:skip_write]
  if @opts[:aws]
    save_page_to_aws(response, uri)
  else
    save_page_to_disk(response, uri)
  end
end
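
Because save_page checks :skip_write first, a dry run that fetches and parses everything without writing anything is simply:

  crawler = Staticizer::Crawler.new("https://example.com/", :skip_write => true)
  crawler.crawl
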
save_page_to_aws(response, uri)
# File lib/staticizer/crawler.rb, line 173
def save_page_to_aws(response, uri)
  key = uri.path
  key += "?#{uri.query}" if uri.query
  key = key.gsub(%r{/$},"/index.html")
  key = key.gsub(%r{^/},"")
  key = "index.html" if key == ""
  # Upload this file directly to AWS::S3
  opts = {:acl => "public-read"}
  # A trailing "rescue" never fires on a missing header, so default with ||
  opts[:content_type] = response['content-type'] || "text/html"
  @log.info "Uploading #{key} to s3 with content type #{opts[:content_type]}"
  body = response.respond_to?(:read_body) ? response.read_body : response
  body = process_body(body, uri, opts)
  @s3_bucket.object(key).put(opts.merge(body: body))
end
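
The :aws hash is split in the constructor: :bucket_name is removed to build the S3 bucket handle and the remainder is passed to Aws.config.update, so any aws-sdk client option (region, credentials, and so on) can go there. A sketch with placeholder bucket and credentials:

  crawler = Staticizer::Crawler.new(
    "https://example.com/",
    :aws => {
      :bucket_name       => "my-static-site",
      :region            => "us-east-1",
      :access_key_id     => ENV["AWS_ACCESS_KEY_ID"],
      :secret_access_key => ENV["AWS_SECRET_ACCESS_KEY"]
    }
  )
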
save_page_to_disk(response, uri)
# File lib/staticizer/crawler.rb, line 129
def save_page_to_disk(response, uri)
  path = uri.path
  path += "?#{uri.query}" if uri.query

  path_segments = path.scan(%r{[^/]*/})
  filename = path.include?("/") ? path[path.rindex("/")+1..-1] : path

  current = @output_dir
  FileUtils.mkdir_p(current) unless File.exist?(current)

  # Create all the directories necessary for this file
  path_segments.each do |segment|
    current = File.join(current, "#{segment}").sub(%r{/$},'')
    if File.file?(current)
      # If we need a directory where a file of the same name already exists,
      # move the file aside with a ".d" suffix, create the directory, and
      # copy the file back in as its index.html, since the file system
      # cannot hold a file and a directory with the same name
      dirfile = current + ".d"
      FileUtils.mv(current, dirfile)
      FileUtils.mkdir(current)
      FileUtils.cp(dirfile, File.join(current, "/index.html"))
    elsif !File.exist?(current)
      FileUtils.mkdir(current)
    end
  end

  body = response.respond_to?(:read_body) ? response.read_body : response
  body = process_body(body, uri, {})
  outfile = File.join(current, "/#{filename}")
  if filename == ""
    indexfile = File.join(outfile, "/index.html")
    @log.info "Saving #{indexfile}"
    File.open(indexfile, "wb") {|f| f << body }
  elsif File.directory?(outfile)
    dirfile = outfile + ".d"
    @log.info "Saving #{dirfile}"
    File.open(dirfile, "wb") {|f| f << body }
    FileUtils.cp(dirfile, File.join(outfile, "/index.html"))
  else
    @log.info "Saving #{outfile}"
    File.open(outfile, "wb") {|f| f << body }
  end
end
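
The net effect is a URL-to-path mapping along these lines (assuming the default output directory of "crawl/"), with the ".d" shuffle above resolving collisions between a file and a directory that want the same name:

  https://example.com/            -> crawl/index.html
  https://example.com/blog        -> crawl/blog (a plain file)
  https://example.com/blog/post-1 -> crawl/blog is moved to crawl/blog.d and
                                     copied to crawl/blog/index.html, then
                                     crawl/blog/post-1 is written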