module Crawler::Storage

Public Instance Methods

clear_path_results(path) click to toggle source

Deletes all path data for a path

# File lib/crawler/storage.rb, line 124
# Deletes the three per-path Redis keys for +path+: its assets key, its
# "links to" key, and its "linked to from" key. One +del+ is issued per key.
#
# Returns the list of keys that were passed to +del+.
def clear_path_results(path)
  path_keys = [
    path_assets_key(path),
    path_links_to_key(path),
    path_linked_to_from_key(path)
  ]
  path_keys.each { |path_key| redis.del(path_key) }
end
clear_stored_results() click to toggle source

Deletes all data for a domain

# File lib/crawler/storage.rb, line 109
# Deletes all stored crawl data for the domain: the per-path keys of every
# stored path, plus the domain-level stored-paths, visited-paths and
# paths-to-visit keys.
#
# The domain-level keys are deleted exactly once, outside the per-path loop,
# so they are cleared even when no paths are stored (previously they were
# only deleted from inside the loop, once per stored path).
#
# Returns whatever +redis.pipelined+ returns.
#
# NOTE(review): commands are issued on +redis+ itself rather than on an
# object yielded by +pipelined+; whether they are actually batched depends on
# the redis-rb version in use -- confirm.
def clear_stored_results
  # Read the path list before anything is deleted.
  paths = get_stored_paths
  domain_keys = [stored_paths_key, paths_visited_key, paths_to_visit_key]

  redis.pipelined do
    paths.each { |path| clear_path_results(path) }
    # Domain-level keys go last so the path index is the final thing removed.
    domain_keys.each { |key| redis.del key }
  end
end
get_domain_data() click to toggle source

Returns paths and associated data for a domain

# File lib/crawler/storage.rb, line 90
# Builds a hash describing the whole domain:
#   'domain' => base_uri.hostname
#   'paths'  => { path => get_path_data(path) } for every stored path
def get_domain_data
  get_stored_paths.each_with_object({ 'domain' => base_uri.hostname, 'paths' => {} }) do |path, domain_data|
    domain_data['paths'][path] = get_path_data(path)
  end
end
get_path_assets(path) click to toggle source

Returns assets for a path

# File lib/crawler/storage.rb, line 66
# Returns the members of the Redis set stored under the assets key for +path+.
def get_path_assets(path)
  redis.smembers(path_assets_key(path))
end
get_path_data(path) click to toggle source

Returns the asset dependencies, the paths linked to, and the paths linking from, for a given path

# File lib/crawler/storage.rb, line 99
# Collects everything stored for +path+ into a single hash with the keys
# 'asset_dependencies', 'links_to' and 'linked_to_from', filled from
# get_path_assets, get_path_links_to and get_path_linked_to_from respectively.
def get_path_data(path)
  assets = get_path_assets(path)
  outgoing = get_path_links_to(path)
  incoming = get_path_linked_to_from(path)

  { 'asset_dependencies' => assets, 'links_to' => outgoing, 'linked_to_from' => incoming }
end
get_path_linked_to_from(path) click to toggle source

Returns the paths that link to a given path

# File lib/crawler/storage.rb, line 78
# Returns the members of the Redis set stored under the "linked to from" key
# for +path+.
def get_path_linked_to_from(path)
  redis.smembers(path_linked_to_from_key(path))
end
get_paths_to_visit() click to toggle source

Returns paths that haven't been indexed

# File lib/crawler/storage.rb, line 60
# Returns the members of the Redis set stored under paths_to_visit_key.
def get_paths_to_visit
  redis.smembers(paths_to_visit_key)
end
get_paths_visited() click to toggle source

Returns paths that have been indexed for a domain

# File lib/crawler/storage.rb, line 84
# Returns the members of the Redis set stored under paths_visited_key.
def get_paths_visited
  redis.smembers(paths_visited_key)
end
get_stored_paths() click to toggle source

Returns known paths for domain

# File lib/crawler/storage.rb, line 54
# Returns the members of the Redis set stored under stored_paths_key.
def get_stored_paths
  redis.smembers(stored_paths_key)
end
redis() click to toggle source

Returns the memoized Redis client

# File lib/crawler/storage.rb, line 132
# Returns the Redis client, creating it with +Redis.new+ (no arguments) on
# first use and caching it in @redis for later calls.
def redis
  return @redis if @redis

  @redis = Redis.new
end
remove_path_from_queue(path) click to toggle source

Removes a path from paths that need to be visited

# File lib/crawler/storage.rb, line 48
# Removes +path+ from the Redis set stored under paths_to_visit_key and
# returns the result of the +srem+ call.
def remove_path_from_queue(path)
  redis.srem(paths_to_visit_key, path)
end
store_path(path) click to toggle source

Stores a path for the domain

# File lib/crawler/storage.rb, line 8
# Adds +path+ to the Redis set stored under stored_paths_key and returns the
# result of the +sadd+ call.
def store_path(path)
  redis.sadd(stored_paths_key, path)
end
store_path_assets(path, assets) click to toggle source

Stores the static assets for a path

# File lib/crawler/storage.rb, line 14
# Adds +assets+ to the Redis set stored under the assets key for +path+.
# Does nothing and returns nil when +assets+ is empty; otherwise returns the
# result of the +sadd+ call.
def store_path_assets(path, assets)
  redis.sadd(path_assets_key(path), assets) unless assets.empty?
end
store_path_linked_to_from(path, links) click to toggle source

Stores paths that link to the current path

# File lib/crawler/storage.rb, line 28
# Adds +links+ to the Redis set stored under the "linked to from" key for
# +path+. Does nothing and returns nil when +links+ is empty; otherwise
# returns the result of the +sadd+ call.
def store_path_linked_to_from(path, links)
  redis.sadd(path_linked_to_from_key(path), links) unless links.empty?
end
store_path_visited(path) click to toggle source

Records a path as having been indexed for a domain

# File lib/crawler/storage.rb, line 35
# Adds +path+ to the Redis set stored under paths_visited_key and returns the
# result of the +sadd+ call.
def store_path_visited(path)
  redis.sadd(paths_visited_key, path)
end
store_paths_to_visit(paths) click to toggle source

Stores paths that need to be visited for a domain

# File lib/crawler/storage.rb, line 41
# Adds +paths+ to the Redis set stored under paths_to_visit_key. Does nothing
# and returns nil when +paths+ is empty; otherwise returns the result of the
# +sadd+ call.
def store_paths_to_visit(paths)
  redis.sadd(paths_to_visit_key, paths) unless paths.empty?
end

Private Instance Methods

path_assets_key(path) click to toggle source
# File lib/crawler/storage.rb, line 138
# Builds the Redis key for the assets of +path+:
# "<prefix>-path-assets-<hostname>-<path>".
def path_assets_key(path)
  format('%s-path-assets-%s-%s', prefix, base_uri.hostname, path)
end
path_linked_to_from_key(path) click to toggle source
# File lib/crawler/storage.rb, line 146
# Builds the Redis key for the "linked to from" data of +path+:
# "<prefix>-path-linked-to-from-<hostname>-<path>".
def path_linked_to_from_key(path)
  format('%s-path-linked-to-from-%s-%s', prefix, base_uri.hostname, path)
end
paths_to_visit_key() click to toggle source
# File lib/crawler/storage.rb, line 158
# Builds the Redis key for the paths still to be visited:
# "<prefix>-queued-paths-<hostname>".
def paths_to_visit_key
  format('%s-queued-paths-%s', prefix, base_uri.hostname)
end
paths_visited_key() click to toggle source
# File lib/crawler/storage.rb, line 154
# Builds the Redis key for the paths already visited:
# "<prefix>-paths-visited-<hostname>".
def paths_visited_key
  format('%s-paths-visited-%s', prefix, base_uri.hostname)
end
prefix() click to toggle source
# File lib/crawler/storage.rb, line 162
# Returns the fixed string that the *_key helpers interpolate at the start of
# every Redis key they build.
def prefix
  'crawler'
end
stored_paths_key() click to toggle source
# File lib/crawler/storage.rb, line 150
# Builds the Redis key for the domain's stored paths:
# "<prefix>-paths-<hostname>".
def stored_paths_key
  format('%s-paths-%s', prefix, base_uri.hostname)
end