module Crawler::Storage
Public Instance Methods
clear_path_results(path)
click to toggle source
Deletes all path data for a path
# File lib/crawler/storage.rb, line 124
# Deletes the three per-path Redis keys (assets, links-to and
# linked-to-from) belonging to +path+.
#
# @param path the path whose stored data should be removed
# @return [Array<String>] the keys that were passed to DEL
def clear_path_results(path)
  per_path_keys = [
    path_assets_key(path),
    path_links_to_key(path),
    path_linked_to_from_key(path)
  ]
  # One DEL per key, issued in the order listed above.
  per_path_keys.each { |redis_key| redis.del(redis_key) }
end
clear_stored_results()
click to toggle source
Deletes all data for a domain
# File lib/crawler/storage.rb, line 109
# Deletes all data for a domain: the stored-paths, visited-paths and
# queued-paths keys, plus the per-path keys of every stored path.
#
# The domain-level keys are deleted exactly once, outside the per-path
# loop, so they are also removed when no paths are stored (for example
# when only the queue has been populated).
#
# @return whatever +redis.pipelined+ returns
def clear_stored_results
  # Read the path list first: it lives under stored_paths_key, which is
  # deleted below, and it is needed to locate the per-path keys.
  paths = get_stored_paths
  # NOTE(review): newer redis-rb releases expect commands to be issued on the
  # object yielded by #pipelined; confirm against the pinned redis-rb version.
  redis.pipelined do
    [stored_paths_key, paths_visited_key, paths_to_visit_key].each do |key|
      redis.del key
    end
    paths.each { |path| clear_path_results(path) }
  end
end
get_domain_data()
click to toggle source
Returns paths and associated data for a domain
# File lib/crawler/storage.rb, line 90
# Builds a hash describing the domain: its hostname under 'domain' and,
# under 'paths', a mapping of each stored path to its path data.
#
# @return [Hash] { 'domain' => hostname, 'paths' => { path => path data } }
def get_domain_data
  known_paths = get_stored_paths
  domain_data = { 'domain' => base_uri.hostname, 'paths' => {} }
  known_paths.each do |known_path|
    domain_data['paths'][known_path] = get_path_data(known_path)
  end
  domain_data
end
get_path_assets(path)
click to toggle source
Returns assets for a path
# File lib/crawler/storage.rb, line 66
# Returns the members of the Redis set that holds the assets for +path+.
#
# @param path the path to look up
def get_path_assets(path)
  assets_key = path_assets_key(path)
  redis.smembers(assets_key)
end
get_path_data(path)
click to toggle source
Returns the asset dependencies, the links a path links to, and the links that link to it, for a given path
# File lib/crawler/storage.rb, line 99
# Collects everything stored for +path+ into one hash.
#
# @param path the path to look up
# @return [Hash] with the keys 'asset_dependencies', 'links_to' and
#   'linked_to_from'
def get_path_data(path)
  assets = get_path_assets(path)
  outgoing = get_path_links_to(path)
  incoming = get_path_linked_to_from(path)
  {
    'asset_dependencies' => assets,
    'links_to' => outgoing,
    'linked_to_from' => incoming
  }
end
get_path_linked_to_from(path)
click to toggle source
Returns links that link to a path
# File lib/crawler/storage.rb, line 78
# Returns the members of the Redis set of links that link to +path+.
#
# @param path the path to look up
def get_path_linked_to_from(path)
  inbound_key = path_linked_to_from_key(path)
  redis.smembers(inbound_key)
end
get_path_links_to(path)
click to toggle source
Returns links that a path links to
# File lib/crawler/storage.rb, line 72
# Returns the members of the Redis set of links that +path+ links to.
#
# @param path the path to look up
def get_path_links_to(path)
  outbound_key = path_links_to_key(path)
  redis.smembers(outbound_key)
end
get_paths_to_visit()
click to toggle source
Returns paths queued to be visited (not yet indexed) for a domain
# File lib/crawler/storage.rb, line 60
# Returns the members of the Redis set of paths queued to be visited.
def get_paths_to_visit
  redis.smembers(paths_to_visit_key)
end
get_paths_visited()
click to toggle source
Returns paths that have been indexed for a domain
# File lib/crawler/storage.rb, line 84
# Returns the members of the Redis set of paths already visited.
def get_paths_visited
  redis.smembers(paths_visited_key)
end
get_stored_paths()
click to toggle source
Returns known paths for domain
# File lib/crawler/storage.rb, line 54
# Returns the members of the Redis set of paths known for the domain.
def get_stored_paths
  redis.smembers(stored_paths_key)
end
redis()
click to toggle source
Returns the Redis client, creating and memoizing it on first use
# File lib/crawler/storage.rb, line 132
# Returns the Redis client held in @redis, creating it with Redis.new
# (no arguments) the first time it is needed.
def redis
  return @redis if @redis

  @redis = Redis.new
end
remove_path_from_queue(path)
click to toggle source
Removes a path from paths that need to be visited
# File lib/crawler/storage.rb, line 48
# Removes +path+ from the Redis set of paths queued to be visited.
#
# @param path the path to take off the queue
def remove_path_from_queue(path)
  queue_key = paths_to_visit_key
  redis.srem(queue_key, path)
end
store_path(path)
click to toggle source
Stores a path for the domain
# File lib/crawler/storage.rb, line 8
# Adds +path+ to the Redis set of paths known for the domain.
#
# @param path the path to record
def store_path(path)
  known_paths_key = stored_paths_key
  redis.sadd(known_paths_key, path)
end
store_path_assets(path, assets)
click to toggle source
Stores the static assets for a path
# File lib/crawler/storage.rb, line 14
# Adds +assets+ to the Redis set of static assets for +path+.
# Does nothing and returns nil when +assets+ is empty.
#
# @param path the path the assets belong to
# @param assets collection of assets; must respond to #empty?
def store_path_assets(path, assets)
  unless assets.empty?
    redis.sadd(path_assets_key(path), assets)
  end
end
store_path_linked_to_from(path, links)
click to toggle source
Stores paths that link to the current path
# File lib/crawler/storage.rb, line 28
# Adds +links+ to the Redis set of paths that link to +path+.
# Does nothing and returns nil when +links+ is empty.
#
# @param path the path being linked to
# @param links collection of linking paths; must respond to #empty?
def store_path_linked_to_from(path, links)
  unless links.empty?
    redis.sadd(path_linked_to_from_key(path), links)
  end
end
store_path_links_to(path, links)
click to toggle source
Stores paths that the current path links to
# File lib/crawler/storage.rb, line 21
# Adds +links+ to the Redis set of paths that +path+ links to.
# Does nothing and returns nil when +links+ is empty.
#
# @param path the path containing the links
# @param links collection of linked paths; must respond to #empty?
def store_path_links_to(path, links)
  unless links.empty?
    redis.sadd(path_links_to_key(path), links)
  end
end
store_path_visited(path)
click to toggle source
Records a path as visited (indexed) for a domain
# File lib/crawler/storage.rb, line 35
# Adds +path+ to the Redis set of paths already visited.
#
# @param path the path to mark as visited
def store_path_visited(path)
  visited_key = paths_visited_key
  redis.sadd(visited_key, path)
end
store_paths_to_visit(paths)
click to toggle source
Stores paths that need to be visited for a domain
# File lib/crawler/storage.rb, line 41
# Adds +paths+ to the Redis set of paths queued to be visited.
# Does nothing and returns nil when +paths+ is empty.
#
# @param paths collection of paths to queue; must respond to #empty?
def store_paths_to_visit(paths)
  unless paths.empty?
    redis.sadd(paths_to_visit_key, paths)
  end
end
Private Instance Methods
path_assets_key(path)
click to toggle source
# File lib/crawler/storage.rb, line 138
# Builds the Redis key for a path's assets:
# "<prefix>-path-assets-<hostname>-<path>".
#
# @param path the path the key is for
# @return [String]
def path_assets_key(path)
  namespace = prefix
  host = base_uri.hostname
  "#{namespace}-path-assets-#{host}-#{path}"
end
path_linked_to_from_key(path)
click to toggle source
# File lib/crawler/storage.rb, line 146
# Builds the Redis key for the paths linking to a path:
# "<prefix>-path-linked-to-from-<hostname>-<path>".
#
# @param path the path the key is for
# @return [String]
def path_linked_to_from_key(path)
  namespace = prefix
  host = base_uri.hostname
  "#{namespace}-path-linked-to-from-#{host}-#{path}"
end
path_links_to_key(path)
click to toggle source
# File lib/crawler/storage.rb, line 142
# Builds the Redis key for the paths a path links to:
# "<prefix>-path-links-to-<hostname>-<path>".
#
# @param path the path the key is for
# @return [String]
def path_links_to_key(path)
  namespace = prefix
  host = base_uri.hostname
  "#{namespace}-path-links-to-#{host}-#{path}"
end
paths_to_visit_key()
click to toggle source
# File lib/crawler/storage.rb, line 158
# Builds the Redis key for the queue of paths to visit:
# "<prefix>-queued-paths-<hostname>".
#
# @return [String]
def paths_to_visit_key
  namespace = prefix
  host = base_uri.hostname
  "#{namespace}-queued-paths-#{host}"
end
paths_visited_key()
click to toggle source
# File lib/crawler/storage.rb, line 154
# Builds the Redis key for the set of visited paths:
# "<prefix>-paths-visited-<hostname>".
#
# @return [String]
def paths_visited_key
  namespace = prefix
  host = base_uri.hostname
  "#{namespace}-paths-visited-#{host}"
end
prefix()
click to toggle source
# File lib/crawler/storage.rb, line 162
# Namespace string placed at the start of every Redis key built by this
# module's key helpers.
#
# @return [String] always 'crawler'
def prefix
  'crawler'
end
stored_paths_key()
click to toggle source
# File lib/crawler/storage.rb, line 150
# Builds the Redis key for the set of known paths:
# "<prefix>-paths-<hostname>".
#
# @return [String]
def stored_paths_key
  namespace = prefix
  host = base_uri.hostname
  "#{namespace}-paths-#{host}"
end