class Spider
Public Class Methods
new()
click to toggle source
# File lib/spider.rb, line 11
# Builds a spider whose record of visited pages starts out empty.
#
# @already_visited is a Hash; the crawl methods of this class store every
# page they process in it as `url => true`.
def initialize
  @already_visited = Hash.new
end
Public Instance Methods
crawl_domain(url, page_limit = 100)
click to toggle source
# File lib/spider.rb, line 36
# Recursively crawls +url+ and every link found on it that
# +urls_on_same_domain?+ accepts, recording each crawled page in
# @already_visited.
#
# The crawl stops descending as soon as @already_visited holds
# +page_limit+ entries (or more, e.g. after an earlier crawl on the same
# spider). Pages that cannot be opened or parsed are skipped.
#
# @param url [String] page to start from
# @param page_limit [Integer] maximum number of entries allowed in
#   @already_visited before crawling stops
# @return [Array, nil] the links found on +url+, or nil when the page was
#   skipped or the limit had already been reached
def crawl_domain(url, page_limit = 100)
  # `>=` rather than `==`: the visited set may already be larger than the limit.
  return if @already_visited.size >= page_limit

  url_object = open_url(url)
  return if url_object.nil?

  parsed_doc = parse_url(url_object)
  return if parsed_doc.nil?

  @already_visited[url] = true

  find_urls_on_page(parsed_doc, url).each do |page_url|
    next unless urls_on_same_domain?(url, page_url)
    next if @already_visited.key?(page_url)

    # Forward the caller's limit; without it the recursion silently falls
    # back to the default of 100.
    crawl_domain(page_url, page_limit)
  end
end
crawl_web(urls, depth=2, page_limit = 100)
click to toggle source
# File lib/spider.rb, line 15
# Crawls the web breadth-first starting from +urls+.
#
# Each of the +depth+ passes opens every URL of the current level, records
# the URL reported by +update_url_if_redirected+ in @already_visited and
# collects the links found on the page. The next pass then processes the
# collected links that have not been visited yet.
#
# @param urls [Array<String>] URLs of the first level
# @param depth [Integer] number of levels to process
# @param page_limit [Integer] crawling stops once @already_visited holds
#   this many entries
# @return [Integer, nil] +depth+ when all passes ran, nil when the page
#   limit stopped the crawl
def crawl_web(urls, depth = 2, page_limit = 100)
  # The visited set may already be at or above the limit from earlier crawls.
  return if @already_visited.size >= page_limit

  depth.times do
    next_urls = []

    urls.each do |url|
      url_object = open_url(url)
      next if url_object.nil?

      final_url = update_url_if_redirected(url_object)
      parsed_doc = parse_url(url_object)
      next if parsed_doc.nil?

      @already_visited[final_url] = true
      # `>=` so the crawl also stops when the set has grown past the limit.
      return if @already_visited.size >= page_limit

      next_urls.concat(find_urls_on_page(parsed_doc, final_url))
    end

    # Filter once the whole level is done, so pages visited later in this
    # same pass are not crawled a second time in the next one.
    urls = next_urls.uniq - @already_visited.keys
  end
end
Private Instance Methods
find_urls_on_page(parsed_doc, current_url)
click to toggle source
# File lib/spider.rb, line 74
# Collects the link targets of all <a href> elements in +parsed_doc+.
#
# For every anchor the fragment part (everything from the first '#') is
# dropped and surrounding whitespace is stripped. Links that are empty
# afterwards (empty href or fragment-only links such as "#top", which
# point back at the current page) are skipped. Links for which
# +relative?+ is true are resolved with +make_absolute+ against
# +current_url+.
#
# @param parsed_doc [#search] parsed document; must answer +search+ with a
#   collection of elements that support ['href']
# @param current_url [String] URL of the page, used to resolve relative links
# @return [Array<String>] link targets in document order
def find_urls_on_page(parsed_doc, current_url)
  parsed_doc.search('a[@href]').each_with_object([]) do |anchor, urls_list|
    # href values are often padded with whitespace or newlines in real
    # markup; an unstripped value would produce an unusable URL.
    new_url = anchor['href'].to_s.split('#', 2).first.to_s.strip
    next if new_url.empty?

    new_url = make_absolute(current_url, new_url) if relative?(new_url)
    urls_list.push(new_url)
  end
end
open_url(url)
click to toggle source
# File lib/spider.rb, line 56
# Opens +url+ and returns the resulting IO-like object, or nil (after
# printing a message) when it cannot be opened.
#
# The URL is parsed with URI.parse and opened through the URI object
# instead of Kernel#open. The strings passed in are scraped from crawled
# pages, and Kernel#open would run a subprocess for "|command" and open
# local files for plain paths. Going through URI restricts opening to URI
# schemes that provide +open+ (http, https and ftp with open-uri).
# Assumes open-uri is already loaded by the file, which the previous
# Kernel#open call on a URL needed as well.
#
# @param url [String] absolute URL to fetch
# @return [Object, nil] the opened resource, or nil on any StandardError
def open_url(url)
  URI.parse(url).open
rescue StandardError
  # Interpolation, so a non-String url cannot raise inside the handler.
  puts "Unable to open url: #{url}"
  nil
end
parse_url(url_object)
click to toggle source
# File lib/spider.rb, line 66
# Parses an opened page with Nokogiri and announces the crawl on stdout.
#
# On success, prints "Crawling url <base_uri>" and returns the parsed
# document. If anything raises a StandardError, prints
# "Could not parse url: <base_uri>" instead and returns nil (the result
# of puts).
#
# @param url_object [#base_uri] opened page, as returned by open_url
# @return [Object, nil] whatever Nokogiri() returns, or nil on failure
def parse_url(url_object)
  Nokogiri(url_object).tap do
    puts "Crawling url #{url_object.base_uri}"
  end
rescue StandardError
  puts "Could not parse url: #{url_object.base_uri}"
end
update_url_if_redirected(url_object)
click to toggle source
# File lib/spider.rb, line 62
# Returns the string form of +url_object.base_uri+.
#
# NOTE(review): for objects returned by open-uri, base_uri is presumably
# the URI the content was finally read from, i.e. the redirect target;
# confirm that url_object always comes from open_url.
#
# @param url_object [#base_uri] opened page
# @return [String] base_uri converted with to_s
def update_url_if_redirected(url_object)
  final_location = url_object.base_uri
  final_location.to_s
end