class Spider

Public Class Methods

new() click to toggle source
# File lib/spider.rb, line 11
# Builds a spider whose record of visited pages starts out empty.
def initialize
  # Hash used as a set: the crawl methods store `url => true` entries here.
  @already_visited = {}
end

Public Instance Methods

crawl_domain(url, page_limit = 100) click to toggle source
# File lib/spider.rb, line 36
# Recursively crawls pages reachable from +url+, following only links that
# urls_on_same_domain? accepts and that are not yet in @already_visited.
#
# A page is recorded in @already_visited once it has been opened and parsed
# successfully; pages that fail either step are skipped silently (the helpers
# print their own message and return nil).
#
# url        - String URL of the page to crawl.
# page_limit - Integer cap on the number of recorded pages (default: 100).
#
# The method is called for its side effects; the return value is not
# meaningful.
def crawl_domain(url, page_limit = 100)
  # `>=` rather than `==`: the record may already hold more entries than the
  # limit (e.g. after an earlier crawl on the same instance).
  return if @already_visited.size >= page_limit

  url_object = open_url(url)
  return if url_object.nil?

  parsed_doc = parse_url(url_object)
  return if parsed_doc.nil?

  @already_visited[url] = true
  find_urls_on_page(parsed_doc, url).each do |page_url|
    next unless urls_on_same_domain?(url, page_url)
    next if @already_visited.key?(page_url)

    # Forward the caller's limit. Without it every nested call would fall
    # back to the default of 100 and a custom limit would be ignored.
    crawl_domain(page_url, page_limit)
  end
end
crawl_web(urls, depth=2, page_limit = 100) click to toggle source
# File lib/spider.rb, line 15
# Crawls outward from +urls+ level by level, for at most +depth+ levels.
#
# Every page that opens and parses successfully is recorded in
# @already_visited under the URL reported by update_url_if_redirected. The
# links found on the pages of one level (minus those already recorded) become
# the URLs of the next level.
#
# urls       - Array of String URLs to start from.
# depth      - Integer number of levels to crawl (default: 2).
# page_limit - Integer cap on the number of recorded pages (default: 100).
#
# The method is called for its side effects; the return value is not
# meaningful.
def crawl_web(urls, depth = 2, page_limit = 100)
  depth.times do
    next_urls = []
    urls.each do |url|
      # Skip pages that were already crawled: duplicates in the input, or
      # links queued before their target was visited on this same level.
      next if @already_visited.key?(url)

      url_object = open_url(url)
      next if url_object.nil?

      final_url = update_url_if_redirected(url_object)
      parsed_doc = parse_url(url_object)
      next if parsed_doc.nil?

      @already_visited[final_url] = true
      # `>=` rather than `==`: the record may already hold more entries than
      # the limit, in which case an equality test would never fire.
      return if @already_visited.size >= page_limit

      # Array union keeps first-seen order and drops duplicates.
      next_urls |= find_urls_on_page(parsed_doc, final_url) - @already_visited.keys
    end
    urls = next_urls
  end
end

Private Instance Methods

find_urls_on_page(parsed_doc, current_url) click to toggle source
# File lib/spider.rb, line 74
# Collects the link targets of every <a href> element in +parsed_doc+.
#
# The fragment part (everything from the first '#') is dropped from each
# href; an href that is empty yields nothing. Targets that relative? reports
# as relative are resolved against +current_url+ with make_absolute.
#
# parsed_doc  - document object responding to #search.
# current_url - String URL of the page the document came from.
#
# Returns an Array of URL strings in document order (duplicates are kept).
def find_urls_on_page(parsed_doc, current_url)
  links = []
  parsed_doc.search('a[@href]').each do |anchor|
    target = anchor['href'].split('#').first
    next unless target

    target = make_absolute(current_url, target) if relative?(target)
    links << target
  end
  links
end
open_url(url) click to toggle source
# File lib/spider.rb, line 56
# Opens +url+ for reading and returns the resulting IO-like object.
#
# The URL is parsed with URI and opened through the URI object instead of
# Kernel#open. Kernel#open treats a string starting with "|" as a command to
# run and opens plain strings as local files, which is unsafe for hrefs
# scraped from arbitrary pages. On Ruby 3.0+ it also no longer fetches URLs.
#
# NOTE(review): relies on open-uri being loaded elsewhere in this file (the
# callers' use of `base_uri` implies it is) — confirm.
#
# url - String URL to fetch.
#
# Returns the opened object, or nil (after printing a message) when the URL
# is malformed, is not an openable URL type, or the fetch fails.
def open_url(url)
  URI.parse(url).open
rescue StandardError
  # Interpolation keeps the handler itself from raising when url is not a
  # String (e.g. nil).
  puts "Unable to open url: #{url}"
  nil
end
parse_url(url_object) click to toggle source
# File lib/spider.rb, line 66
# Parses an opened page with Nokogiri and logs which URL is being crawled.
#
# url_object - opened page object; must respond to #base_uri.
#
# Returns the parsed document, or nil (after printing a message) when
# parsing raises.
def parse_url(url_object)
  Nokogiri(url_object).tap do
    # Logged only once parsing has succeeded.
    puts "Crawling url #{url_object.base_uri}"
  end
rescue StandardError
  puts "Could not parse url: #{url_object.base_uri}"
end
update_url_if_redirected(url_object) click to toggle source
# File lib/spider.rb, line 62
# Returns the base URI of an opened page as a String.
#
# NOTE(review): presumably this is the final URL after any redirects when
# url_object comes from open-uri — confirm.
#
# url_object - opened page object; must respond to #base_uri.
def update_url_if_redirected(url_object)
  resolved_uri = url_object.base_uri
  resolved_uri.to_s
end