class Scruber::QueueAdapters::Mongo
Public Instance Methods
Add page to queue @param url [String] URL of page @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}
@return [void]
# File lib/scruber/queue_adapters/mongo.rb, line 71
# Add a page to the queue. Accepts either an existing Page object or a URL
# string plus attribute options.
#
# @param url_or_page [Page|String] page object, or URL of the page to enqueue
# @param options [Hash] other page attributes, see {Scruber::QueueAdapters::AbstractAdapter::Page}
# @return [void]
def add(url_or_page, options={})
  unless url_or_page.is_a?(Page)
    # Build a fresh Page from the URL and persist it as new.
    return Page.new(self, options.merge(url: url_or_page)).save({new: true})
  end
  # Re-attach the given page to this queue before saving.
  url_or_page.queue = self
  url_or_page.save({new: true}.merge(options))
end
Accessing to mongo collection instance
@return [Mongo::Collection] Mongo
collection instance
# File lib/scruber/queue_adapters/mongo.rb, line 159
# Access the underlying MongoDB pages collection for this queue.
#
# @return [Mongo::Collection] Mongo collection instance
def collection
  client = Scruber::Mongo.client
  client[pages_collection_name]
end
Count of downloaded pages. Used to show download progress.
@return [Integer] count of downloaded pages
# File lib/scruber/queue_adapters/mongo.rb, line 94
# Count of downloaded pages (fetched_at > 0). Used to report download progress.
#
# @return [Integer] number of downloaded pages
def downloaded_count
  downloaded = {fetched_at: {"$gt" => 0}}
  collection.find(downloaded).count
end
Fetch pages that are downloaded but not yet processed, for parsing @param count=nil [Integer] count of pages to fetch
@return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] a single page if count is nil, or an array of pages if count > 0
# File lib/scruber/queue_adapters/mongo.rb, line 112
# Fetch pages that are downloaded (fetched_at > 0) but not yet processed
# (processed_at == 0).
#
# @param count [Integer, nil] number of pages to fetch; nil for a single page
# @return [Page|Array<Page>|nil] one page if count is nil, otherwise an array
def fetch_downloaded(count=nil)
  query = {fetched_at: {"$gt" => 0}, processed_at: 0}
  cursor = collection.find(query)
  docs = count.nil? ? cursor.first : cursor.limit(count).to_a
  build_pages docs
end
Fetch pages that failed to download (retry limit exceeded) @param count=nil [Integer] count of pages to fetch
@return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] a single page if count is nil, or an array of pages if count > 0
# File lib/scruber/queue_adapters/mongo.rb, line 138
# Fetch pages that failed to download: never fetched (fetched_at == 0) and
# already at or past the configured retry limit.
#
# @param count [Integer, nil] number of pages to fetch; nil for a single page
# @return [Page|Array<Page>|nil] one page if count is nil, otherwise an array
def fetch_error(count=nil)
  # Hoisted so the configured limit is read once per call.
  max_retries = ::Scruber.configuration.fetcher_options[:max_retry_times]
  cursor = collection.find({fetched_at: 0, retry_count: {"$gte" => max_retries}})
  docs = count.nil? ? cursor.first : cursor.limit(count).to_a
  build_pages docs
end
Fetch pending page for fetching @param count=nil [Integer] count of pages to fetch
@return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] a single page if count is nil, or an array of pages if count > 0
# File lib/scruber/queue_adapters/mongo.rb, line 125
# Fetch pages still pending download: never fetched (fetched_at == 0), under
# the retry limit, and whose retry_at time has already passed.
#
# @param count [Integer, nil] number of pages to fetch; nil for a single page
# @return [Page|Array<Page>|nil] one page if count is nil, otherwise an array
def fetch_pending(count=nil)
  # Build the query once instead of duplicating it per branch.
  max_retries = ::Scruber.configuration.fetcher_options[:max_retry_times]
  query = {
    fetched_at: 0,
    retry_count: {"$lt" => max_retries},
    retry_at: {"$lte" => Time.now.to_i}
  }
  cursor = collection.find(query)
  docs = count.nil? ? cursor.first : cursor.limit(count).to_a
  build_pages docs
end
Search page by id @param id [Object] id of page
@return [Page] page object
# File lib/scruber/queue_adapters/mongo.rb, line 103
# Look up a single page by its Mongo _id.
#
# @param id [Object] id of the page
# @return [Page|nil] the wrapped page, or nil when not found
def find(id)
  document = collection.find({_id: id}).first
  build_pages document
end
Used by Core. It checks for pages that are not downloaded or not parsed yet.
@return [Boolean] true if queue still has work for scraper
# File lib/scruber/queue_adapters/mongo.rb, line 151
# Whether the queue still has work for the scraper: pages awaiting download
# or downloaded pages awaiting processing.
#
# @return [Boolean] true if there is still work to do
def has_work?
  # Check pending first; short-circuits so the second query only runs
  # when no pending page exists (same as the original `||`).
  return true if fetch_pending.present?
  fetch_downloaded.present?
end
Check if the queue was initialized. Used by the `seed` method: if the queue was already initialized, there is no need to run the seed block.
@return [Boolean] true if queue already was initialized
# File lib/scruber/queue_adapters/mongo.rb, line 169
# Whether the queue has ever been populated. Used by `seed`: when the queue
# is already initialized the seed block is skipped.
#
# @return [Boolean] true if the pages collection contains any document
def initialized?
  first_document = Scruber::Mongo.client[pages_collection_name].find.first
  first_document.present?
end
Size of queue
@return [Integer] count of pages in queue
# File lib/scruber/queue_adapters/mongo.rb, line 85
# Total size of the queue.
#
# @return [Integer] count of pages in the collection
def size
  collection.count
end
Private Instance Methods
Wrapping mongo objects into queue Page
objects
@param pages [Hash|Array<Hash>] Mongo
document or array of mongo documents
@return [Page|Array<Page>|nil] wrapped page object(s), or nil when the input is nil
# File lib/scruber/queue_adapters/mongo.rb, line 181
# Wrap raw Mongo documents into queue Page objects.
#
# @param pages [Hash|Array<Hash>|nil] a Mongo document, an array of documents, or nil
# @return [Page|Array<Page>|nil] wrapped page(s), or nil when input is nil
def build_pages(pages)
  # `case/when` dispatches via ===, which matches is_a? for Array,
  # so this is equivalent to the if/elsif chain it replaces.
  case pages
  when nil
    nil
  when Array
    pages.map { |doc| Page.new(self, doc.with_indifferent_access.merge(url: doc['url'])) }
  else
    Page.new(self, pages.with_indifferent_access.merge(url: pages['url']))
  end
end
Generating mongo pages collection name
@return [String] name of pages collection
# File lib/scruber/queue_adapters/mongo.rb, line 195
# Generate (and memoize) the Mongo pages collection name from the configured
# prefix and the scraper name, e.g. "prefix_scraper_pages".
#
# @return [String] name of the pages collection
def pages_collection_name
  @_pages_collection_name ||= begin
    parts = [
      Scruber::Mongo.configuration.options['collections_prefix'],
      @options[:scraper_name],
      'pages'
    ]
    # Drop blank segments before joining so empty prefixes leave no "__".
    parts.select(&:present?).map(&:to_s).join('_')
  end
end