class Scruber::QueueAdapters::Mongo

Public Instance Methods

add(url_or_page, options={}) click to toggle source

Add a page to the queue. @param url_or_page [String|Page] URL of the page, or an already built Page object @param options [Hash] Other options, see {Scruber::QueueAdapters::AbstractAdapter::Page}

@return [void]

# File lib/scruber/queue_adapters/mongo.rb, line 71
# Save a page into this queue.
#
# When given a Page, it is attached to this queue and saved with
# `{new: true}` merged with +options+ (so +options+ may override :new).
# Otherwise a Page is built from +options+ plus the given URL and saved
# with `{new: true}`.
#
# @param url_or_page [String, Page] URL of the page, or a Page object
# @param options [Hash] extra options (save options for a Page, page
#   attributes for a URL)
# @return [Object] whatever Page#save returns
def add(url_or_page, options={})
  unless url_or_page.is_a?(Page)
    page_attributes = options.merge(url: url_or_page)
    return Page.new(self, page_attributes).save({new: true})
  end

  url_or_page.queue = self
  save_options = {new: true}.merge(options)
  url_or_page.save(save_options)
end
Also aliased as: push
collection() click to toggle source

Access the Mongo collection instance

@return [Mongo::Collection] Mongo collection instance

# File lib/scruber/queue_adapters/mongo.rb, line 159
# Look up the pages collection on the shared Mongo client.
#
# @return [Object] the object returned by indexing `Scruber::Mongo.client`
#   with the pages collection name
def collection
  mongo_client = Scruber::Mongo.client
  mongo_client[pages_collection_name]
end
downloaded_count() click to toggle source

Count of downloaded pages. Used to show downloading progress.

@return [Integer] count of downloaded pages

# File lib/scruber/queue_adapters/mongo.rb, line 94
# Count the documents whose `fetched_at` is greater than zero.
#
# @return [Integer] number of matching documents, as reported by the query's #count
def downloaded_count
  downloaded_filter = {fetched_at: {"$gt" => 0}}
  collection.find(downloaded_filter).count
end
fetch_downloaded(count=nil) click to toggle source

Fetch downloaded but not yet processed pages. @param count [Integer, nil] count of pages to fetch

@return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] a single page if count is nil, or an array of pages if count is given

# File lib/scruber/queue_adapters/mongo.rb, line 112
# Fetch documents with `fetched_at > 0` and `processed_at == 0`, wrapped
# through build_pages.
#
# @param count [Integer, nil] when nil only the first match is taken;
#   otherwise the query is limited to +count+ documents
# @return [Page, Array<Page>, nil] what build_pages returns for the first
#   document (count nil) or for the array of documents (count given)
def fetch_downloaded(count=nil)
  unprocessed = collection.find({fetched_at: {"$gt" => 0}, processed_at: 0})
  documents = count.nil? ? unprocessed.first : unprocessed.limit(count).to_a
  build_pages documents
end
fetch_error(count=nil) click to toggle source

Fetch error page @param count=nil [Integer] count of pages to fetch

@return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] a single page if count is nil, or an array of pages if count is given

# File lib/scruber/queue_adapters/mongo.rb, line 138
# Fetch documents with `fetched_at == 0` whose `retry_count` has reached
# the configured `fetcher_options[:max_retry_times]`, wrapped through
# build_pages.
#
# @param count [Integer, nil] when nil only the first match is taken;
#   otherwise the query is limited to +count+ documents
# @return [Page, Array<Page>, nil] what build_pages returns for the first
#   document (count nil) or for the array of documents (count given)
def fetch_error(count=nil)
  retry_limit = ::Scruber.configuration.fetcher_options[:max_retry_times]
  exhausted = collection.find({fetched_at: 0, retry_count: {"$gte" => retry_limit}})
  documents = count.nil? ? exhausted.first : exhausted.limit(count).to_a
  build_pages documents
end
fetch_pending(count=nil) click to toggle source

Fetch pending page for fetching @param count=nil [Integer] count of pages to fetch

@return [Scruber::QueueAdapters::AbstractAdapter::Page|Array<Scruber::QueueAdapters::AbstractAdapter::Page>] a single page if count is nil, or an array of pages if count is given

# File lib/scruber/queue_adapters/mongo.rb, line 125
# Fetch documents with `fetched_at == 0`, a `retry_count` below the
# configured `fetcher_options[:max_retry_times]`, and a `retry_at` not
# later than the current time (integer seconds), wrapped through
# build_pages.
#
# @param count [Integer, nil] when nil only the first match is taken;
#   otherwise the query is limited to +count+ documents
# @return [Page, Array<Page>, nil] what build_pages returns for the first
#   document (count nil) or for the array of documents (count given)
def fetch_pending(count=nil)
  retry_limit = ::Scruber.configuration.fetcher_options[:max_retry_times]
  due_pages = collection.find({
    fetched_at: 0,
    retry_count: {"$lt" => retry_limit},
    retry_at: {"$lte" => Time.now.to_i}
  })
  documents = count.nil? ? due_pages.first : due_pages.limit(count).to_a
  build_pages documents
end
find(id) click to toggle source

Search page by id @param id [Object] id of page

@return [Page] page object

# File lib/scruber/queue_adapters/mongo.rb, line 103
# Look up a single document by its `_id` and wrap it through build_pages.
#
# @param id [Object] value matched against the document's `_id`
# @return [Page, nil] what build_pages returns for the first match
def find(id)
  document = collection.find({_id: id}).first
  build_pages document
end
has_work?() click to toggle source

Used by Core. It checks for pages that are not downloaded or not parsed yet.

@return [Boolean] true if queue still has work for scraper

# File lib/scruber/queue_adapters/mongo.rb, line 151
# Report whether fetch_pending or fetch_downloaded returns something present.
# fetch_downloaded is only consulted when fetch_pending yields nothing.
#
# @return [Boolean] true if either lookup returns a present value
def has_work?
  return true if fetch_pending.present?

  fetch_downloaded.present?
end
initialized?() click to toggle source

Check if the queue was initialized. Used for the `seed` method: if the queue was already initialized, there is no need to run the seed block.

@return [Boolean] true if queue already was initialized

# File lib/scruber/queue_adapters/mongo.rb, line 169
# Report whether the pages collection already holds at least one document.
#
# Goes through the shared #collection accessor (which resolves
# `Scruber::Mongo.client[pages_collection_name]`) rather than repeating
# that lookup here, so every method reaches the collection the same way.
#
# @return [Boolean] true if the first document of an unfiltered find is present
def initialized?
  collection.find.first.present?
end
push(url_or_page, options={})
Alias for: add
size() click to toggle source

Size of queue

@return [Integer] count of pages in queue

# File lib/scruber/queue_adapters/mongo.rb, line 85
# Number of documents in the pages collection.
#
# @return [Integer] result of calling #count on the collection
def size
  pages = collection
  pages.count
end

Private Instance Methods

build_pages(pages) click to toggle source

Wrapping mongo objects into queue Page objects

@param pages [Hash|Array<Hash>] Mongo document or array of mongo documents

@return [Page|Array<Page>|nil] a Page for a single document, an array of Pages for an array of documents, or nil when given nil

# File lib/scruber/queue_adapters/mongo.rb, line 181
# Wrap Mongo documents into Page objects bound to this queue.
#
# Each document is converted with `with_indifferent_access` and its
# string-keyed 'url' value is merged in under the `:url` key before being
# handed to Page.new.
#
# @param pages [Hash, Array<Hash>, nil] one document, an array of documents, or nil
# @return [Page, Array<Page>, nil] a Page, an array of Pages, or nil for nil input
def build_pages(pages)
  return nil if pages.nil?

  wrap = lambda do |document|
    attributes = document.with_indifferent_access.merge(url: document['url'])
    Page.new(self, attributes)
  end

  if pages.is_a?(Array)
    pages.map { |document| wrap.call(document) }
  else
    wrap.call(pages)
  end
end
pages_collection_name() click to toggle source

Generating mongo pages collection name

@return [String] name of pages collection

# File lib/scruber/queue_adapters/mongo.rb, line 195
# Build (and memoize) the name of the pages collection.
#
# The name joins, with underscores, the present values among: the
# 'collections_prefix' Mongo option, the `:scraper_name` option of this
# adapter, and the literal 'pages'.
#
# @return [String] name of the pages collection
def pages_collection_name
  @_pages_collection_name ||= begin
    prefix = Scruber::Mongo.configuration.options['collections_prefix']
    name_parts = [prefix, @options[:scraper_name], 'pages']
    name_parts.select { |part| part.present? }.map { |part| part.to_s }.join('_')
  end
end