class Tango::App
Tango application
@author Mckomo
Attributes
Public Class Methods
@param config [Hash] Application settings (the 'target_url' and 'sleep' keys are read)
@param link_stack [Tango::LinkStack]
@param dispatcher [Tango::ETL::Dispatcher]
@param cache [Tango::Resource::Cache]
@param http_client [Object] Must implement get method
@param parser [Object] Must implement parse method
@param db_locker [DatabaseLocker]
@param logger [Logger]
@return [Tango::App]
# File lib/tango/app.rb, line 20
# Build the application and wire up its collaborators. Every dependency can
# be injected; a collaborator that is not given falls back to the default
# constructed below.
#
# @param config [Hash] Application settings ('target_url' is read here)
# @param link_stack [Object, nil] Defaults to LinkStack.new( config['target_url'] )
# @param dispatcher [Object, nil] Defaults to ETL::Dispatcher.new
# @param cache [Object, nil] Defaults to a Resource::Cache wrapping a new Resource::Buffer
# @param http_client [Object, nil] Must implement get method; defaults to HTTParty
# @param parser [Object, nil] Must implement parse method; defaults to Nokogiri::HTML
# @param db_locker [Object, nil] Defaults to DatabaseLocker.new( Multidb.databases )
# @param logger [Object, nil] Defaults to Logger.new( STDOUT )
def initialize( config: {}, link_stack: nil, dispatcher: nil, cache: nil,
                http_client: nil, parser: nil, db_locker: nil, logger: nil )
  # Set config
  @config = config

  # Set dependencies
  @link_stack  = link_stack  || LinkStack.new( config['target_url'] )
  @dispatcher  = dispatcher  || ETL::Dispatcher.new
  @cache       = cache       || Resource::Cache.new( Resource::Buffer.new )
  @http_client = http_client || HTTParty
  @parser      = parser      || Nokogiri::HTML
  @db_locker   = db_locker   || DatabaseLocker.new( Multidb.databases )
  @logger      = logger      || Logger.new( STDOUT )

  # Registries are plain lists that register_model / register_operator
  # append to. They are initialised exactly once, as arrays.
  @models    = []
  @operators = []
end
Public Instance Methods
Filter run after Tango execution
# File lib/tango/app.rb, line 48
# Hook with an empty default body; meant to be overridden with work that
# should happen after Tango execution.
#
# @return [nil]
def after
  nil
end
Filter run before Tango execution
# File lib/tango/app.rb, line 44
# Hook with an empty default body; meant to be overridden with work that
# should happen before Tango execution.
#
# @return [nil]
def before
  nil
end
Register a new handler with the dispatcher
@param handler [Tango::ETL::HandlerInterface] @return [Array]
# File lib/tango/app.rb, line 71
# Hand a handler over to the dispatcher.
#
# @param handler [Object] Handler to register
# @return [Object] Whatever the dispatcher's register method returns
def register_handler( handler )
  @dispatcher.register( handler )
end
Register a new model
@param model [Class] Model class (the application calls persistent?, all and table_name on it) @return [Array]
# File lib/tango/app.rb, line 55
# Append a model to the list of models known to the application.
#
# @param model [Object] Model to register
# @return [Array] The list of registered models, including the new one
def register_model( model )
  @models << model
end
Register a new resource operator
@param operator [Tango::ETL::OperatorInterface] @return [Array]
# File lib/tango/app.rb, line 63
# Append a resource operator to the list of operators known to the
# application.
#
# @param operator [Object] Operator to register
# @return [Array] The list of registered operators, including the new one
def register_operator( operator )
  @operators << operator
end
Run the ETL process
Takes no arguments; works with the link stack, dispatcher, cache, HTTP client, parser, database locker and logger passed to the constructor.
@return [Nil]
# File lib/tango/app.rb, line 84
# Run one complete Tango iteration: pick a database, truncate non persistent
# models, load the cache, run the before filter, crawl every link on the
# stack, release buffers, run the after filter, lock the database that was
# used and close the logger.
#
# Takes no arguments; all collaborators are the instance variables set by the
# constructor.
#
# @return [Object] Result of closing the logger; not meant to be relied upon
def run
  # Save beginning time
  start_time = Time.now

  @logger.info "Running Tango v.#{Tango::VERSION} ..."
  @logger.info "Target: #{@link_stack.host}."

  # Resolve the unlocked database exactly once, so that the database that is
  # picked, the one named in the log and the one locked at the end are
  # guaranteed to be the same.
  database = @db_locker.unlocked
  pick_database( database )
  @logger.info "Using database '#{database}'."

  @logger.info "Truncating non persistent models ..."
  truncate_tables( non_persistent_models )

  # Load cache for persistent models
  @logger.info "Loading cache ..."
  setup_cache( @operators )
  load_cache( persistent_models )

  # Run before filter
  @logger.info "Running before callback ..."
  before

  # Init counter of crawled links
  links_counter = 0

  @logger.info "Tango starts crawling ..."

  # Start crawling website
  while @link_stack.has_links?
    # Get a link from the stack
    link = @link_stack.shift

    # Skip iteration if no handler found
    handler_klass = @dispatcher.find_handler( link )
    unless handler_klass
      @logger.error "No handler for link: #{link}."
      next
    end

    url = @link_stack.host + link

    # Try to get contents of the link
    begin
      response = @http_client.get( url )
    rescue StandardError => e
      @logger.error "Could not download contents of #{url} link."
      @logger.error e.message
      next
    end

    # Continue only when response has code 200 or 201
    unless [ 200, 201 ].include?( response.code )
      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      next
    end

    # Parse response contents
    document = @parser.parse( response.body )

    # Init handler
    handler = handler_klass.new( link, document, @cache )

    # Append links fetched from handler
    @link_stack.append( handler.links )

    # Try to fire the handler; a failing handler is logged and does not stop
    # the crawl
    begin
      handler.trigger
    rescue StandardError => e
      @logger.error "Link: #{link}. Handler had some troubles."
      @logger.error e.message
      @logger.error e.backtrace.join( "\n" )
    else
      links_counter += 1
      @logger.debug "Link: #{link}. Handler triggered successfully."
    end

    # Sleep to give crawled server time to breathe. Iterations skipped with
    # `next` above do not reach this line.
    sleep( @config["sleep"] || 0 )
  end

  # Release buffers
  @logger.info "Releasing buffers ..."
  release_buffer( @cache.buffer )

  # Run after filter
  @logger.info "Running after callback ..."
  after

  # Lock database used in this Tango iteration
  lock_database( database )

  # Get time of script execution ending
  end_time = Time.now

  @logger.info "Tango crawled #{links_counter}/#{@link_stack.shifted} links successfully."
  @logger.info "Start time: #{start_time}, end time: #{end_time}, time elapsed: #{end_time - start_time} seconds."

  # Close logger
  @logger.close
end
Private Instance Methods
Load cache for given models
@param models [Array] Models whose records should be put into the cache @return [Array] The given models
# File lib/tango/app.rb, line 243
# Put every record of every given model into the cache, keyed by the model's
# symbol as computed by Tango::Kernel.symbolize.
#
# @param models [Array] Models responding to all
# @return [Array] The given models (result of each)
def load_cache( models )
  models.each do |model|
    key = Tango::Kernel.symbolize( model )
    model.all.each { |record| @cache.set( key, record ) }
  end
end
Lock database (e.g. the one used in this Tango run)
@param database [String|Symbol] @return [String|Symbol]
# File lib/tango/app.rb, line 201
# Ask the database locker to lock the given database.
#
# @param database [String|Symbol] Database to lock
# @return [Object] Whatever the locker's lock method returns
def lock_database( database )
  @db_locker.lock( database )
end
Fetch the list of non-persistent models registered with the application
@return [Array]
# File lib/tango/app.rb, line 215
# Registered models whose persistent? check is falsy.
#
# @return [Array]
def non_persistent_models
  @models.reject( &:persistent? )
end
Fetch the list of persistent models registered with the application
@return [Array]
# File lib/tango/app.rb, line 208
# Registered models whose persistent? check is truthy.
#
# @return [Array]
def persistent_models
  @models.select( &:persistent? )
end
Pick database (e.g. the one to be used in this Tango run)
@param database [String|Symbol] @return [String|Symbol]
# File lib/tango/app.rb, line 193
# Switch Multidb to the given database.
#
# @param database [String|Symbol] Database to use
# @return [Object] Whatever Multidb.use returns
def pick_database( database )
  Multidb.use( database )
end
Release given buffer
@param buffer [Tango::Resource::Buffer] @return [Nil]
# File lib/tango/app.rb, line 256
# Tell the given buffer to release everything it holds.
#
# @param buffer [Object] Buffer responding to release_all
# @return [Object] Whatever the buffer's release_all method returns
def release_buffer( buffer )
  buffer.release_all
end
Register cache with resource operators
@param operators [Array] @return [Array]
# File lib/tango/app.rb, line 232
# Register each operator with the cache under the operator's symbol (as
# computed by Tango::Kernel.symbolize). The block given to the cache passes
# the resource it receives to the operator's load method.
#
# @param operators [Array] Operators responding to load
# @return [Array] The given operators (result of each)
def setup_cache( operators )
  operators.each do |operator|
    key = Tango::Kernel.symbolize( operator )
    @cache.register( key ) { |resource| operator.load( resource ) }
  end
end
Truncate table of given models
@param models [Array] Models whose tables should be truncated @return [Array] The given models
# File lib/tango/app.rb, line 222
# Issue a TRUNCATE statement for the table of every given model through the
# ActiveRecord::Base connection.
#
# @param models [Array] Models responding to table_name
# @return [Array] The given models (result of each)
def truncate_tables( models )
  models.each do |model|
    statement = "TRUNCATE #{model.table_name}"
    # The connection is looked up per model, so an empty list never touches it
    ActiveRecord::Base.connection.execute( statement )
  end
end