class Tango::App

Tango application

@author Mckomo

Attributes

config[R]
dispatcher[R]
logger[R]

Public Class Methods

new( config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil ) click to toggle source

@param config [Hash] Application settings @param link_stack [Tango::LinkStack] @param dispatcher [Tango::ETL::Dispatcher] @param cache [Tango::Resource::Cache] @param http_client [Object] Must implement get method @param parser [Object] Must implement parse method @param db_locker [DatabaseLocker] @param logger [Logger] @return [Tango::App]

# File lib/tango/app.rb, line 20
# Build a Tango application. Every dependency that is not injected (left as
# nil) is replaced with a default collaborator.
#
# @param config [Hash] application settings; 'target_url' is read here when
#   the default link stack has to be built
# @param link_stack [Object, nil] defaults to LinkStack.new( config['target_url'] )
# @param dispatcher [Object, nil] defaults to ETL::Dispatcher.new
# @param cache [Object, nil] defaults to Resource::Cache.new( Resource::Buffer.new )
# @param http_client [Object, nil] defaults to HTTParty
# @param parser [Object, nil] defaults to Nokogiri::HTML
# @param db_locker [Object, nil] defaults to DatabaseLocker.new( Multidb.databases )
# @param logger [Object, nil] defaults to Logger.new( STDOUT )
def initialize( config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil )

  # Set config
  @config = config

  # Set dependencies
  @link_stack = link_stack || LinkStack.new( config['target_url'] )
  @dispatcher = dispatcher || ETL::Dispatcher.new
  @cache = cache || Resource::Cache.new( Resource::Buffer.new )
  @http_client = http_client || HTTParty
  @parser = parser || Nokogiri::HTML
  @db_locker = db_locker || DatabaseLocker.new( Multidb.databases )
  @logger = logger || Logger.new( STDOUT )

  # Registries start out as empty arrays; the register_* methods append to
  # them. (They used to be initialised as hashes first and then overwritten.)
  @models = []
  @operators = []

end

Public Instance Methods

after() click to toggle source

Filter run after Tango execution

# File lib/tango/app.rb, line 48
# Callback invoked by #run after the crawl loop has finished and the cache
# buffer has been released. It does nothing here and returns nil.
# NOTE(review): presumably meant to be overridden by concrete apps - confirm.
def after
  nil
end
before() click to toggle source

Filter run before Tango execution

# File lib/tango/app.rb, line 44
# Callback invoked by #run after the cache has been loaded and before the
# crawl loop starts. It does nothing here and returns nil.
# NOTE(review): presumably meant to be overridden by concrete apps - confirm.
def before
  nil
end
register_handler( handler ) click to toggle source

Register a new handler with the dispatcher

@param handler [Tango::ETL::HandlerInterface] @return [Array]

# File lib/tango/app.rb, line 71
# Hand a handler over to the dispatcher so it can later be matched to links.
#
# @param handler [Tango::ETL::HandlerInterface]
# @return [Object] whatever the dispatcher's #register returns
def register_handler( handler )
  target = @dispatcher
  target.register( handler )
end
register_model( model ) click to toggle source

Register a new model

@param model [Symbol] @return [Array]

# File lib/tango/app.rb, line 55
# Add a model to the list of models known to the application.
#
# @param model [Object] the model to register
# @return [Array] the list of registered models, including the new one
def register_model( model )
  @models.push( model )
end
register_operator( operator ) click to toggle source

Register a new resource operator

@param operator [Tango::ETL::OperatorInterface] @return [Array]

# File lib/tango/app.rb, line 63
# Add a resource operator to the list of operators known to the application.
#
# @param operator [Tango::ETL::OperatorInterface]
# @return [Array] the list of registered operators, including the new one
def register_operator( operator )
  @operators.push( operator )
end
run() click to toggle source

Run ETL process

Takes no arguments; it works with the link stack, dispatcher, cache, HTTP client, parser, database locker and logger supplied to the constructor. @return [Nil]

# File lib/tango/app.rb, line 84
# Run the ETL process.
#
# Steps, in order: pick the unlocked database, truncate the tables of the
# non persistent models, register operators with the cache and load the
# persistent models into it, call #before, crawl every link on the link
# stack, release the cache buffer, call #after, lock the database that was
# used and close the logger.
#
# A link is skipped (and an error logged) when the dispatcher has no handler
# for it, when the download raises a StandardError, or when the response code
# is neither 200 nor 201. A StandardError raised by a handler's #trigger is
# logged and the crawl continues.
#
# @return [Object] the result of closing the logger
def run

  # Save beginning time
  start_time = Time.now

  @logger.info "Running Tango v.#{Tango::VERSION} ..."
  @logger.info "Target: #{@link_stack.host}."

  # Ask for the unlocked database only once, so that the database that gets
  # picked, logged and finally locked is guaranteed to be the same one even
  # if the locker would answer differently later in the run.
  database = @db_locker.unlocked
  pick_database( database )
  @logger.info "Using database '#{database}'."

  @logger.info "Truncating non persistent models ..."
  truncate_tables( non_persistent_models )

  # Load cache for persistent models
  @logger.info "Loading cache ..."
  setup_cache( @operators )
  load_cache( persistent_models )

  # Run before filter
  @logger.info "Running before callback ..."
  before

  # Init counter of successfully handled links
  links_counter = 0
  @logger.info "Tango starts crawling ..."

  # Start crawling website
  while @link_stack.has_links?

    # Get a link from the stack
    link = @link_stack.shift

    # Skip iteration if no handler found
    handler_klass = @dispatcher.find_handler( link )
    unless handler_klass
      @logger.error "No handler for link: #{link}."
      next
    end

    # Try to get contents of the link
    begin
      response = @http_client.get( @link_stack.host + link )
    rescue StandardError => e
      @logger.error "Could not download contents of #{@link_stack.host + link} link."
      @logger.error e.message
      next
    end

    # Continue only when response has code 200 or 201
    unless [ 200, 201 ].include?( response.code )
      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      next
    end

    # Parse response contents
    document = @parser.parse( response.body )
    # Init handler
    handler = handler_klass.new( link, document, @cache )

    # Append links fetched from handler
    @link_stack.append( handler.links )

    # Try to fire the handler
    begin
      handler.trigger
    rescue StandardError => e
      # Log error
      @logger.error "Link: #{link}. Handler had some troubles."
      @logger.error e.message
      @logger.error e.backtrace.join( "\n" )
    else
      links_counter += 1
      @logger.debug "Link: #{link}. Handler triggered successfully."
    end

    # Sleep to give the crawled server time to breathe
    # (only reached for links that got as far as a handler)
    sleep( @config["sleep"] || 0 )

  end

  # Release buffers
  @logger.info "Releasing buffers ..."
  release_buffer( @cache.buffer )

  # Run after filter
  @logger.info "Running after callback ..."
  after

  # Lock database used in this Tango iteration
  lock_database( database )

  # Get time of script execution ending
  end_time = Time.now

  @logger.info "Tango crawled #{links_counter}/#{@link_stack.shifted} links successfully."
  @logger.info "Start time: #{start_time}, end time: #{end_time}, time elapsed: #{end_time - start_time} seconds."

  # Close logger
  @logger.close

end

Private Instance Methods

load_cache( models ) click to toggle source

Load cache for given models

@return [Nil]

# File lib/tango/app.rb, line 243
# Put every record of the given models into the cache, keyed by the symbol
# that Tango::Kernel.symbolize produces for the model.
#
# @param models [Array] models responding to #all
# @return [Array] the models collection that was passed in
def load_cache( models )
  models.each do |klass|
    cache_key = Tango::Kernel.symbolize( klass )
    klass.all.each { |row| @cache.set( cache_key, row ) }
  end
end
lock_database( database ) click to toggle source

Lock database ( e.g. used in this Tango run )

@param database [String|Symbol] @return [String|Symbol]

# File lib/tango/app.rb, line 201
# Ask the database locker to lock the given database.
#
# @param database [String, Symbol]
# @return [Object] whatever the locker's #lock returns
def lock_database( database )
  locker = @db_locker
  locker.lock( database )
end
non_persistent_models() click to toggle source

Fetch the list of non-persistent models registered with the application

@return [Array]

# File lib/tango/app.rb, line 215
# Registered models whose #persistent? is falsy.
#
# @return [Array] a new array; the registry itself is left untouched
def non_persistent_models
  @models.reject( &:persistent? )
end
persistent_models() click to toggle source

Fetch the list of persistent models registered with the application

@return [Array]

# File lib/tango/app.rb, line 208
# Registered models whose #persistent? is truthy.
#
# @return [Array] a new array; the registry itself is left untouched
def persistent_models
  @models.select( &:persistent? )
end
pick_database( database ) click to toggle source

Pick database ( e.g. to be used in this Tango run )

@param database [String|Symbol] @return [String|Symbol]

# File lib/tango/app.rb, line 193
# Switch Multidb to the given database.
#
# @param database [String, Symbol]
# @return [Object] whatever Multidb.use returns
def pick_database( database )
  switcher = Multidb
  switcher.use( database )
end
release_buffer( buffer ) click to toggle source

Release given buffer

@param buffer [Tango::Resource::Buffer] @return [Nil]

# File lib/tango/app.rb, line 256
# Tell the given buffer to release everything it holds.
#
# @param buffer [Tango::Resource::Buffer]
# @return [Object] whatever the buffer's #release_all returns
def release_buffer( buffer )
  outcome = buffer.release_all
  outcome
end
setup_cache( operators ) click to toggle source

Register cache with resource operators

@param operators [Array] @return [Array]

# File lib/tango/app.rb, line 232
# Register one loader block per operator with the cache. The block is
# registered under the symbol Tango::Kernel.symbolize returns for the operator
# and passes the resource it is called with to the operator's #load.
#
# @param operators [Array]
# @return [Array] the operators collection that was passed in
def setup_cache( operators )
  operators.each do |op|
    cache_key = Tango::Kernel.symbolize( op )
    @cache.register( cache_key ) { |resource| op.load( resource ) }
  end
end
truncate_tables( models ) click to toggle source

Truncate the tables of the given models

@return [Nil]

# File lib/tango/app.rb, line 222
# Truncate the table of every given model.
#
# The table name is passed through the connection's quote_table_name before
# it is placed in the statement, so names that are reserved words or contain
# unusual characters still produce valid SQL.
#
# @param models [Array] models responding to #table_name
# @return [Array] the models collection that was passed in
def truncate_tables( models )
  models.each do |model|
    # Fetched per model so no connection is touched when the list is empty.
    connection = ActiveRecord::Base.connection
    connection.execute( "TRUNCATE #{connection.quote_table_name( model.table_name )}" )
  end
end