Class: Tango::App

Inherits:
Object
  • Object
show all
Defined in:
lib/tango/app.rb

Overview

Tango application

Author:

Instance Attribute Summary (collapse)

Instance Method Summary (collapse)

Constructor Details

- (Tango::App) initialize(config: {}, link_stack: nil, dispatcher: nil, cache: nil, http_client: nil, parser: nil, db_locker: nil, logger: nil)

Parameters:

  • config (Hash)

    Settings hash; the 'target_url' and 'sleep' keys are read by this class

  • link_stack (Tango::LinkStack)
  • dispatcher (Tango::ETL::Dispatcher)
  • cache (Tango::Resource::Cache)
  • http_client (Object)

    Must implement get method

  • parser (Object)

    Must implement parse method

  • db_locker (DatabaseLocker)
  • logger (Logger)


20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/tango/app.rb', line 20

# Builds the application and wires in its collaborators.
#
# Every collaborator is optional: when a falsy value is given, a default is
# constructed here instead.
#
# @param config [Hash] settings; 'target_url' seeds the default link stack
# @param link_stack [Object, nil] defaults to LinkStack.new(config['target_url'])
# @param dispatcher [Object, nil] defaults to ETL::Dispatcher.new
# @param cache [Object, nil] defaults to a Resource::Cache wrapping a new Resource::Buffer
# @param http_client [#get, nil] defaults to HTTParty
# @param parser [#parse, nil] defaults to Nokogiri::HTML
# @param db_locker [Object, nil] defaults to DatabaseLocker.new(Multidb.databases)
# @param logger [Object, nil] defaults to a Logger writing to STDOUT
def initialize(config: {}, link_stack: nil, dispatcher: nil, cache: nil,
               http_client: nil, parser: nil, db_locker: nil, logger: nil)
  # Registries filled in later by register_model / register_operator
  @models = {}
  @operators = {}

  @config = config

  # Resolve each collaborator, falling back to the stock one when none was injected
  link_stack ||= LinkStack.new(config['target_url'])
  dispatcher ||= ETL::Dispatcher.new
  cache ||= Resource::Cache.new(Resource::Buffer.new)
  http_client ||= HTTParty
  parser ||= Nokogiri::HTML
  db_locker ||= DatabaseLocker.new(Multidb.databases)
  logger ||= Logger.new(STDOUT)

  @link_stack = link_stack
  @dispatcher = dispatcher
  @cache = cache
  @http_client = http_client
  @parser = parser
  @db_locker = db_locker
  @logger = logger
end

Instance Attribute Details

- (Object) config (readonly)

Returns the value of attribute config



10
11
12
# File 'lib/tango/app.rb', line 10

# Read-only accessor for the application configuration.
#
# @return [Object] the value held in @config (the constructor stores its
#   `config:` argument there)
def config
  @config
end

- (Object) dispatcher (readonly)

Returns the value of attribute dispatcher



10
11
12
# File 'lib/tango/app.rb', line 10

# Read-only accessor for the dispatcher used to look up link handlers.
#
# @return [Object] the value held in @dispatcher
def dispatcher
  @dispatcher
end

- (Object) link_stack (readonly)

Returns the value of attribute link_stack



10
11
12
# File 'lib/tango/app.rb', line 10

# Read-only accessor for the stack of links still to be crawled.
#
# @return [Object] the value held in @link_stack
def link_stack
  @link_stack
end

- (Object) logger (readonly)

Returns the value of attribute logger



10
11
12
# File 'lib/tango/app.rb', line 10

# Read-only accessor for the logger the application writes to.
#
# @return [Object] the value held in @logger
def logger
  @logger
end

Instance Method Details

- (Object) after

Filter run after Tango execution



45
46
# File 'lib/tango/app.rb', line 45

# Hook called by #run once crawling has finished and the buffers have been
# released. Does nothing here; presumably meant to be overridden.
#
# @return [nil]
def after; end

- (Object) before

Filter run before Tango execution



41
42
# File 'lib/tango/app.rb', line 41

# Hook called by #run after the cache is loaded and before crawling starts.
# Does nothing here; presumably meant to be overridden.
#
# @return [nil]
def before; end

- (Object) register_model(symbol, model)

Register new resource model

Parameters:

  • symbol (Symbol)
  • model (Class)


52
53
54
55
56
57
58
59
60
61
# File 'lib/tango/app.rb', line 52

# Registers a resource model under the given key.
#
# A model that reports itself as non-persistent has its table emptied
# immediately with a TRUNCATE statement.
#
# @param symbol [Symbol] key the model is stored under
# @param model [Class] must respond to `persistent?` and `table_name`
# @return [Object, nil] result of the TRUNCATE statement, or nil when the
#   model is persistent
def register_model(symbol, model)
  @models[symbol] = model

  # Persistent models keep their rows between runs
  return if model.persistent?

  connection = ActiveRecord::Base.connection
  # Quote the identifier so a reserved word or unusual table name cannot
  # break the statement
  connection.execute("TRUNCATE #{connection.quote_table_name(model.table_name)}")
end

- (Object) register_operator(symbol, operator)

Register new resource operator

Parameters:

  • symbol (Symbol)
  • operator (Class)


67
68
69
70
71
72
73
74
75
76
# File 'lib/tango/app.rb', line 67

# Registers a resource operator under the given key and hooks it into the
# resource cache.
#
# @param symbol [Symbol] key the operator is stored under
# @param operator [Class] must respond to `load`
# @return [Object] whatever @cache.register returns
def register_operator(symbol, operator)
  @operators[symbol] = operator

  # The cache calls this block with a resource; loading is delegated to the operator
  @cache.register(symbol) { |item| operator.load(item) }
end

- (Integer) run

Run ETL process

Parameters:

  • none — run takes no arguments. It works on the collaborators handed to the constructor (link_stack, dispatcher, cache, http_client, parser, db_locker, logger).

Returns:

  • (Integer)


87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# File 'lib/tango/app.rb', line 87

# Runs the ETL process: selects a database, loads the cache, crawls every
# link on the stack, releases the buffers and locks the database.
#
# A link is skipped (and an error logged) when no handler is found, when the
# download raises, or when the response code is neither 200 nor 201. An
# exception raised while triggering a handler is logged and crawling
# continues with the next link.
#
# @return [Integer] number of links whose handler was triggered without raising
def run
  # Remember the start so the summary can report the elapsed time
  start_time = Time.now

  @logger.info "Running Tango v.#{Tango::VERSION} ..."
  @logger.info "Target: #{@link_stack.host}."

  # Pick the next unlocked database once, so the database locked at the end
  # is the same one that was selected here
  database = @db_locker.unlocked
  Multidb.use(database)
  @logger.info "Using database '#{database}'."

  # Load cache
  @logger.info "Loading cache ..."
  load_cache

  # Run before filter
  @logger.info "Running before callback ..."
  before

  # Counts links whose handler finished without raising
  links_counter = 0
  @logger.info "Tango starts crawling ..."

  # Crawl until the stack is empty; handlers may push new links as we go
  while @link_stack.has_links?
    link = @link_stack.shift

    # Skip links nobody knows how to handle
    handler_klass = @dispatcher.find_handler(link)
    unless handler_klass
      @logger.error "No handler for link: #{link}."
      next
    end

    # Try to download the contents of the link
    begin
      response = @http_client.get(@link_stack.host + link)
    rescue StandardError => e
      @logger.error "Could not download contents of #{@link_stack.host + link} link."
      @logger.error e.message
      next
    end

    # Continue only when the response has code 200 or 201
    unless [200, 201].include?(response.code)
      @logger.error "Response code for link #{link} is #{response.code}. Only codes 200 and 201 are accepted."
      next
    end

    # Parse the response and build the handler for this link
    document = @parser.parse(response.body)
    handler = handler_klass.new(link, document, @cache)

    # Queue the links the handler discovered
    @link_stack.append(handler.links)

    # Fire the handler; a failure is logged and does not stop the crawl
    begin
      handler.trigger
    rescue StandardError => e
      @logger.error "Link: #{link}. Handler had some troubles."
      @logger.error e.message
      @logger.error e.backtrace.join("\n")
    else
      links_counter += 1
      @logger.debug "Link: #{link}. Handler triggered successfully."
    end

    # Sleep to give the crawled server time to breathe
    sleep(@config["sleep"] || 0)
  end

  # Release buffers
  @logger.info "Releasing buffers ..."
  @cache.buffer.release_all

  # Run after filter
  @logger.info "Running after callback ..."
  after

  # Lock the database used in this Tango iteration
  @db_locker.lock(database)

  end_time = Time.now

  @logger.info "Tango crawled #{links_counter}/#{@link_stack.shifted} links successfully."
  @logger.info "Start time: #{start_time}, end time: #{end_time}, time elapsed: #{end_time - start_time} seconds."

  # Close logger
  @logger.close

  links_counter
end