class Browser
A mechanize class that emulates a web-browser, with cache and everything. Progress bars are enabled by default.
Attributes
agent[RW]
cache[RW]
delay[RW]
delay_jitter[RW]
use_cache[RW]
Public Class Methods
new(**options)
click to toggle source
Default options:
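:delay => 1, # Sleep 1 second between gets
:delay_jitter => 0.2, # Random deviation from delay
:cache => false, # Cache all gets (off unless :cache, :cached or :use_cache is truthy)
:logs => false, # Don't log the detailed transfer info
:cookiefile => "cookies.txt", # Save cookies to file
:cache_file => "browser-cache.db", # File handed to the cache
:proxy => nil # Optional "host:port" string for a SOCKS proxy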
# File lib/epitools/browser.rb, line 39
#
# Build a browser and set up its agent and cache.
#
# Recognised options (all optional):
#   :delay                          - stored in @delay (default: 1)
#   :delay_jitter                   - stored in @delay_jitter (default: 0.2)
#   :cache / :cached / :use_cache   - enable caching when any is truthy (default: off)
#   :logs / :use_logs               - enable Mechanize logging (default: false)
#   :cookiefile / :cookie_file      - cookie file name (default: "cookies.txt")
#   :cache_file                     - cache file name (default: "browser-cache.db")
#   :proxy                          - "host:port" string; configures the SOCKS
#                                     settings on TCPSocket
#
# The :use_logs and :cookie_file spellings are the ones shown in the class
# documentation; they are accepted alongside the original :logs and
# :cookiefile keys so that either form works.
def initialize(**options)
  @last_get     = Time.at(0)
  @delay        = options[:delay]        || 1
  @delay_jitter = options[:delay_jitter] || 0.2
  @use_cache    = !!(options[:cache] || options[:cached] || options[:use_cache])
  @use_logs     = options[:logs]       || options[:use_logs]    || false
  @cookie_file  = options[:cookiefile] || options[:cookie_file] || "cookies.txt"
  @cache_file   = options[:cache_file] || "browser-cache.db"

  # TODO: @progress, @user_agent, @logfile, @cache_file (default location: ~/.epitools?)

  if options[:proxy]
    # NOTE(review): a proxy string without ":port" yields port 0 (nil.to_i).
    host, port = options[:proxy].split(':')
    TCPSocket::socks_server = host
    TCPSocket::socks_port   = port.to_i
  end

  init_agent!
  init_cache!
end
Public Instance Methods
cache_put(page, url)
click to toggle source
# File lib/epitools/browser.rb, line 112
#
# Write +page+ to the cache under +url+, overwriting any existing entry.
# Nothing is written (and nil is returned) unless the cache reports the page
# as valid and its content type looks like text or javascript.
def cache_put(page, url)
  return unless cache.valid_page?(page)
  return unless page.content_type =~ %r{(^text/|^application/javascript|javascript)}

  puts " |_ writing to cache"
  cache.put(page, url, :overwrite=>true)
end
cacheable?(page)
click to toggle source
# File lib/epitools/browser.rb, line 105
#
# Is this page's content type one we consider cacheable?
#
# Returns true when +page.content_type+ starts with "text" or "application",
# and false otherwise (including when the content type is nil). Earlier
# versions returned nil instead of false for the negative case.
def cacheable?(page)
  case page.content_type
  when %r{^(text|application)}
    true
  else
    false
  end
end
get(url, **options)
click to toggle source
Retrieve a URL and return a Mechanize::Page instance (which acts a bit like a Nokogiri::HTML::Document instance).
Options:
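:cached => true/false | check cache before getting page (defaults to the browser's use_cache setting)
:referer => value | passed to the agent as the referer for this request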
# File lib/epitools/browser.rb, line 129
#
# Fetch +url+ and return the resulting page.
#
# Options:
#   :cached  - true/false; overrides the browser-wide @use_cache setting for
#              this call. When the cache is consulted and already holds the
#              URL, the cached page is returned and no delay is applied.
#   :referer - passed through to the agent as the request's referer.
#
# Network-level errors (bad response, connection reset, socket error,
# timeout, SOCKS error) are retried after a delay; after max_retries failed
# attempts the method gives up and returns nil. A "Name or service not known"
# lookup failure is re-raised immediately instead of being retried.
def get(url, **options)
  # TODO: Have a base-URL option

  #if relative?(url)
  #  url = URI.join("http://base-url/", url).to_s
  #end

  # Determine the cache setting
  use_cache = options[:cached].nil? ? @use_cache : options[:cached]

  # init_cache! only builds a cache when @use_cache is set, so a per-call
  # :cached => true on a cache-less browser would otherwise blow up with a
  # NoMethodError on nil. Fall back to a plain fetch in that case.
  use_cache = false if use_cache && cache.nil?

  cached_already = cache.include?(url) if use_cache

  puts
  puts "[ GET #{url} (using cache: #{!!use_cache}) ]"

  # Only throttle when we are actually going to hit the network.
  delay unless cached_already
  max_retries = 4
  retries     = 0

  begin
    if use_cache && (page = cache.get(url))
      puts " |_ cached (#{page.content_type})"
    else
      page = agent.get(url, [], options[:referer])
      @last_get = Time.now
      cache_put(page, url) if use_cache
    end

    puts
  rescue Net::HTTPBadResponse, Errno::ECONNRESET, SocketError, Timeout::Error, SOCKSError => e
    # A failed name lookup will not fix itself by retrying.
    raise if e.message == "getaddrinfo: Name or service not known"

    retries += 1
    return if retries >= max_retries

    puts " |_ ERROR: #{e.inspect} -- retrying"
    delay(5)
    retry

  # Disabled HTTP status handling, kept for reference:
  #
  # rescue Mechanize::ResponseCodeError => e
  #
  #   case e.response_code
  #     when "401" #=> Net::HTTPUnauthorized
  #       p e
  #       login!
  #       page = get(url)
  #       puts
  #     when "404"
  #       p e
  #       raise e
  #     when "503"
  #       puts " |_ ERROR: #{e.inspect} -- retrying"
  #       delay(5)
  #       retry
  #   else
  #     raise e
  #   end
  end

  page
end
init_agent!()
click to toggle source
# File lib/epitools/browser.rb, line 60
#
# Create the Mechanize agent (stored in @agent) and then load cookies.
# Logging to "mechanize.log" is only attached when @use_logs is set.
def init_agent!
  @agent = Mechanize.new do |mech|
    # Aliases listed by the original author:
    # ["Mechanize", "Mac Mozilla", "Linux Mozilla", "Windows IE 6", "iPhone", "Linux Konqueror", "Windows IE 7", "Mac FireFox", "Mac Safari", "Windows Mozilla"]
    mech.max_history      = 10
    mech.user_agent_alias = "Windows Chrome"
    mech.log              = Logger.new("mechanize.log") if @use_logs
  end

  load_cookies!
end
init_cache!()
click to toggle source
# File lib/epitools/browser.rb, line 82
#
# Create the page cache (stored in @cache) from @cache_file and the agent.
# Does nothing and returns nil when caching is disabled.
def init_cache!
  # TODO: Rescue "couldn't load" exception and disable caching
  return unless @use_cache

  @cache = Cache.new(@cache_file, agent)
end
relative?(url)
click to toggle source
# File lib/epitools/browser.rb, line 101
#
# Is +url+ relative, i.e. does it lack a leading http:// or https:// scheme?
#
# The scheme is matched case-insensitively (URL schemes are not case
# sensitive) and only at the very start of the string, so "HTTP://host/" is
# absolute and a string that merely contains "http://" on a later line is
# still relative. Always returns true or false.
def relative?(url)
  url !~ %r{\Ahttps?://}i
end