class Spidr::Agent
Attributes
HTTP Headers to use for every request
@return [Hash{String => String}]
@since 0.6.0
Delay in between fetching pages
@return [Integer]
List of unreachable URLs
@return [Set<URI::HTTP>]
History containing visited URLs
@return [Set<URI::HTTP>]
HTTP `Host` header to use
@return [String]
HTTP `Host` Headers to use for specific hosts
@return [Hash{String,Regexp => String}]
The visited URLs and their depth within a site
@return [Hash{URI::HTTP => Integer}]
Maximum number of pages to visit.
@return [Integer]
Maximum depth
@return [Integer]
Queue of URLs to visit
@return [Array<URI::HTTP>]
Queue of URLs to visit
@return [Array<URI::HTTP>]
Referer to use
@return [String]
List of acceptable URL schemes to follow
The session cache
@return [SessionCache]
@since 0.6.0
History containing visited URLs
@return [Set<URI::HTTP>]
Public Class Methods
Creates a new agent and spiders the entire domain.
@param [String] name
The top-level domain to spider.
@param [Hash{Symbol => Object}] kwargs
Additional keyword arguments. See {Agent#initialize}.
@yield [agent]
If a block is given, it will be passed the newly created agent before it begins spidering.
@yieldparam [Agent] agent
The newly created agent.
@return [Agent]
The created agent object.
@see initialize
@since 0.7.0
# File lib/spidr/agent.rb, line 418
#
# Creates an agent whose host filter matches +name+ and any of its
# sub-domains, then starts spidering at the root path of +name+.
#
# @param [String] name the domain to spider
# @param [Hash{Symbol => Object}] kwargs forwarded to {Agent#initialize}
# @yield [agent] forwarded to {Agent#initialize}
# @return [Agent] the agent, after spidering has returned
def self.domain(name, **kwargs, &block)
  domain_pattern = /(^|\.)#{Regexp.escape(name)}$/

  new(host: domain_pattern, **kwargs, &block).tap do |agent|
    agent.start_at(URI::HTTP.build(host: name, path: '/'))
  end
end
Creates a new agent and spiders the given host.
@param [String] name
The host-name to spider.
@param [Hash{Symbol => Object}] kwargs
Additional keyword arguments. See {Agent#initialize}.
@yield [agent]
If a block is given, it will be passed the newly created agent before it begins spidering.
@yieldparam [Agent] agent
The newly created agent.
@return [Agent]
The created agent object.
@see initialize
# File lib/spidr/agent.rb, line 389
#
# Creates an agent limited to the single host +name+ and starts spidering at
# that host's root path.
#
# @param [String] name the host-name to spider
# @param [Hash{Symbol => Object}] kwargs forwarded to {Agent#initialize}
# @yield [agent] forwarded to {Agent#initialize}
# @return [Agent] the agent, after spidering has returned
def self.host(name, **kwargs, &block)
  new(host: name, **kwargs, &block).tap do |agent|
    agent.start_at(URI::HTTP.build(host: name, path: '/'))
  end
end
Creates a new Agent object.
@param [String, nil] host_header
The HTTP `Host` header to use with each request.
@param [Hash{String,Regexp => String}] host_headers
The HTTP `Host` headers to use for specific hosts.
@param [Hash{String => String}] default_headers
Default headers to set for every request.
@param [String, nil] user_agent
The `User-Agent` string to send with each request.
@param [String, nil] referer
The `Referer` URL to send with each request.
@param [Integer, nil] open_timeout
Optional open connection timeout.
@param [Integer, nil] read_timeout
Optional read timeout.
@param [Integer, nil] ssl_timeout
Optional SSL connection timeout.
@param [Integer, nil] continue_timeout
Optional continue timeout.
@param [Integer, nil] keep_alive_timeout
Optional `Keep-Alive` timeout.
@param [Spidr::Proxy, Hash, URI::HTTP, String, nil] proxy
The proxy information to use.
@option proxy [String] :host
The host the proxy is running on.
@option proxy [Integer] :port (8080)
The port the proxy is running on.
@option proxy [String, nil] :user
The user to authenticate as with the proxy.
@option proxy [String, nil] :password
The password to authenticate with.
@param [Integer] delay
The number of seconds to pause between each request.
@param [Integer, nil] limit
The maximum number of pages to visit.
@param [Integer, nil] max_depth
The maximum link depth to follow.
@param [Set, Array, nil] queue
The initial queue of URLs to visit.
@param [Set, Array, nil] history
The initial list of visited URLs.
@param [Boolean] strip_fragments
Controls whether to strip the fragment components from the URLs.
@param [Boolean] strip_query
Controls whether to strip the query components from the URLs.
@param [Array<String>] schemes
The list of acceptable URI schemes to visit. The `https` scheme will be ignored if `net/https` cannot be loaded.
@param [String] host
The host-name to visit.
@param [Array<String, Regexp, Proc>] hosts
The patterns which match the host-names to visit.
@param [Array<String, Regexp, Proc>] ignore_hosts
The patterns which match the host-names to not visit.
@param [Array<Integer, Regexp, Proc>] ports
The patterns which match the ports to visit.
@param [Array<Integer, Regexp, Proc>] ignore_ports
The patterns which match the ports to not visit.
@param [Array<String, Regexp, Proc>] links
The patterns which match the links to visit.
@param [Array<String, Regexp, Proc>] ignore_links
The patterns which match the links to not visit.
@param [Array<String, Regexp, Proc>] urls
The patterns which match the URLs to visit.
@param [Array<String, Regexp, Proc>] ignore_urls
The patterns which match the URLs to not visit.
@param [Array<String, Regexp, Proc>] exts
The patterns which match the URI path extensions to visit.
@param [Array<String, Regexp, Proc>] ignore_exts
The patterns which match the URI path extensions to not visit.
@param [Boolean] robots
Specifies whether `robots.txt` should be honored.
@yield [agent]
If a block is given, it will be passed the newly created agent for further configuration.
@yieldparam [Agent] agent
The newly created agent.
# File lib/spidr/agent.rb, line 214 def initialize(# header keyword arguments host_header: nil, host_headers: {}, default_headers: {}, user_agent: Spidr.user_agent, referer: nil, # session cache keyword arguments proxy: Spidr.proxy, open_timeout: Spidr.open_timeout, ssl_timeout: Spidr.ssl_timeout, read_timeout: Spidr.read_timeout, continue_timeout: Spidr.continue_timeout, keep_alive_timeout: Spidr.keep_alive_timeout, # spidering controls keyword arguments delay: 0, limit: nil, max_depth: nil, # history keyword arguments queue: nil, history: nil, # sanitizer keyword arguments strip_fragments: true, strip_query: false, # filtering keyword arguments schemes: self.class.default_schemes, host: nil, hosts: nil, ignore_hosts: nil, ports: nil, ignore_ports: nil, links: nil, ignore_links: nil, urls: nil, ignore_urls: nil, exts: nil, ignore_exts: nil, # robots keyword arguments robots: Spidr.robots?) @host_header = host_header @host_headers = host_headers @default_headers = default_headers @user_agent = user_agent @referer = referer @sessions = SessionCache.new( proxy: proxy, open_timeout: open_timeout, ssl_timeout: ssl_timeout, read_timeout: read_timeout, continue_timeout: continue_timeout, keep_alive_timeout: keep_alive_timeout ) @cookies = CookieJar.new @authorized = AuthStore.new @running = false @delay = delay @history = Set[] @failures = Set[] @queue = [] @limit = limit @levels = Hash.new(0) @max_depth = max_depth self.queue = queue if queue self.history = history if history initialize_sanitizers( strip_fragments: strip_fragments, strip_query: strip_query ) initialize_filters( schemes: schemes, host: host, hosts: hosts, ignore_hosts: ignore_hosts, ports: ports, ignore_ports: ignore_ports, links: links, ignore_links: ignore_links, urls: urls, ignore_urls: ignore_urls, exts: exts, ignore_exts: ignore_exts ) initialize_actions initialize_events initialize_robots if robots yield self if block_given? end
Creates a new agent and spiders the web-site located at the given URL.
@param [URI::HTTP, String] url
The web-site to spider.
@param [Hash{Symbol => Object}] kwargs
Additional keyword arguments. See {Agent#initialize}.
@yield [agent]
If a block is given, it will be passed the newly created agent before it begins spidering.
@yieldparam [Agent] agent
The newly created agent.
@return [Agent]
The created agent object.
@see initialize
# File lib/spidr/agent.rb, line 360
#
# Creates an agent restricted to the host of +url+ and starts spidering at
# +url+.
#
# @param [URI::HTTP, String] url the web-site to spider
# @param [Hash{Symbol => Object}] kwargs forwarded to {Agent#initialize}
# @yield [agent] forwarded to {Agent#initialize}
# @return [Agent] the agent, after spidering has returned
def self.site(url, **kwargs, &block)
  site_url = URI(url)

  new(host: site_url.host, **kwargs, &block).tap do |agent|
    agent.start_at(site_url)
  end
end
Creates a new agent and begins spidering at the given URL.
@param [URI::HTTP, String] url
The URL to start spidering at.
@param [Hash{Symbol => Object}] kwargs
Additional keyword arguments. See {Agent#initialize}.
@yield [agent]
If a block is given, it will be passed the newly created agent before it begins spidering.
@yieldparam [Agent] agent
The newly created agent.
@return [Agent]
The created agent object.
@see initialize
@see start_at
# File lib/spidr/agent.rb, line 333
#
# Creates an agent from +kwargs+ and starts spidering at +url+.
#
# @param [URI::HTTP, String] url the URL to start at
# @param [Hash{Symbol => Object}] kwargs forwarded to {Agent#initialize}
# @yield [agent] forwarded to {Agent#initialize}
# @return [Agent] the agent, after spidering has returned
def self.start_at(url, **kwargs, &block)
  new(**kwargs, &block).tap { |agent| agent.start_at(url) }
end
Protected Class Methods
Determines the default URI schemes to follow.
@return [Array<String>]
The default URI schemes to follow.
@since 0.6.2
# File lib/spidr/agent/filters.rb, line 429
#
# Works out which URI schemes can be followed. 'http' is always included;
# 'https' is added only when 'net/https' can be required.
#
# @return [Array<String>] the default schemes
# @raise [Gem::LoadError] re-raised untouched when requiring 'net/https'
#   fails with a RubyGems load error
def self.default_schemes
  begin
    require 'net/https'
  rescue Gem::LoadError => e
    # Gem::LoadError must be listed before the general LoadError clause
    raise(e)
  rescue ::LoadError
    warn "Warning: cannot load 'net/https', https support disabled"
    return ['http']
  end

  ['http', 'https']
end
Public Instance Methods
Pass the headers from every response the agent receives to a given block.
@yield [headers]
The block will be passed the headers of every response.
@yieldparam [Hash] headers
The headers from a response.
# File lib/spidr/agent/events.rb, line 70
#
# Registers an every_page callback that passes the headers of each page to
# the given block.
#
# The callback is a no-op when no block was supplied, matching the other
# every_* helpers; previously a block-less call raised LocalJumpError once the
# first page was visited.
#
# @yield [headers] the headers of every response
# @yieldparam [Hash] headers the headers from a response
# @return the result of #every_page
def all_headers
  every_page do |page|
    yield page.headers if block_given?
  end
end
Clears the queue, history and failures of the agent.
# File lib/spidr/agent.rb, line 458
#
# Empties the queue, the history and the list of failures, in that order.
#
# @return [Agent] the agent itself
def clear
  [@queue, @history, @failures].each(&:clear)
  self
end
Continue spidering.
@yield [page]
If a block is given, it will be passed every page visited.
@yieldparam [Page] page
The page to be visited.
# File lib/spidr/agent/actions.rb, line 42
#
# Resets the paused flag and resumes spidering through #run.
#
# @yield [page] forwarded to #run
# @return the result of #run
def continue!(&block)
  @paused = false

  run(&block)
end
Enqueues a given URL for visiting, only if it passes all of the agent's rules for visiting a given URL.
@param [URI::HTTP, String] url
The URL to enqueue for visiting.
@return [Boolean]
Specifies whether the URL was enqueued, or ignored.
# File lib/spidr/agent.rb, line 658 def enqueue(url,level=0) url = sanitize_url(url) if (!queued?(url) && visit?(url)) link = url.to_s begin @every_url_blocks.each { |url_block| url_block.call(url) } @every_url_like_blocks.each do |pattern,url_blocks| match = case pattern when Regexp link =~ pattern else (pattern == link) || (pattern == url) end if match url_blocks.each { |url_block| url_block.call(url) } end end rescue Actions::Paused => action raise(action) rescue Actions::SkipLink return false rescue Actions::Action end @queue << url @levels[url] = level return true end return false end
Pass every Atom document that the agent parses to a given block.
@yield [doc]
The block will be passed every Atom document parsed.
@yieldparam [Nokogiri::XML::Document] doc
A parsed XML document.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
# File lib/spidr/agent/events.rb, line 389
#
# Registers an every_page callback that yields the parsed document of each
# page for which +page.atom?+ is true. Pages whose +doc+ is nil are skipped,
# and the callback does nothing when no block was given.
#
# @yield [doc] the parsed document
# @return the result of #every_page
def every_atom_doc
  every_page do |visited|
    next unless block_given? && visited.atom?

    doc = visited.doc
    yield doc if doc
  end
end
Pass every Atom feed that the agent visits to a given block.
@yield [feed]
The block will be passed every Atom feed visited.
@yieldparam [Page] feed
A visited page.
# File lib/spidr/agent/events.rb, line 453
#
# Registers an every_page callback that yields only the pages for which
# +page.atom?+ is true. Does nothing when no block was given.
#
# @yield [feed] a visited page
# @return the result of #every_page
def every_atom_page
  every_page do |visited|
    next unless block_given? && visited.atom?

    yield visited
  end
end
Pass every Bad Request page that the agent visits to a given block.
@yield [page]
The block will be passed every Bad Request page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 142
#
# Registers an every_page callback that yields only the pages for which
# +page.bad_request?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_bad_request_page
  every_page do |visited|
    next unless block_given? && visited.bad_request?

    yield visited
  end
end
Pass every CSS page that the agent visits to a given block.
@yield [page]
The block will be passed every CSS page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 423
#
# Registers an every_page callback that yields only the pages for which
# +page.css?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_css_page
  every_page do |visited|
    next unless block_given? && visited.css?

    yield visited
  end
end
Pass every HTML or XML document that the agent parses to a given block.
@yield [doc]
The block will be passed every HTML or XML document parsed.
@yieldparam [Nokogiri::HTML::Document, Nokogiri::XML::Document] doc
A parsed HTML or XML document.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html @see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
# File lib/spidr/agent/events.rb, line 283
#
# Registers an every_page callback that yields the parsed document of every
# page whose +doc+ is not nil. Does nothing when no block was given.
#
# @yield [doc] the parsed document
# @return the result of #every_page
def every_doc
  every_page do |visited|
    next unless block_given?

    doc = visited.doc
    yield doc if doc
  end
end
Pass each URL that could not be requested to the given block.
@yield [url]
The block will be passed every URL that could not be requested.
@yieldparam [URI::HTTP] url
A failed URL.
# File lib/spidr/agent/events.rb, line 28
#
# Appends the given block to the list of failed-URL callbacks.
#
# @yield [url] a URL that could not be requested
# @return [Agent] the agent itself
def every_failed_url(&block)
  tap { @every_failed_url_blocks << block }
end
Pass every Forbidden page that the agent visits to a given block.
@yield [page]
The block will be passed every Forbidden page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 172
#
# Registers an every_page callback that yields only the pages for which
# +page.forbidden?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_forbidden_page
  every_page do |visited|
    next unless block_given? && visited.forbidden?

    yield visited
  end
end
Pass every HTML document that the agent parses to a given block.
@yield [doc]
The block will be passed every HTML document parsed.
@yieldparam [Nokogiri::HTML::Document] doc
A parsed HTML document.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html
# File lib/spidr/agent/events.rb, line 304
#
# Registers an every_page callback that yields the parsed document of each
# page for which +page.html?+ is true. Pages whose +doc+ is nil are skipped,
# and the callback does nothing when no block was given.
#
# @yield [doc] the parsed document
# @return the result of #every_page
def every_html_doc
  every_page do |visited|
    next unless block_given? && visited.html?

    doc = visited.doc
    yield doc if doc
  end
end
Pass every HTML page that the agent visits to a given block.
@yield [page]
The block will be passed every HTML page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 233
#
# Registers an every_page callback that yields only the pages for which
# +page.html?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_html_page
  every_page do |visited|
    next unless block_given? && visited.html?

    yield visited
  end
end
Pass every Internal Server Error page that the agent visits to a given block.
@yield [page]
The block will be passed every Internal Server Error page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 203
#
# Registers an every_page callback that yields only the pages for which
# +page.had_internal_server_error?+ is true. Does nothing when no block was
# given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_internal_server_error_page
  every_page do |visited|
    next unless block_given? && visited.had_internal_server_error?

    yield visited
  end
end
Pass every JavaScript page that the agent visits to a given block.
@yield [page]
The block will be passed every JavaScript page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 408
#
# Registers an every_page callback that yields only the pages for which
# +page.javascript?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_javascript_page
  every_page do |visited|
    next unless block_given? && visited.javascript?

    yield visited
  end
end
Passes every origin and destination URI of each link to a given block.
@yield [origin,dest]
The block will be passed every origin and destination URI of each link.
@yieldparam [URI::HTTP] origin
The URI that a link originated from.
@yieldparam [URI::HTTP] dest
The destination URI of a link.
# File lib/spidr/agent/events.rb, line 518
#
# Appends the given block to the list of link callbacks.
#
# @yield [origin, dest] the origin and destination URI of a link
# @return [Agent] the agent itself
def every_link(&block)
  tap { @every_link_blocks << block }
end
Pass every Missing page that the agent visits to a given block.
@yield [page]
The block will be passed every Missing page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 187
#
# Registers an every_page callback that yields only the pages for which
# +page.missing?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_missing_page
  every_page do |visited|
    next unless block_given? && visited.missing?

    yield visited
  end
end
Pass every MS Word page that the agent visits to a given block.
@yield [page]
The block will be passed every MS Word page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 468
#
# Registers an every_page callback that yields only the pages for which
# +page.ms_word?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_ms_word_page
  every_page do |visited|
    next unless block_given? && visited.ms_word?

    yield visited
  end
end
Pass every OK page that the agent visits to a given block.
@yield [page]
The block will be passed every OK page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 97
#
# Registers an every_page callback that yields only the pages for which
# +page.ok?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_ok_page
  every_page do |visited|
    next unless block_given? && visited.ok?

    yield visited
  end
end
Pass every page that the agent visits to a given block.
@yield [page]
The block will be passed every page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 83
#
# Appends the given block to the list of page callbacks.
#
# @yield [page] a visited page
# @return [Agent] the agent itself
def every_page(&block)
  tap { @every_page_blocks << block }
end
Pass every PDF page that the agent visits to a given block.
@yield [page]
The block will be passed every PDF page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 483
#
# Registers an every_page callback that yields only the pages for which
# +page.pdf?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_pdf_page
  every_page do |visited|
    next unless block_given? && visited.pdf?

    yield visited
  end
end
Pass every Redirect page that the agent visits to a given block.
@yield [page]
The block will be passed every Redirect page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 112
#
# Registers an every_page callback that yields only the pages for which
# +page.redirect?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_redirect_page
  every_page do |visited|
    next unless block_given? && visited.redirect?

    yield visited
  end
end
Pass every RSS document that the agent parses to a given block.
@yield [doc]
The block will be passed every RSS document parsed.
@yieldparam [Nokogiri::XML::Document] doc
A parsed XML document.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
# File lib/spidr/agent/events.rb, line 368
#
# Registers an every_page callback that yields the parsed document of each
# page for which +page.rss?+ is true. Pages whose +doc+ is nil are skipped,
# and the callback does nothing when no block was given.
#
# @yield [doc] the parsed document
# @return the result of #every_page
def every_rss_doc
  every_page do |visited|
    next unless block_given? && visited.rss?

    doc = visited.doc
    yield doc if doc
  end
end
Pass every RSS feed that the agent visits to a given block.
@yield [feed]
The block will be passed every RSS feed visited.
@yieldparam [Page] feed
A visited page.
# File lib/spidr/agent/events.rb, line 438
#
# Registers an every_page callback that yields only the pages for which
# +page.rss?+ is true. Does nothing when no block was given.
#
# @yield [feed] a visited page
# @return the result of #every_page
def every_rss_page
  every_page do |visited|
    next unless block_given? && visited.rss?

    yield visited
  end
end
Pass every Timeout page that the agent visits to a given block.
@yield [page]
The block will be passed every Timeout page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 127
#
# Registers an every_page callback that yields only the pages for which
# +page.timedout?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_timedout_page
  every_page do |visited|
    next unless block_given? && visited.timedout?

    yield visited
  end
end
Pass every Plain Text page that the agent visits to a given block.
@yield [page]
The block will be passed every Plain Text page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 218
#
# Registers an every_page callback that yields only the pages for which
# +page.txt?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_txt_page
  every_page do |visited|
    next unless block_given? && visited.txt?

    yield visited
  end
end
Pass each URL from each page visited to the given block.
@yield [url]
The block will be passed every URL from every page visited.
@yieldparam [URI::HTTP] url
Each URL from each page visited.
# File lib/spidr/agent/events.rb, line 14
#
# Appends the given block to the list of URL callbacks.
#
# @yield [url] a URL found on a visited page
# @return [Agent] the agent itself
def every_url(&block)
  tap { @every_url_blocks << block }
end
Pass every URL that the agent visits, and matches a given pattern, to a given block.
@param [Regexp, String] pattern
The pattern to match URLs with.
@yield [url]
The block will be passed every URL that matches the given pattern.
@yieldparam [URI::HTTP] url
A matching URL.
@since 0.3.2
# File lib/spidr/agent/events.rb, line 48
#
# Appends the given block to the callbacks stored under +pattern+.
#
# @param [Regexp, String] pattern the pattern the callbacks are keyed by
# @yield [url] a matching URL
# @return [Agent] the agent itself
def every_url_like(pattern, &block)
  tap { @every_url_like_blocks[pattern] << block }
end
Pass every XML document that the agent parses to a given block.
@yield [doc]
The block will be passed every XML document parsed.
@yieldparam [Nokogiri::XML::Document] doc
A parsed XML document.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
# File lib/spidr/agent/events.rb, line 325
#
# Registers an every_page callback that yields the parsed document of each
# page for which +page.xml?+ is true. Pages whose +doc+ is nil are skipped,
# and the callback does nothing when no block was given.
#
# @yield [doc] the parsed document
# @return the result of #every_page
def every_xml_doc
  every_page do |visited|
    next unless block_given? && visited.xml?

    doc = visited.doc
    yield doc if doc
  end
end
Pass every XML page that the agent visits to a given block.
@yield [page]
The block will be passed every XML page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 248
#
# Registers an every_page callback that yields only the pages for which
# +page.xml?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_xml_page
  every_page do |visited|
    next unless block_given? && visited.xml?

    yield visited
  end
end
Pass every XML Stylesheet (XSL) that the agent parses to a given block.
@yield [doc]
The block will be passed every XML Stylesheet (XSL) parsed.
@yieldparam [Nokogiri::XML::Document] doc
A parsed XML document.
@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html
# File lib/spidr/agent/events.rb, line 347
#
# Registers an every_page callback that yields the parsed document of each
# page for which +page.xsl?+ is true. Pages whose +doc+ is nil are skipped,
# and the callback does nothing when no block was given.
#
# @yield [doc] the parsed document
# @return the result of #every_page
def every_xsl_doc
  every_page do |visited|
    next unless block_given? && visited.xsl?

    doc = visited.doc
    yield doc if doc
  end
end
Pass every XML Stylesheet (XSL) page that the agent visits to a given block.
@yield [page]
The block will be passed every XML Stylesheet (XSL) page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 264
#
# Registers an every_page callback that yields only the pages for which
# +page.xsl?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_xsl_page
  every_page do |visited|
    next unless block_given? && visited.xsl?

    yield visited
  end
end
Pass every ZIP page that the agent visits to a given block.
@yield [page]
The block will be passed every ZIP page visited.
@yieldparam [Page] page
A visited page.
# File lib/spidr/agent/events.rb, line 498
#
# Registers an every_page callback that yields only the pages for which
# +page.zip?+ is true. Does nothing when no block was given.
#
# @yield [page] a visited page
# @return the result of #every_page
def every_zip_page
  every_page do |visited|
    next unless block_given? && visited.zip?

    yield visited
  end
end
Determines whether a given URL could not be visited.
@param [URI::HTTP, String] url
The URL to check for failures.
@return [Boolean]
Specifies whether the given URL was unable to be visited.
# File lib/spidr/agent.rb, line 607
#
# Checks whether +url+, coerced with URI(), is in the list of failures.
#
# @param [URI::HTTP, String] url the URL to look up
# @return [Boolean] whether the URL is recorded as failed
def failed?(url)
  target = URI(url)

  @failures.include?(target)
end
Sets the list of failed URLs.
@param [#each] new_failures
The new list of failed URLs.
@return [Set<URI::HTTP>]
The list of failed URLs.
@example
agent.failures = ['http://localhost/']
# File lib/spidr/agent.rb, line 588
#
# Replaces the contents of the failures list with +new_failures+, coercing
# every entry with URI().
#
# The new URLs are collected before the existing collection is touched, so
# passing the agent's own failures back in no longer empties it, and an
# unparseable URL leaves the previous contents intact.
#
# @param [#each] new_failures the new list of failed URLs
# @return the failures collection (the same object as before)
# @raise [URI::InvalidURIError] if an entry cannot be parsed
def failures=(new_failures)
  urls = []
  new_failures.each { |url| urls << URI(url) }

  # replace in place so existing references to the collection stay valid
  @failures.replace(urls)
  @failures
end
Requests and creates a new Page object from a given URL.
@param [URI::HTTP] url
The URL to request.
@yield [page]
If a block is given, it will be passed the page that represents the response.
@yieldparam [Page] page
The page for the response.
@return [Page, nil]
The page for the response, or `nil` if the request failed.
# File lib/spidr/agent.rb, line 710
#
# Issues a GET for +url+ inside prepare_request and wraps the response in a
# Page. Cookies from the page are saved to the cookie jar before the page is
# yielded and returned.
#
# @param [URI::HTTP] url the URL to request
# @yield [page] the page built from the response
# @return [Page] the page; otherwise whatever prepare_request returns when it
#   does not invoke its block
def get_page(url)
  url = URI(url)

  prepare_request(url) do |session, path, headers|
    response = session.get(path, headers)
    page     = Page.new(url, response)

    # save any new cookies
    @cookies.from_page(page)

    yield page if block_given?
    return page
  end
end
Sets the history of URLs that were previously visited.
@param [#each] new_history
A list of URLs to populate the history with.
@return [Set<URI::HTTP>]
The history of the agent.
@example
agent.history = ['http://tenderlovemaking.com/2009/05/06/ann-nokogiri-130rc1-has-been-released/']
# File lib/spidr/agent.rb, line 531
#
# Replaces the contents of the history with +new_history+, coercing every
# entry with URI().
#
# The new URLs are collected before the existing collection is touched, so
# passing the agent's own history back in no longer empties it, and an
# unparseable URL leaves the previous contents intact.
#
# @param [#each] new_history the URLs to populate the history with
# @return the history collection (the same object as before)
# @raise [URI::InvalidURIError] if an entry cannot be parsed
def history=(new_history)
  urls = []
  new_history.each { |url| urls << URI(url) }

  # replace in place so existing references to the collection stay valid
  @history.replace(urls)
  @history
end
Specifies the patterns that match URI path extensions to not visit.
@return [Array<String, Regexp, Proc>]
The URI path extension patterns to not visit.
# File lib/spidr/agent/filters.rb, line 330
#
# Returns the reject list of the extension rules.
#
# @return the value of +@ext_rules.reject+
def ignore_exts
  @ext_rules.reject
end
Adds a given pattern to the {#ignore_exts}.
@param [String, Regexp] pattern
The pattern to match URI path extensions with.
@yield [ext]
If a block is given, it will be used to filter URI path extensions.
@yieldparam [String] ext
A URI path extension to reject or accept.
# File lib/spidr/agent/filters.rb, line 346
#
# Appends +pattern+ to #ignore_exts, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to add
# @yield [ext] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def ignore_exts_like(pattern = nil, &block)
  rule = pattern || block
  ignore_exts << rule if rule

  self
end
Specifies the patterns that match host-names to not visit.
@return [Array<String, Regexp, Proc>]
The host-name patterns to not visit.
# File lib/spidr/agent/filters.rb, line 62
#
# Returns the reject list of the host rules.
#
# @return the value of +@host_rules.reject+
def ignore_hosts
  @host_rules.reject
end
Adds a given pattern to the {#ignore_hosts}.
@param [String, Regexp] pattern
The pattern to match host-names with.
@yield [host]
If a block is given, it will be used to filter host-names.
@yieldparam [String] host
A host-name to reject or accept.
# File lib/spidr/agent/filters.rb, line 78
#
# Appends +pattern+ to #ignore_hosts, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to add
# @yield [host] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def ignore_hosts_like(pattern = nil, &block)
  rule = pattern || block
  ignore_hosts << rule if rule

  self
end
Specifies the patterns that match links to not visit.
@return [Array<String, Regexp, Proc>]
The link patterns to not visit.
# File lib/spidr/agent/filters.rb, line 194
#
# Returns the reject list of the link rules.
#
# @return the value of +@link_rules.reject+
def ignore_links
  @link_rules.reject
end
Adds a given pattern to the {#ignore_links}.
@param [String, Regexp] pattern
The pattern to match links with.
@yield [link]
If a block is given, it will be used to filter links.
@yieldparam [String] link
A link to reject or accept.
# File lib/spidr/agent/filters.rb, line 210
#
# Appends +pattern+ to #ignore_links, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to add
# @yield [link] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def ignore_links_like(pattern = nil, &block)
  rule = pattern || block
  ignore_links << rule if rule

  self
end
Specifies the patterns that match ports to not visit.
@return [Array<Integer, Regexp, Proc>]
The port patterns to not visit.
# File lib/spidr/agent/filters.rb, line 126
#
# Returns the reject list of the port rules.
#
# @return the value of +@port_rules.reject+
def ignore_ports
  @port_rules.reject
end
Adds a given pattern to the {#ignore_ports}.
@param [Integer, Regexp] pattern
The pattern to match ports with.
@yield [port]
If a block is given, it will be used to filter ports.
@yieldparam [Integer] port
A port to reject or accept.
# File lib/spidr/agent/filters.rb, line 142
#
# Appends +pattern+ to #ignore_ports, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [Integer, Regexp] pattern the pattern to add
# @yield [port] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def ignore_ports_like(pattern = nil, &block)
  rule = pattern || block
  ignore_ports << rule if rule

  self
end
Specifies the patterns that match URLs to not visit.
@return [Array<String, Regexp, Proc>]
The URL patterns to not visit.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 264
#
# Returns the reject list of the URL rules.
#
# @return the value of +@url_rules.reject+
def ignore_urls
  @url_rules.reject
end
Adds a given pattern to the {#ignore_urls}.
@param [String, Regexp] pattern
The pattern to match URLs with.
@yield [url]
If a block is given, it will be used to filter URLs.
@yieldparam [URI::HTTP, URI::HTTPS] url
A URL to reject or accept.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 282
#
# Appends +pattern+ to #ignore_urls, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to add
# @yield [url] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def ignore_urls_like(pattern = nil, &block)
  rule = pattern || block
  ignore_urls << rule if rule

  self
end
Initializes the robots filter.
# File lib/spidr/agent/robots.rb, line 13
#
# Creates the robots filter for the agent's user agent string.
#
# @raise [ArgumentError] when no top-level Robots constant is defined
def initialize_robots
  if Object.const_defined?(:Robots)
    @robots = Robots.new(@user_agent)
  else
    raise(ArgumentError,":robots option given but unable to require 'robots' gem")
  end
end
Pauses the agent, causing spidering to temporarily stop.
@raise [Paused]
Indicates to the agent, that it should pause spidering.
# File lib/spidr/agent/actions.rb, line 63
#
# Marks the agent as paused, then raises Actions::Paused.
#
# @raise [Actions::Paused] always
def pause!
  @paused = true

  raise Actions::Paused
end
Sets the pause state of the agent.
@param [Boolean] state
The new pause state of the agent.
# File lib/spidr/agent/actions.rb, line 53
#
# Stores +state+ as the paused flag.
#
# @param [Boolean] state the new pause state
def pause=(state)
  @paused = state
end
Determines whether the agent is paused.
@return [Boolean]
Specifies whether the agent is paused.
# File lib/spidr/agent/actions.rb, line 74
#
# Reports whether the paused flag compares equal to +true+, so other truthy
# values do not count as paused.
#
# @return [Boolean] whether the agent is paused
def paused?
  @paused == true
end
Posts supplied form data and creates a new Page object from a given URL.
@param [URI::HTTP] url
The URL to request.
@param [String] post_data
Form option data.
@yield [page]
If a block is given, it will be passed the page that represents the response.
@yieldparam [Page] page
The page for the response.
@return [Page, nil]
The page for the response, or `nil` if the request failed.
@since 0.2.2
# File lib/spidr/agent.rb, line 745
#
# Issues a POST of +post_data+ to +url+ inside prepare_request and wraps the
# response in a Page. Cookies from the page are saved to the cookie jar before
# the page is yielded and returned.
#
# @param [URI::HTTP] url the URL to request
# @param [String] post_data the body to post
# @yield [page] the page built from the response
# @return [Page] the page; otherwise whatever prepare_request returns when it
#   does not invoke its block
def post_page(url, post_data = '')
  url = URI(url)

  prepare_request(url) do |session, path, headers|
    response = session.post(path, post_data, headers)
    page     = Page.new(url, response)

    # save any new cookies
    @cookies.from_page(page)

    yield page if block_given?
    return page
  end
end
The proxy information the agent uses.
@return [Proxy]
The proxy information.
@see SessionCache#proxy
@since 0.2.2
# File lib/spidr/agent.rb, line 434
#
# Reads the proxy information from the session cache.
#
# @return the value of +@sessions.proxy+
def proxy
  @sessions.proxy
end
Sets the proxy information that the agent uses.
@param [Proxy, Hash, URI::HTTP, String, nil] new_proxy
The new proxy information.
@return [Proxy]
The new proxy information.
@see SessionCache#proxy=
@since 0.2.2
# File lib/spidr/agent.rb, line 451
#
# Hands +new_proxy+ to the session cache.
#
# @param [Proxy, Hash, URI::HTTP, String, nil] new_proxy the new proxy information
def proxy=(new_proxy)
  @sessions.proxy = new_proxy
end
Sets the queue of URLs to visit.
@param [#each] new_queue
The new list of URLs to visit.
@return [Array<URI::HTTP>]
The list of URLs to visit.
@example
agent.queue = ['http://www.vimeo.com/', 'http://www.reddit.com/']
# File lib/spidr/agent.rb, line 625
#
# Replaces the contents of the queue with +new_queue+, coercing every entry
# with URI().
#
# The new URLs are collected before the existing queue is touched, so passing
# the agent's own queue back in no longer empties it, and an unparseable URL
# leaves the previous contents intact.
#
# @param [#each] new_queue the new list of URLs to visit
# @return the queue (the same object as before)
# @raise [URI::InvalidURIError] if an entry cannot be parsed
def queue=(new_queue)
  urls = []
  new_queue.each { |url| urls << URI(url) }

  # replace in place so existing references to the queue stay valid
  @queue.replace(urls)
  @queue
end
Determines whether a given URL has been enqueued.
@param [URI::HTTP] url
The URL to search for in the queue.
@return [Boolean]
Specifies whether the given URL has been queued for visiting.
# File lib/spidr/agent.rb, line 644
#
# Checks whether +url+ is waiting in the queue.
#
# The argument is coerced with URI(), as #failed? does, so a String URL is
# compared against the queued URI objects instead of never matching.
#
# @param [URI::HTTP, String] url the URL to look for
# @return [Boolean] whether the URL is queued
def queued?(url)
  @queue.include?(URI(url))
end
Determines whether a URL is allowed by the robot policy.
@param [URI::HTTP, String] url
The URL to check.
@return [Boolean]
Specifies whether a URL is allowed by the robot policy.
# File lib/spidr/agent/robots.rb, line 30
#
# Asks the robots filter whether +url+ may be visited. Every URL is allowed
# when no robots filter has been set up.
#
# @param [URI::HTTP, String] url the URL to check
# @return [Boolean] true without a filter, otherwise the filter's answer
def robot_allowed?(url)
  return true unless @robots

  @robots.allowed?(url)
end
Start spidering until the queue becomes empty or the agent is paused.
@yield [page]
If a block is given, it will be passed every page visited.
@yieldparam [Page] page
A page which has been visited.
# File lib/spidr/agent.rb, line 492
#
# Visits dequeued URLs until the queue is empty, the agent is paused, or the
# limit is reached. On a normal finish the running flag is cleared and the
# session cache is emptied.
#
# @yield [page] forwarded to #visit_page
# @return [Agent] the agent itself
def run(&block)
  @running = true

  until @queue.empty? || paused? || limit_reached?
    begin
      visit_page(dequeue, &block)
    rescue Actions::Paused
      # return immediately: @running stays true and sessions are kept
      return self
    rescue Actions::Action
      # any other action just ends the current visit
    end
  end

  @running = false
  @sessions.clear

  self
end
Determines if the agent is running.
@return [Boolean]
Specifies whether the agent is running or stopped.
# File lib/spidr/agent.rb, line 515
#
# Reports whether the running flag compares equal to +true+.
#
# @return [Boolean] whether the agent is running
def running?
  @running == true
end
Sanitizes a URL based on filtering options.
@param [URI::HTTP, URI::HTTPS, String] url
The URL to be sanitized
@return [URI::HTTP, URI::HTTPS]
The new sanitized URL.
@since 0.2.2
# File lib/spidr/agent/sanitizers.rb, line 25
#
# Returns +url+ as a URI with the fragment and/or query removed, according to
# @strip_fragments and @strip_query.
#
# URI() hands back the very same object when given a URI, so stripping is done
# on a copy; the caller's URI is no longer modified in place.
#
# @param [URI::HTTP, URI::HTTPS, String] url the URL to sanitize
# @return [URI::HTTP, URI::HTTPS] the sanitized URL
def sanitize_url(url)
  url = URI(url)
  return url unless @strip_fragments || @strip_query

  sanitized = url.dup
  sanitized.fragment = nil if @strip_fragments
  sanitized.query    = nil if @strip_query
  sanitized
end
Sets the list of acceptable URL schemes to visit.
@param [Array] new_schemes
The new schemes to visit.
@example
agent.schemes = ['http']
# File lib/spidr/agent/filters.rb, line 20
#
# Stores the acceptable schemes, converting each entry to a String.
#
# @param [Array] new_schemes the new schemes to visit
def schemes=(new_schemes)
  @schemes = new_schemes.map { |scheme| scheme.to_s }
end
Causes the agent to skip the link being enqueued.
@raise [SkipLink]
Indicates to the agent, that the current link should be skipped, and not enqueued or visited.
# File lib/spidr/agent/actions.rb, line 85
#
# Raises Actions::SkipLink.
#
# @raise [Actions::SkipLink] always
def skip_link!
  raise Actions::SkipLink
end
Causes the agent to skip the page being visited.
@raise [SkipPage]
Indicates to the agent, that the current page should be skipped.
# File lib/spidr/agent/actions.rb, line 95
#
# Raises Actions::SkipPage.
#
# @raise [Actions::SkipPage] always
def skip_page!
  raise Actions::SkipPage
end
Start spidering at a given URL.
@param [URI::HTTP, String] url
The URL to start spidering at.
@yield [page]
If a block is given, it will be passed every page visited.
@yieldparam [Page] page
A page which has been visited.
# File lib/spidr/agent.rb, line 477
#
# Enqueues +url+ and then starts spidering through #run.
#
# @param [URI::HTTP, String] url the URL to start at
# @yield [page] forwarded to #run
# @return the result of #run
def start_at(url, &block)
  enqueue(url)

  run(&block)
end
Converts the agent into a Hash.
@return [Hash]
The agent represented as a Hash containing the `history` and the `queue` of the agent.
# File lib/spidr/agent.rb, line 819
#
# Returns a Hash holding the agent's history and queue under the +:history+
# and +:queue+ keys. The collections themselves are returned, not copies.
#
# @return [Hash] the history and queue of the agent
def to_hash
  {
    history: @history,
    queue: @queue
  }
end
@see every_url_like
# File lib/spidr/agent/events.rb, line 56
#
# Delegates to #every_url_like with the same pattern and block.
#
# @param pattern the pattern to match URLs with
# @return the result of #every_url_like
def urls_like(pattern, &block)
  every_url_like(pattern, &block)
end
Specifies the patterns that match the URI path extensions to visit.
@return [Array<String, Regexp, Proc>]
The URI path extensions patterns to visit.
# File lib/spidr/agent/filters.rb, line 298
#
# Returns the accept list of the extension rules.
#
# @return the value of +@ext_rules.accept+
def visit_exts
  @ext_rules.accept
end
Adds a given pattern to the {#visit_exts}.
@param [String, Regexp] pattern
The pattern to match URI path extensions with.
@yield [ext]
If a block is given, it will be used to filter URI path extensions.
@yieldparam [String] ext
A URI path extension to accept or reject.
# File lib/spidr/agent/filters.rb, line 314
#
# Appends +pattern+ to #visit_exts, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to add
# @yield [ext] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def visit_exts_like(pattern = nil, &block)
  rule = pattern || block
  visit_exts << rule if rule

  self
end
Specifies the patterns that match host-names to visit.
@return [Array<String, Regexp, Proc>]
The host-name patterns to visit.
# File lib/spidr/agent/filters.rb, line 30
#
# Returns the accept list of the host rules.
#
# @return the value of +@host_rules.accept+
def visit_hosts
  @host_rules.accept
end
Adds a given pattern to the {#visit_hosts}.
@param [String, Regexp] pattern
The pattern to match host-names with.
@yield [host]
If a block is given, it will be used to filter host-names.
@yieldparam [String] host
A host-name to accept or reject.
# File lib/spidr/agent/filters.rb, line 46
#
# Appends +pattern+ to #visit_hosts, or the block when no pattern was given.
# Nothing is added when neither is supplied.
#
# @param [String, Regexp] pattern the pattern to add
# @yield [host] used as the rule when +pattern+ is omitted
# @return [Agent] the agent itself
def visit_hosts_like(pattern = nil, &block)
  rule = pattern || block
  visit_hosts << rule if rule

  self
end
Specifies the patterns that match the links to visit.
@return [Array<String, Regexp, Proc>]
The link patterns to visit.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 160
#
# Returns the accept list of the link rules.
#
# @return the value of +@link_rules.accept+
def visit_links
  @link_rules.accept
end
Adds a given pattern to the {#visit_links}.
@param [String, Regexp] pattern
The pattern to match links with.
@yield [link]
If a block is given, it will be used to filter links.
@yieldparam [String] link
A link to accept or reject.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 178
#
# Registers one more rule in {#visit_links}.
#
# @param pattern
#   The rule to append. When it is `nil` (or `false`) the block is used instead.
#
# @return [self]
#   The receiver, so calls can be chained.
def visit_links_like(pattern=nil,&block)
  # the explicit pattern wins; the block is only a fallback
  rule = pattern || block
  visit_links << rule if rule

  self
end
Visits a given URL, and enqueues the links recovered from the URL to be visited later.
@param [URI::HTTP, String] url
The URL to visit.
@yield [page]
If a block is given, it will be passed the page which was visited.
@yieldparam [Page] page
The page which was visited.
@return [Page, nil]
The page that was visited. If `nil` is returned, either the request for the page failed, or the page was skipped.
# File lib/spidr/agent.rb, line 776 def visit_page(url) url = sanitize_url(url) get_page(url) do |page| @history << page.url begin @every_page_blocks.each { |page_block| page_block.call(page) } yield page if block_given? rescue Actions::Paused => action raise(action) rescue Actions::SkipPage return nil rescue Actions::Action end page.each_url do |next_url| begin @every_link_blocks.each do |link_block| link_block.call(page.url,next_url) end rescue Actions::Paused => action raise(action) rescue Actions::SkipLink next rescue Actions::Action end if (@max_depth.nil? || @max_depth > @levels[url]) enqueue(next_url,@levels[url] + 1) end end end end
Specifies the patterns that match the ports to visit.
@return [Array<Integer, Regexp, Proc>]
The port patterns to visit.
# File lib/spidr/agent/filters.rb, line 94
#
# Exposes the accept list held by the port rules.
#
# @return
#   Whatever `@port_rules.accept` returns; the `*_like` helpers append to it.
def visit_ports
  @port_rules.accept
end
Adds a given pattern to the {#visit_ports}.
@param [Integer, Regexp] pattern
The pattern to match ports with.
@yield [port]
If a block is given, it will be used to filter ports.
@yieldparam [Integer] port
A port to accept or reject.
# File lib/spidr/agent/filters.rb, line 110
#
# Registers one more rule in {#visit_ports}.
#
# @param pattern
#   The rule to append. When it is `nil` (or `false`) the block is used instead.
#
# @return [self]
#   The receiver, so calls can be chained.
def visit_ports_like(pattern=nil,&block)
  # the explicit pattern wins; the block is only a fallback
  rule = pattern || block
  visit_ports << rule if rule

  self
end
Specifies the patterns that match the URLs to visit.
@return [Array<String, Regexp, Proc>]
The URL patterns to visit.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 228
#
# Exposes the accept list held by the URL rules.
#
# @return
#   Whatever `@url_rules.accept` returns; the `*_like` helpers append to it.
def visit_urls
  @url_rules.accept
end
Adds a given pattern to the {#visit_urls}.
@param [String, Regexp] pattern
The pattern to match URLs with.
@yield [url]
If a block is given, it will be used to filter URLs.
@yieldparam [URI::HTTP, URI::HTTPS] url
A URL to accept or reject.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 246
#
# Registers one more rule in {#visit_urls}.
#
# @param pattern
#   The rule to append. When it is `nil` (or `false`) the block is used instead.
#
# @return [self]
#   The receiver, so calls can be chained.
def visit_urls_like(pattern=nil,&block)
  # the explicit pattern wins; the block is only a fallback
  rule = pattern || block
  visit_urls << rule if rule

  self
end
Determines whether a URL was visited or not.
@param [URI::HTTP, String] url
The URL to search for.
@return [Boolean]
Specifies whether a URL was visited.
# File lib/spidr/agent.rb, line 572
#
# Checks the history for a URL.
#
# @param [URI::HTTP, String] url
#   The URL to look up; it is passed through `URI()` before the lookup.
#
# @return [Boolean]
#   The result of `@history.include?` for the converted URL.
def visited?(url)
  uri = URI(url)

  @history.include?(uri)
end
Specifies all hosts that were visited.
@return [Array<String>]
The hosts which have been visited.
# File lib/spidr/agent.rb, line 559
#
# Collects the distinct hosts of everything in `visited_urls`.
#
# @return [Array]
#   Each URL's `host`, with duplicates removed (first occurrence kept).
def visited_hosts
  hosts = visited_urls.map { |url| url.host }

  hosts.uniq
end
Specifies the links which have been visited.
@return [Array<String>]
The links which have been visited.
# File lib/spidr/agent.rb, line 549
#
# Renders every history entry as a String.
#
# @return [Array<String>]
#   The result of calling `to_s` on each element of `@history`.
def visited_links
  @history.map { |url| url.to_s }
end
Protected Instance Methods
Dequeues a URL that will later be visited.
@return [URI::HTTP]
The URL that was at the front of the queue.
# File lib/spidr/agent.rb, line 922
#
# Takes the next entry off the front of the queue.
#
# @return
#   The element removed by `@queue.shift`.
def dequeue
  @queue.shift
end
Adds a given URL to the failures list.
@param [URI::HTTP] url
The URL to add to the failures list.
# File lib/spidr/agent.rb, line 963
#
# Records a URL as failed and notifies the failed-URL callbacks.
#
# @param [URI::HTTP] url
#   The URL appended to `@failures` and passed to each callback.
#
# @return [true]
def failed(url)
  @failures << url

  # callbacks run in registration order, each receiving the failed URL
  @every_failed_url_blocks.each do |callback|
    callback.call(url)
  end

  true
end
# File lib/spidr/agent/actions.rb, line 101
#
# Resets the action state: the paused flag starts out as `false`.
def initialize_actions
  @paused = false
end
# File lib/spidr/agent/events.rb, line 525
#
# Creates the empty callback registries used by the event hooks.
def initialize_events
  @every_url_blocks        = []
  @every_failed_url_blocks = []
  @every_page_blocks       = []

  # per-pattern registry: an unknown pattern lazily gets its own empty Array
  @every_url_like_blocks = Hash.new { |registry,pattern| registry[pattern] = [] }

  @every_link_blocks = []
end
Initializes filtering rules.
@param [Array<String>] schemes
The list of acceptable URI schemes to visit. The `https` scheme will be ignored if `net/https` cannot be loaded.
@param [String] host
The host-name to visit.
@param [Array<String, Regexp, Proc>] hosts
The patterns which match the host-names to visit.
@param [Array<String, Regexp, Proc>] ignore_hosts
The patterns which match the host-names to not visit.
@param [Array<Integer, Regexp, Proc>] ports
The patterns which match the ports to visit.
@param [Array<Integer, Regexp, Proc>] ignore_ports
The patterns which match the ports to not visit.
@param [Array<String, Regexp, Proc>] links
The patterns which match the links to visit.
@param [Array<String, Regexp, Proc>] ignore_links
The patterns which match the links to not visit.
@param [Array<String, Regexp, Proc>] urls
The patterns which match the URLs to visit.
@param [Array<String, Regexp, Proc>] ignore_urls
The patterns which match the URLs to not visit.
@param [Array<String, Regexp, Proc>] exts
The patterns which match the URI path extensions to visit.
@param [Array<String, Regexp, Proc>] ignore_exts
The patterns which match the URI path extensions to not visit.
# File lib/spidr/agent/filters.rb, line 398
#
# Builds the scheme list and one `Rules` object per filter category
# (hosts, ports, links, URLs, extensions) from the given accept/reject
# pattern lists.
#
# @param schemes
#   The acceptable schemes; each entry is stored as a String.
#   Defaults to `self.class.default_schemes`.
#
# @param host
#   When given, it is additionally registered via {#visit_hosts_like}.
def initialize_filters(schemes: self.class.default_schemes,
                       host: nil,
                       hosts: nil,
                       ignore_hosts: nil,
                       ports: nil,
                       ignore_ports: nil,
                       links: nil,
                       ignore_links: nil,
                       urls: nil,
                       ignore_urls: nil,
                       exts: nil,
                       ignore_exts: nil)
  @schemes = schemes.map { |scheme| scheme.to_s }

  @host_rules = Rules.new(accept: hosts, reject: ignore_hosts)
  @port_rules = Rules.new(accept: ports, reject: ignore_ports)
  @link_rules = Rules.new(accept: links, reject: ignore_links)
  @url_rules  = Rules.new(accept: urls,  reject: ignore_urls)
  @ext_rules  = Rules.new(accept: exts,  reject: ignore_exts)

  # must run after @host_rules exists, since it appends to the host accept list
  visit_hosts_like(host) if host
end
Initializes the Sanitizer rules.
@param [Boolean] strip_fragments
Specifies whether or not to strip the fragment component from URLs.
@param [Boolean] strip_query
Specifies whether or not to strip the query component from URLs.
@since 0.2.2
# File lib/spidr/agent/sanitizers.rb, line 47
#
# Stores the two sanitizer switches on the agent.
#
# @param [Boolean] strip_fragments
#   Saved as `@strip_fragments` (defaults to `true`).
#
# @param [Boolean] strip_query
#   Saved as `@strip_query` (defaults to `false`).
def initialize_sanitizers(strip_fragments: true, strip_query: false)
  @strip_fragments = strip_fragments
  @strip_query     = strip_query
end
Determines if the maximum limit has been reached.
@return [Boolean]
@since 0.6.0
# File lib/spidr/agent.rb, line 933
#
# Determines if the maximum number of visited pages has been reached.
#
# @return [Boolean]
#   `false` when no limit is set; otherwise whether the history holds at
#   least `@limit` entries.
def limit_reached?
  # without a limit, answer a real `false` (not `nil`) so the predicate
  # honours its documented Boolean return type
  return false unless @limit

  @history.length >= @limit
end
Normalizes the request path and grabs a session to handle page get and post requests.
@param [URI::HTTP] url
The URL to request.
@yield [session, path, headers]
A block whose purpose is to make a page request.
@yieldparam [Net::HTTP] session
An HTTP session object.
@yieldparam [String] path
Normalized URL string.
@yieldparam [Hash] headers
A Hash of request header options.
@since 0.2.2
# File lib/spidr/agent.rb, line 885
#
# Derives the request path and headers for a URL, then yields the session
# for that URL together with the path and headers.
#
# @param [URI::HTTP] url
#   The URL to request.
#
# @yield [session, path, headers]
#   `@sessions[url]`, the request path (with `?query` appended when the URL
#   has a query), and the result of {#prepare_request_headers}.
#
# @return
#   The block's value, or `nil` when one of the rescued network/IO errors
#   was raised while sleeping or inside the block.
def prepare_request(url,&block)
  # an empty path means the root of the host
  path = url.path.empty? ? '/' : url.path
  path = "#{path}?#{url.query}" if url.query

  headers = prepare_request_headers(url)

  begin
    sleep(@delay) if @delay > 0

    yield @sessions[url], path, headers
  rescue SystemCallError,
         Timeout::Error,
         SocketError,
         IOError,
         OpenSSL::SSL::SSLError,
         Net::HTTPBadResponse,
         Zlib::Error
    # discard the session for this URL and record the URL as failed
    @sessions.kill!(url)
    failed(url)

    nil
  end
end
Prepares request headers for the given URL.
@param [URI::HTTP] url
The URL to prepare the request headers for.
@return [Hash{String => String}]
The prepared headers.
@since 0.6.0
# File lib/spidr/agent.rb, line 836
#
# Assembles the request headers for a URL, starting from a copy of the
# default headers.
#
# @param [URI::HTTP] url
#   The URL the headers are for.
#
# @return [Hash{String => String}]
#   The copied default headers plus any of `Host`, `User-Agent`, `Referer`,
#   `Authorization` and `Cookie` that apply.
def prepare_request_headers(url)
  # work on a copy so the agent's defaults are never mutated
  headers = @default_headers.dup

  # the first host pattern that matches the URL's host decides the Host header
  host_rule = @host_headers.find { |name,_header| url.host.match(name) }
  headers['Host'] = host_rule.last if host_rule

  # the agent-wide Host header is only a fallback
  headers['Host'] ||= @host_header if @host_header
  headers['User-Agent'] = @user_agent if @user_agent
  headers['Referer'] = @referer if @referer

  authorization = @authorized.for_url(url)
  headers['Authorization'] = "Basic #{authorization}" if authorization

  header_cookies = @cookies.for_host(url.host)
  headers['Cookie'] = header_cookies if header_cookies

  headers
end
Determines if a given URL should be visited.
@param [URI::HTTP] url
The URL in question.
@return [Boolean]
Specifies whether the given URL should be visited.
# File lib/spidr/agent.rb, line 946
#
# Decides whether a URL is worth visiting: it must not be in the history
# and must pass every filter, checked in the order listed below.
#
# @param [URI::HTTP] url
#   The URL in question.
#
# @return [Boolean]
#   `false` for an already-visited URL; otherwise the outcome of the
#   short-circuiting filter chain.
def visit?(url)
  return false if visited?(url)

  visit_scheme?(url.scheme) &&
    visit_host?(url.host) &&
    visit_port?(url.port) &&
    visit_link?(url.to_s) &&
    visit_url?(url) &&
    visit_ext?(url.path) &&
    robot_allowed?(url.to_s)
end
Determines if a given URI path extension should be visited.
@param [String] path
The path that contains the extension.
@return [Boolean]
Specifies whether the given URI path extension should be visited.
# File lib/spidr/agent/filters.rb, line 525
#
# Tests a path's extension against the extension rules.
#
# @param [String] path
#   The path whose extension is extracted with `File.extname`.
#
# @return
#   The result of `@ext_rules.accept?` for the extension without its
#   leading dot (`nil` when the path has no extension).
def visit_ext?(path)
  # drop the leading "."; an empty extname ("") slices to nil
  extension = File.extname(path)[1..-1]

  @ext_rules.accept?(extension)
end
Determines if a given host-name should be visited.
@param [String] host
The host-name.
@return [Boolean]
Specifies whether the given host-name should be visited.
# File lib/spidr/agent/filters.rb, line 471
#
# Tests a host-name against the host rules.
#
# @param [String] host
#   The host-name handed to `@host_rules.accept?`.
#
# @return
#   Whatever `@host_rules.accept?` answers.
def visit_host?(host)
  @host_rules.accept?(host)
end
Determines if a given link should be visited.
@param [String] link
The link.
@return [Boolean]
Specifies whether the given link should be visited.
# File lib/spidr/agent/filters.rb, line 497
#
# Tests a link against the link rules.
#
# @param [String] link
#   The link handed to `@link_rules.accept?`.
#
# @return
#   Whatever `@link_rules.accept?` answers.
def visit_link?(link)
  @link_rules.accept?(link)
end
Determines if a given port should be visited.
@param [Integer] port
The port number.
@return [Boolean]
Specifies whether the given port should be visited.
# File lib/spidr/agent/filters.rb, line 484
#
# Tests a port number against the port rules.
#
# @param [Integer] port
#   The port handed to `@port_rules.accept?`.
#
# @return
#   Whatever `@port_rules.accept?` answers.
def visit_port?(port)
  @port_rules.accept?(port)
end
Determines if a given URI scheme should be visited.
@param [String] scheme
The URI scheme.
@return [Boolean]
Specifies whether the given scheme should be visited.
# File lib/spidr/agent/filters.rb, line 454
#
# Tests a URI scheme against the list of acceptable schemes.
#
# @param [String, nil] scheme
#   The URI scheme.
#
# @return [Boolean]
#   `true` when no scheme is given; otherwise whether `@schemes`
#   includes it.
def visit_scheme?(scheme)
  # a missing scheme is never a reason to reject
  return true unless scheme

  @schemes.include?(scheme)
end
Determines if a given URL should be visited.
@param [URI::HTTP, URI::HTTPS] link
The URL.
@return [Boolean]
Specifies whether the given URL should be visited.
@since 0.2.4
# File lib/spidr/agent/filters.rb, line 512
#
# Tests a URL against the URL rules.
#
# @param [URI::HTTP, URI::HTTPS] link
#   The URL handed to `@url_rules.accept?`.
#
# @return
#   Whatever `@url_rules.accept?` answers.
def visit_url?(link)
  @url_rules.accept?(link)
end