class Spidr::Page

Represents a requested page from a website.

Constants

Reserved names used within Cookie strings

Attributes

headers[R]

Headers returned with the body

response[R]

HTTP Response

url[R]

URL of the page

Public Class Methods

new(url,response) click to toggle source

Creates a new Page object.

@param [URI::HTTP] url

The URL of the page.

@param [Net::HTTPResponse] response

The response from the request for the page.
# File lib/spidr/page.rb, line 27
# Stores the URL and the response, caches the response headers as
# returned by `response.to_hash`, and resets the parsed-document cache.
def initialize(url,response)
  @url, @response = url, response
  @headers = @response.to_hash
  @doc = nil
end

Public Instance Methods

%(*arguments)
Alias for: at
/(*paths)
Alias for: search
at(*arguments) click to toggle source

Searches for the first occurrence of an XPath or CSS Path expression.

@return [Nokogiri::HTML::Node, Nokogiri::XML::Node, nil]

The first matched node. Returns `nil` if no nodes could be matched,
or if the page is not a HTML or XML document.

@example

page.at('//title')

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Node.html#M000251

# File lib/spidr/page.rb, line 110
# Forwards the arguments to the parsed document's `at` method.
# Returns nil when no parsed document is available.
def at(*arguments)
  return unless doc

  doc.at(*arguments)
end
Also aliased as: %
atom?() click to toggle source

Determines if the page is an Atom feed.

@return [Boolean]

Specifies whether the page is an Atom feed.
# File lib/spidr/page/content_types.rb, line 193
# Checks the page's content types for the Atom feed media type.
def atom?
  atom_type = 'application/atom+xml'
  is_content_type?(atom_type)
end
bad_request?() click to toggle source

Determines if the response code is `400`.

@return [Boolean]

Specifies whether the response code is `400`.
# File lib/spidr/page/status_codes.rb, line 33
# True only when the response code equals 400.
def bad_request?
  status = code
  status == 400
end
body() click to toggle source

The body of the response.

@return [String]

The body of the response.
# File lib/spidr/page.rb, line 40
# Returns the response body, substituting an empty String when the
# response has no body.
def body
  content = response.body
  content ? content : ''
end
Also aliased as: to_s
code() click to toggle source

The response code from the page.

@return [Integer]

Response code from the page.
# File lib/spidr/page/status_codes.rb, line 11
# Converts the response's status code to an Integer via `to_i`.
def code
  status = @response.code
  status.to_i
end
content_charset() click to toggle source

The charset included in the Content-Type.

@return [String, nil]

The charset of the content.

@since 0.4.0

# File lib/spidr/page/content_types.rb, line 35
# Extracts the charset parameter from the Content-Type header values.
#
# The parameter name is matched case-insensitively and a value wrapped in
# double quotes (`charset="utf-8"`) is returned without the quotes, since
# both forms are permitted for media-type parameters.
#
# @return [String, nil]
#   The first non-empty charset found, or `nil` when no Content-Type
#   value carries one.
def content_charset
  content_types.each do |value|
    # values without any parameters cannot carry a charset
    next unless value.include?(';')

    value.split(';').each do |param|
      name, charset = param.strip.split('=',2)
      next unless name && charset
      next unless name.strip.downcase == 'charset'

      charset = charset.strip

      # unwrap quoted-string values
      if charset.length >= 2 && charset.start_with?('"') && charset.end_with?('"')
        charset = charset[1..-2]
      end

      return charset unless charset.empty?
    end
  end

  return nil
end
content_type() click to toggle source

The Content-Type of the page.

@return [String]

The Content-Type of the page.
# File lib/spidr/page/content_types.rb, line 11
# Looks up the Content-Type header on the response, falling back to an
# empty String when the header is absent.
def content_type
  header = @response['Content-Type']
  header ? header : ''
end
content_types() click to toggle source

The content types of the page.

@return [Array<String>]

The values within the Content-Type header.

@since 0.2.2

# File lib/spidr/page/content_types.rb, line 23
# Returns every value of the content-type header, or an empty Array when
# the response has no such header.
def content_types
  values = @response.get_fields('content-type')
  values ? values : []
end
cookies() click to toggle source

The Cookie values sent along with the page.

@return [Array<String>]

The Cookies from the response.

@since 0.2.2

# File lib/spidr/page/cookies.rb, line 32
# Returns every Set-Cookie header value, or an empty Array when the
# response sets no cookies.
def cookies
  set_cookies = @response.get_fields('Set-Cookie')
  set_cookies ? set_cookies : []
end
css?() click to toggle source

Determines if the page is a CSS stylesheet.

@return [Boolean]

Specifies whether the page is a CSS stylesheet.
# File lib/spidr/page/content_types.rb, line 172
# Checks the page's content types for the CSS media type.
def css?
  css_type = 'text/css'
  is_content_type?(css_type)
end
directory?() click to toggle source

Determines if the page is a Directory Listing.

@return [Boolean]

Specifies whether the page is a Directory Listing.

@since 0.3.0

# File lib/spidr/page/content_types.rb, line 108
# Checks the page's content types for the text/directory media type.
def directory?
  directory_type = 'text/directory'
  is_content_type?(directory_type)
end
doc() click to toggle source

Returns a parsed document object for HTML, XML, RSS and Atom pages.

@return [Nokogiri::HTML::Document, Nokogiri::XML::Document, nil]

The document that represents HTML or XML pages.
Returns `nil` if the page is not HTML, XML, RSS or Atom, or if
the page could not be parsed properly.

@see nokogiri.rubyforge.org/nokogiri/Nokogiri/XML/Document.html @see nokogiri.rubyforge.org/nokogiri/Nokogiri/HTML/Document.html

# File lib/spidr/page.rb, line 57
# Lazily parses the body into a Nokogiri document.
#
# HTML pages use Nokogiri::HTML::Document; RSS, Atom, XML and XSL pages
# use Nokogiri::XML::Document. Returns nil for an empty body, for any
# other content type, or when parsing raises an error.
def doc
  return if body.empty?

  parser = if html?
             Nokogiri::HTML::Document
           elsif rss? || atom? || xml? || xsl?
             Nokogiri::XML::Document
           end
  return unless parser

  begin
    # the parsed document is cached in @doc after the first success
    @doc ||= parser.parse(body, @url.to_s, content_charset)
  rescue
    # parse failures are deliberately swallowed; the caller gets nil
    nil
  end
end
each()
Alias for: each_url
each_mailto() { |get_attribute('href')[7..-1]| ... } click to toggle source

Enumerates over every `mailto:` link in the page.

@yield [link]

The given block will be passed every `mailto:` link from the page.

@yieldparam [String] link

A `mailto:` link from the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.5.0

# File lib/spidr/page/html.rb, line 147
# Yields the address part of every `mailto:` anchor in an HTML page.
# Returns an Enumerator when called without a block.
def each_mailto
  return enum_for(__method__) unless block_given?
  return unless html? && doc

  doc.search('//a[starts-with(@href,"mailto:")]').each do |anchor|
    href = anchor.get_attribute('href')

    # skip the 7-character "mailto:" prefix
    yield href[7..-1]
  end
end
each_meta_redirect() { |redirect| ... } click to toggle source

Enumerates over the meta-redirect links in the page.

@yield [link]

If a block is given, it will be passed every meta-redirect link
from the page.

@yieldparam [String] link

A meta-redirect link from the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.3.0

# File lib/spidr/page/html.rb, line 38
# Yields the target of every `<meta http-equiv="refresh">` tag in an
# HTML page. Returns an Enumerator when called without a block.
#
# The `url=` marker is matched case-insensitively (`URL=` is common in
# the wild), whitespace around the `=` and after the URL is tolerated,
# and quotes wrapped around the URL are removed.
def each_meta_redirect
  return enum_for(__method__) unless block_given?

  if (html? && doc)
    search('//meta[@http-equiv and @content]').each do |node|
      next unless node.get_attribute('http-equiv') =~ /refresh/i

      content  = node.get_attribute('content').to_s
      redirect = content.match(/url\s*=\s*(\S+)\s*$/i)
      next unless redirect

      link = redirect[1]

      # drop quotes wrapped around the URL, e.g. url='http://example.com/'
      if link.start_with?('"', "'")
        quote = link[0,1]
        link  = link[1..-1].chomp(quote)
      end

      yield link unless link.empty?
    end
  end
end
each_redirect(&block) click to toggle source

Enumerates over every HTTP or meta-redirect link in the page.

@yield [link]

The given block will be passed every redirection link from the page.

@yieldparam [String] link

A HTTP or meta-redirect link from the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.3.0

# File lib/spidr/page/html.rb, line 108
# Yields every redirect target of the page. Returns an Enumerator when
# called without a block.
def each_redirect(&block)
  return enum_for(__method__) unless block

  locations = @response.get_fields('Location')

  if !locations.nil? && !locations.empty?
    # Location headers take precedence over meta-refresh tags
    locations.each(&block)
  else
    # no Location header: fall back to page-level meta redirects
    each_meta_redirect(&block)
  end
end
each_url() { |url| ... } click to toggle source

Enumerates over every absolute URL in the page.

@yield [url]

The given block will be passed every URL in the page.

@yieldparam [URI::HTTP] url

An absolute URL in the page.

@return [Enumerator]

If no block is given, an enumerator object will be returned.

@since 0.3.0

# File lib/spidr/page/html.rb, line 236
# Yields the absolute form of every link produced by `each_link`,
# skipping links that `to_absolute` could not convert. Returns an
# Enumerator when called without a block.
def each_url
  return enum_for(__method__) unless block_given?

  each_link do |link|
    absolute = to_absolute(link)

    yield absolute if absolute
  end
end
Also aliased as: each
forbidden?()
Alias for: is_forbidden?
gif?() click to toggle source

Determines if the page is a GIF image.

@return [Boolean]

Specifies whether the page is a GIF image.

@since 0.7.0

# File lib/spidr/page/content_types.rb, line 247
# Checks the page's content types for the GIF media type.
def gif?
  gif_type = 'image/gif'
  is_content_type?(gif_type)
end
had_internal_server_error?() click to toggle source

Determines if the response code is `500`.

@return [Boolean]

Specifies whether the response code is `500`.
# File lib/spidr/page/status_codes.rb, line 91
# True only when the response code equals 500.
def had_internal_server_error?
  status = code
  status == 500
end
html?() click to toggle source

Determines if the page is an HTML document.

@return [Boolean]

Specifies whether the page is an HTML document.
# File lib/spidr/page/content_types.rb, line 118
# Checks the page's content types for the HTML media type.
def html?
  html_type = 'text/html'
  is_content_type?(html_type)
end
ico?() click to toggle source

Determines if the page is an ICO image.

@return [Boolean]

Specifies whether the page is an ICO image.

@since 0.7.0

# File lib/spidr/page/content_types.rb, line 271
# Checks the page's content types for either ICO media type. The
# second type is only tested when the first does not match.
def ico?
  matched = is_content_type?('image/x-icon')
  matched ||= is_content_type?('image/vnd.microsoft.icon')
  matched
end
Also aliased as: icon?
icon?()
Alias for: ico?
is_content_type?(type) click to toggle source

Determines if any of the content-types of the page include a given type.

@param [String] type

The content-type to test for.

@return [Boolean]

Specifies whether the page includes the given content-type.

@example Match the Content-Type

page.is_content_type?('application/json')

@example Match the sub-type of the Content-Type

page.is_content_type?('json')

@since 0.4.0

# File lib/spidr/page/content_types.rb, line 69
# Tests whether any Content-Type value of the page matches the given type.
#
# A `type` containing a `/` is compared against the full media type
# (`text/html`); otherwise it is compared against the sub-type only
# (`html`). Parameters after the first `;` are ignored. Comparison is
# case-insensitive and ignores surrounding whitespace, and an empty
# header value simply does not match instead of raising.
#
# @param [String] type
#   The content-type, or sub-type, to test for.
#
# @return [Boolean]
#   Whether any content-type value matches.
def is_content_type?(type)
  type           = type.downcase
  match_sub_type = !type.include?('/')

  content_types.any? do |value|
    # drop any parameters; `first` is nil for an empty header value
    media_type = value.split(';',2).first.to_s.strip.downcase

    # keep only the part after the "/" when matching a sub-type
    media_type = media_type.split('/',2).last.to_s if match_sub_type

    media_type == type
  end
end
is_forbidden?() click to toggle source

Determines if the response code is `403`.

@return [Boolean]

Specifies whether the response code is `403`.
# File lib/spidr/page/status_codes.rb, line 55
# True only when the response code equals 403.
def is_forbidden?
  status = code
  status == 403
end
Also aliased as: forbidden?
is_missing?() click to toggle source

Determines if the response code is `404`.

@return [Boolean]

Specifies whether the response code is `404`.
# File lib/spidr/page/status_codes.rb, line 67
# True only when the response code equals 404.
def is_missing?
  status = code
  status == 404
end
Also aliased as: missing?
is_ok?() click to toggle source

Determines if the response code is `200`.

@return [Boolean]

Specifies whether the response code is `200`.
# File lib/spidr/page/status_codes.rb, line 21
# True only when the response code equals 200.
def is_ok?
  status = code
  status == 200
end
Also aliased as: ok?
is_redirect?() click to toggle source

Determines if the response code is `300`, `301`, `302`, `303` or `307`. Also checks for “soft” redirects added at the page level by a meta refresh tag.

@return [Boolean]

Specifies whether the response code is a HTTP Redirect code.
# File lib/spidr/page/status_codes.rb, line 103
# Determines whether the page is a redirect.
#
# Response codes `300`-`303`, `307` and `308` (Permanent Redirect) count
# as redirects. A `200` response counts as a redirect only when the page
# contains a meta-refresh redirect.
#
# @return [Boolean]
#   Whether the page redirects elsewhere.
def is_redirect?
  case code
  when 300..303, 307, 308
    true
  when 200
    # "soft" redirect declared inside the page itself
    meta_redirect?
  else
    false
  end
end
Also aliased as: redirect?
is_timedout?() click to toggle source

Determines if the response code is `408`.

@return [Boolean]

Specifies whether the response code is `408`.
# File lib/spidr/page/status_codes.rb, line 79
# True only when the response code equals 408.
def is_timedout?
  status = code
  status == 408
end
Also aliased as: timedout?
is_unauthorized?() click to toggle source

Determines if the response code is `401`.

@return [Boolean]

Specifies whether the response code is `401`.
# File lib/spidr/page/status_codes.rb, line 43
# True only when the response code equals 401.
def is_unauthorized?
  status = code
  status == 401
end
Also aliased as: unauthorized?
javascript?() click to toggle source

Determines if the page is JavaScript.

@return [Boolean]

Specifies whether the page is JavaScript.
# File lib/spidr/page/content_types.rb, line 149
# Checks the page's content types for either JavaScript media type. The
# second type is only tested when the first does not match.
def javascript?
  matched = is_content_type?('text/javascript')
  matched ||= is_content_type?('application/javascript')
  matched
end
jpeg?() click to toggle source

Determines if the page is a JPEG image.

@return [Boolean]

Specifies whether the page is a JPEG image.

@since 0.7.0

# File lib/spidr/page/content_types.rb, line 259
# Checks the page's content types for the JPEG media type.
def jpeg?
  jpeg_type = 'image/jpeg'
  is_content_type?(jpeg_type)
end
json?() click to toggle source

Determines if the page is JSON.

@return [Boolean]

Specifies whether the page is JSON.

@since 0.3.0

# File lib/spidr/page/content_types.rb, line 162
# Checks the page's content types for the JSON media type.
def json?
  json_type = 'application/json'
  is_content_type?(json_type)
end
mailtos() click to toggle source

`mailto:` links in the page.

@return [Array<String>]

The `mailto:` links found within the page.

@since 0.5.0

# File lib/spidr/page/html.rb, line 165
# Collects everything yielded by `each_mailto` into an Array.
def mailtos
  links = each_mailto
  links.to_a
end
meta_redirect() click to toggle source

The meta-redirect links of the page.

@return [Array<String>]

All meta-redirect links in the page.

@deprecated

Deprecated in 0.3.0 and will be removed in 0.4.0.
Use {#meta_redirects} instead.
# File lib/spidr/page/html.rb, line 87
# Deprecated alias for {#meta_redirects}.
#
# Prints two deprecation warnings and then returns the result of
# {#meta_redirects}. The warning names 0.4.0 as the removal version,
# matching the documented deprecation notice (deprecated in 0.3.0,
# removed in 0.4.0).
#
# @return [Array<String>]
#   All meta-redirect links in the page.
def meta_redirect
  warn 'DEPRECATION: Spidr::Page#meta_redirect will be removed in 0.4.0'
  warn 'DEPRECATION: Use Spidr::Page#meta_redirects instead'

  meta_redirects
end
meta_redirect?() click to toggle source

Returns a boolean indicating whether or not page-level meta redirects are present in this page.

@return [Boolean]

Specifies whether the page includes page-level redirects.
# File lib/spidr/page/html.rb, line 61
# True when `each_meta_redirect` yields at least one non-nil link.
def meta_redirect?
  first_link = each_meta_redirect.first
  !first_link.nil?
end
meta_redirects() click to toggle source

The meta-redirect links of the page.

@return [Array<String>]

All meta-redirect links in the page.

@since 0.3.0

# File lib/spidr/page/html.rb, line 73
# Collects everything yielded by `each_meta_redirect` into an Array.
def meta_redirects
  links = each_meta_redirect
  links.to_a
end
missing?()
Alias for: is_missing?
ms_word?() click to toggle source

Determines if the page is a MS Word document.

@return [Boolean]

Specifies whether the page is a MS Word document.
# File lib/spidr/page/content_types.rb, line 203
# Checks the page's content types for the MS Word media type.
def ms_word?
  word_type = 'application/msword'
  is_content_type?(word_type)
end
ok?()
Alias for: is_ok?
pdf?() click to toggle source

Determines if the page is a PDF document.

@return [Boolean]

Specifies whether the page is a PDF document.
# File lib/spidr/page/content_types.rb, line 213
# Checks the page's content types for the PDF media type.
def pdf?
  pdf_type = 'application/pdf'
  is_content_type?(pdf_type)
end
plain_text?() click to toggle source

Determines if the page is plain-text.

@return [Boolean]

Specifies whether the page is plain-text.
# File lib/spidr/page/content_types.rb, line 94
# Checks the page's content types for the plain-text media type.
def plain_text?
  text_type = 'text/plain'
  is_content_type?(text_type)
end
Also aliased as: txt?
png?() click to toggle source

Determines if the page is a PNG image.

@return [Boolean]

Specifies whether the page is a PNG image.

@since 0.7.0

# File lib/spidr/page/content_types.rb, line 235
# Checks the page's content types for the PNG media type.
def png?
  png_type = 'image/png'
  is_content_type?(png_type)
end
redirect?()
Alias for: is_redirect?
redirects_to() click to toggle source

URLs that this document redirects to.

@return [Array<String>]

The links that this page redirects to (usually found in a
location header or by way of a page-level meta redirect).
# File lib/spidr/page/html.rb, line 129
# Collects everything yielded by `each_redirect` into an Array.
def redirects_to
  targets = each_redirect
  targets.to_a
end
rss?() click to toggle source

Determines if the page is a RSS feed.

@return [Boolean]

Specifies whether the page is a RSS feed.
# File lib/spidr/page/content_types.rb, line 182
# Checks the page's content types for the RSS or RDF media type. The
# second type is only tested when the first does not match.
def rss?
  matched = is_content_type?('application/rss+xml')
  matched ||= is_content_type?('application/rdf+xml')
  matched
end
timedout?()
Alias for: is_timedout?
title() click to toggle source

The title of the HTML page.

@return [String]

The inner-text of the title element of the page.
# File lib/spidr/page/html.rb, line 17
# Returns the inner text of the first `//title` node, or nil when no
# such node is found.
def title
  node = at('//title')

  node.inner_text if node
end
to_absolute(link) click to toggle source

Normalizes and expands a given link into a proper URI.

@param [String] link

The link to normalize and expand.

@return [URI::HTTP]

The normalized URI.
# File lib/spidr/page/html.rb, line 267
# Merges the given link with the page URL and normalizes the path of
# the result. Returns nil when merging raises a URI::Error.
def to_absolute(link)
  merged = begin
             url.merge(link.to_s)
           rescue URI::Error
             return
           end

  # opaque URIs are returned untouched
  return merged if merged.opaque

  path = merged.path
  return merged unless path

  # ensure that paths begin with a leading '/' for URI::FTP
  path.insert(0,'/') if merged.scheme == 'ftp' && !path.start_with?('/')

  # URI::Generic#merge cannot normalize paths such as "/stuff/../",
  # so the "." and ".." directories are expanded separately
  merged.path = URI.expand_path(path)
  merged
end
to_s()
Alias for: body
txt?()
Alias for: plain_text?
unauthorized?()
Alias for: is_unauthorized?
urls() click to toggle source

Absolute URIs from within the page.

@return [Array<URI::HTTP>]

The links from within the page, converted to absolute URIs.
# File lib/spidr/page/html.rb, line 254
def urls
  each_url.to_a
end
xml?() click to toggle source

Determines if the page is an XML document.

@return [Boolean]

Specifies whether the page is an XML document.
# File lib/spidr/page/content_types.rb, line 128
# Checks the page's content types for either XML media type. The
# second type is only tested when the first does not match.
def xml?
  matched = is_content_type?('text/xml')
  matched ||= is_content_type?('application/xml')
  matched
end
xsl?() click to toggle source

Determines if the page is an XML Stylesheet (XSL).

@return [Boolean]

Specifies whether the page is an XML Stylesheet (XSL).
# File lib/spidr/page/content_types.rb, line 139
# Checks the page's content types for the XSL media type.
def xsl?
  xsl_type = 'text/xsl'
  is_content_type?(xsl_type)
end
zip?() click to toggle source

Determines if the page is a ZIP archive.

@return [Boolean]

Specifies whether the page is a ZIP archive.
# File lib/spidr/page/content_types.rb, line 223
# Checks the page's content types for the ZIP media type.
def zip?
  zip_type = 'application/zip'
  is_content_type?(zip_type)
end

Protected Instance Methods

method_missing(name,*arguments,&block) click to toggle source

Provides transparent access to the values in {#headers}.

@param [Symbol] name

The name of the missing method.

@param [Array] arguments

Additional arguments for the missing method.

@return [String]

The missing method mapped to a header in {#headers}.

@raise [NoMethodError]

The missing method did not map to a header in {#headers}.
Calls superclass method
# File lib/spidr/page.rb, line 136
# Provides transparent access to the response headers.
#
# A call without arguments or a block is mapped to a header name by
# replacing underscores with dashes (`content_length` becomes
# `content-length`). When the response has that header its value is
# returned; every other call is passed on to the superclass.
#
# @param [Symbol] name
#   The name of the missing method.
#
# @param [Array] arguments
#   Additional arguments for the missing method.
#
# @return [String]
#   The value of the header the missing method maps to.
#
# @raise [NoMethodError]
#   The missing method did not map to a header of the response.
def method_missing(name,*arguments,&block)
  if (arguments.empty? && block.nil?)
    header_name = name.to_s.tr('_','-')

    if @response.key?(header_name)
      return @response[header_name]
    end
  end

  return super(name,*arguments,&block)
end

# Keeps `respond_to?` consistent with {#method_missing}: a name that maps
# to a header present in the response is reported as supported. Ruby
# invokes this hook regardless of its visibility.
#
# @param [Symbol] name
#   The method name being queried.
#
# @param [Boolean] include_private
#   Whether private methods should be considered.
#
# @return [Boolean]
#   Whether the name maps to a response header, or the superclass
#   responds to it.
def respond_to_missing?(name,include_private=false)
  @response.key?(name.to_s.tr('_','-')) || super
end