module RelatonIetf::Scrapper

Scrapper module

Constants

BCP_URI_PATTERN
GH_URL

Public Class Methods

fetch_rfc(reference, is_relation: false, url: nil, ver: nil) click to toggle source

@param reference [Nokogiri::XML::Element, nil] @param is_relation [TrueClass, FalseClass] @param url [String, NilClass] @param ver [String, NilClass] Internet Draft version @return [RelatonIetf::IetfBibliographicItem]

# File lib/relaton_ietf/scrapper.rb, line 36
# Build a bibliographic item from a BibXML reference element.
#
# Every attribute is extracted by a dedicated helper of this module and the
# collected attributes are handed to {ietf_item}.
#
# @param reference [Nokogiri::XML::Element, nil] element to convert
# @param is_relation [TrueClass, FalseClass] forwarded to {ietf_item}
# @param url [String, NilClass] forwarded to the link helper
# @param ver [String, NilClass] Internet Draft version
# @return [RelatonIetf::IetfBibliographicItem, nil] nil when no reference
#   is given
def fetch_rfc(reference, is_relation: false, url: nil, ver: nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  return unless reference

  anchor = reference[:anchor]
  attrs = {
    is_relation: is_relation,
    id: anchor,
    type: "standard",
    docid: docids(reference, ver),
    status: status(reference),
    language: [language(reference)],
    link: link(reference, url, ver),
    title: titles(reference),
    formattedref: formattedref(reference),
    abstract: abstracts(reference),
    contributor: contributors(reference),
    relation: relations(reference),
    date: dates(reference),
    series: series(reference),
    place: ["Fremont, CA"],
    keyword: reference.xpath("front/keyword").map(&:text),
    doctype: doctype(anchor),
  }
  ietf_item(**attrs)
end
scrape_page(text, is_relation: false) click to toggle source

@param text [String] @param is_relation [TrueClass, FalseClass] @return [RelatonIetf::IetfBibliographicItem]

# File lib/relaton_ietf/scrapper.rb, line 19
# Normalize a textual reference and fetch the matching document.
#
# A leading "IETF " is removed and the number of an RFC/BCP/FYI/STD
# reference is left-padded with zeros to four digits before the lookup is
# delegated to {rfc_item}.
#
# @param text [String] reference code
# @param is_relation [TrueClass, FalseClass]
# @return [RelatonIetf::IetfBibliographicItem]
# @raise [RelatonBib::RequestError] when a network-level error occurs
def scrape_page(text, is_relation: false)
  code = text.gsub(/^IETF /, "")
  numbered = /^(?:RFC|BCP|FYI|STD)\s(?<num>\d+)/.match(code)
  if numbered
    padded = numbered[:num].rjust(4, "0")
    code.sub!(/(?<=^(?:RFC|BCP|FYI|STD)\s)(\d+)/, padded)
  end
  rfc_item(code, is_relation)
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, SocketError
  raise RelatonBib::RequestError, "No document found for #{code} reference"
end

Private Class Methods

abstracts(ref) click to toggle source

@param ref [Nokogiri::XML::Element] @return [Array<RelatonBib::FormattedString>]

# File lib/relaton_ietf/scrapper.rb, line 158
# Collect the abstracts found under ./front/abstract.
#
# @param ref [Nokogiri::XML::Element]
# @return [Array<RelatonBib::FormattedString>]
def abstracts(ref)
  ref.xpath("./front/abstract").map do |node|
    # NOTE(review): the pattern matches a literal backslash + "n" followed by
    # a literal backslash and 2-4 "t" characters, not newline/tab characters.
    # Confirm whether real control characters were intended.
    cleaned = node.text.gsub(/\\n\\t{2,4}/, " ").strip
    RelatonBib::FormattedString.new(
      content: cleaned, language: language(ref), script: "Latn",
    )
  end
end
add_contact(contacts, type, value) click to toggle source

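@param contacts [Array] list the new contact is appended to @param type [String] allowed “phone”, “email” or “uri” @param value [Nokogiri::XML::Element, nil] element whose text becomes the contact value; nothing is added when nil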

# File lib/relaton_ietf/scrapper.rb, line 253
# Append a contact built from +value+ to +contacts+.
#
# @param contacts [Array] list that receives the new contact
# @param type [String] allowed "phone", "email" or "uri"
# @param value [#text, nil] node whose text becomes the contact value
# @return [Array, nil] the contacts list, or nil when +value+ is missing
def add_contact(contacts, type, value)
  if value
    entry = RelatonBib::Contact.new(type: type, value: value.text)
    contacts << entry
  end
end
address(postal) click to toggle source

@param postal [Nokogiri::XML::Element] @return [RelatonBib::Address]

# File lib/relaton_ietf/scrapper.rb, line 241
# Build an address from a postal element.
#
# The street is taken from ./postalLine, falling back to ./street; any
# missing child yields nil for the corresponding attribute.
#
# @param postal [Nokogiri::XML::Element]
# @return [RelatonBib::Address]
def address(postal) # rubocop:disable Metrics/CyclomaticComplexity
  street_node = postal.at("./postalLine") || postal.at("./street")
  text_at = ->(path) { postal.at(path)&.text }
  RelatonBib::Address.new(
    street: [street_node&.text],
    city: text_at.call("./city"),
    postcode: text_at.call("./code"),
    country: text_at.call("./country"),
    state: text_at.call("./region"),
  )
end
affiliation(author) click to toggle source

@param author [Nokogiri::XML::Element] @return [RelatonBib::Affiliation]

# File lib/relaton_ietf/scrapper.rb, line 261
# Build the affiliation of an author.
#
# When the author has no ./organization child, or its text is empty, the
# organization created by {new_org} with its default arguments is used.
#
# @param author [Nokogiri::XML::Element]
# @return [RelatonBib::Affiliation]
def affiliation(author)
  node = author.at("./organization")
  unnamed = node.nil? || node.text&.empty?
  org = unnamed ? new_org : new_org(node.text, node[:abbrev])
  RelatonBib::Affiliation.new(organization: org)
end
contacts(addr) click to toggle source

@param addr [Nokogiri::XML::Element, nil] @return [Array<RelatonBib::Address, RelatonBib::Contact>]

# File lib/relaton_ietf/scrapper.rb, line 227
# Collect the contact information held by an address element.
#
# The postal address (if any) comes first, followed by the phone, email and
# uri contacts that are present.
#
# @param addr [Nokogiri::XML::Element, nil]
# @return [Array] empty when +addr+ is missing
def contacts(addr)
  return [] unless addr

  found = []
  postal = addr.at("./postal")
  found << address(postal) if postal
  %w[phone email uri].each do |kind|
    add_contact(found, kind, addr.at("./#{kind}"))
  end
  found
end
contributor_role(author) click to toggle source

@param author [Nokogiri::XML::Element] @return [Hash] role hash; the type defaults to "author"

# File lib/relaton_ietf/scrapper.rb, line 280
# Role of a contributor, read from the +role+ attribute.
#
# @param author [#[]] author element
# @return [Hash] +{ type: ... }+, with "author" when no role is set
def contributor_role(author)
  role = author[:role]
  { type: role || "author" }
end
contributors(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [Array<Hash>]

# File lib/relaton_ietf/scrapper.rb, line 169
# All contributors of a reference: persons first, then organizations.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def contributors(reference)
  [*persons(reference), *organizations(reference)]
end
dates(reference) click to toggle source

Extract date from reference.

@param reference [Nokogiri::XML::Element] @return [Array<RelatonBib::BibliographicDate>, nil] published date; nil when the reference has no front/date element.

# File lib/relaton_ietf/scrapper.rb, line 297
# Extract the publication date from ./front/date.
#
# Year, month (normalized by {month}) and day (1 when absent) are joined
# with "-" and parsed; the result is formatted as YYYY-MM-DD.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<RelatonBib::BibliographicDate>, nil] nil when the
#   reference has no date element
def dates(reference)
  node = reference.at("./front/date")
  return unless node

  parts = [node[:year], month(node[:month]), node[:day] || 1].compact
  published = Time.parse(parts.join("-")).strftime("%Y-%m-%d")
  [RelatonBib::BibliographicDate.new(type: "published", on: published)]
end
docids(reference, ver) click to toggle source

Extract document identifiers from reference

@param reference [Nokogiri::XML::Element] @param ver [String, NilClass] Internet Draft version

@return [Array<RelatonBib::DocumentIdentifier>]

# File lib/relaton_ietf/scrapper.rb, line 314
# Extract document identifiers from a reference.
#
# Produces, in order: an "IETF" identifier from the anchor, docName or
# number attribute (a space is inserted after a leading "RFC"), an
# "rfc-anchor" identifier when an anchor exists, and one identifier per
# DOI / Internet-Draft seriesInfo element.
#
# @param reference [Nokogiri::XML::Element]
# @param ver [String, NilClass] Internet Draft version; when given it
#   replaces the trailing two-digit version of Internet-Draft values
# @return [Array<RelatonBib::DocumentIdentifier>]
def docids(reference, ver) # rubocop:disable Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity,Metrics/AbcSize
  ids = []
  primary = reference[:anchor] || reference[:docName] || reference[:number]
  if primary
    ids << RelatonBib::DocumentIdentifier.new(
      type: "IETF", id: primary.sub(/^(RFC)/, "\\1 "),
    )
  end
  anchor = reference[:anchor]
  ids << RelatonBib::DocumentIdentifier.new(type: "rfc-anchor", id: anchor) if anchor

  reference.xpath("./seriesInfo").each do |si|
    name = si[:name]
    next unless ["DOI", "Internet-Draft"].include?(name)

    value = si[:value]
    value.sub!(/(?<=-)\d{2}$/, ver) if ver && name == "Internet-Draft"
    ids << RelatonBib::DocumentIdentifier.new(id: value, type: name)
  end
  ids
end
doctype(anchor) click to toggle source

@param anchor [String] @return [String]

# File lib/relaton_ietf/scrapper.rb, line 64
# Document type derived from the reference anchor.
#
# @param anchor [String, nil]
# @return [String] "internet-draft" when the anchor contains "I-D",
#   otherwise "rfc" (also for a nil anchor)
def doctype(anchor)
  return "internet-draft" if anchor&.include?("I-D")

  "rfc"
end
formattedref(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [RelatonBib::FormattedRef, nil]

# File lib/relaton_ietf/scrapper.rb, line 145
# Build a formatted reference for documents that have no title.
#
# The content is the anchor, docName or number attribute, whichever is
# present first. Nothing is produced when the reference has a
# ./front/title element or none of those attributes.
#
# @param reference [Nokogiri::XML::Element]
# @return [RelatonBib::FormattedRef, nil]
def formattedref(reference)
  # The element is named "front"; the misspelled "./fornt/title" never
  # matched, so the guard had no effect.
  return if reference.at "./front/title"

  cont = (reference[:anchor] || reference[:docName] || reference[:number])
  return unless cont

  RelatonBib::FormattedRef.new(
    content: cont, language: language(reference), script: "Latn",
  )
end
full_name(author, ref) click to toggle source

@param author [Nokogiri::XML::Element] @param ref [Nokogiri::XML::Element] @return [RelatonBib::FullName]

# File lib/relaton_ietf/scrapper.rb, line 207
# Build the full name of an author from its fullname, initials and surname
# attributes, localized with the language of the reference.
#
# @param author [Nokogiri::XML::Element]
# @param ref [Nokogiri::XML::Element]
# @return [RelatonBib::FullName]
def full_name(author, ref)
  lang = language(ref)
  localized = ->(attr) { localized_string(author[attr], lang) }
  RelatonBib::FullName.new(
    completename: localized.call(:fullname),
    initial: [localized.call(:initials)].compact,
    surname: localized.call(:surname),
  )
end
get_page(uri) click to toggle source

@param uri [String] @return [String] HTTP response body

# File lib/relaton_ietf/scrapper.rb, line 119
# Fetch a page over HTTP.
#
# @param uri [String]
# @return [String, nil] response body, or nil when the response code is
#   not "200"
def get_page(uri)
  response = Net::HTTP.get_response(URI(uri))
  response.body if response.code == "200"
end
ietf_item(**attrs) click to toggle source

@param attrs [Hash] @return [RelatonIetf::IetfBibliographicItem]

# File lib/relaton_ietf/scrapper.rb, line 87
# Instantiate a bibliographic item from the given attributes.
#
# The :is_relation key is removed from the attributes; unless it was
# truthy, :fetched is set to today's date. :script is always ["Latn"].
#
# @param attrs [Hash]
# @return [RelatonIetf::IetfBibliographicItem]
def ietf_item(**attrs)
  relation = attrs.delete(:is_relation)
  attrs[:fetched] = Date.today.to_s unless relation
  attrs[:script] = ["Latn"]
  RelatonIetf::IetfBibliographicItem.new(**attrs)
end
language(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [String]

# File lib/relaton_ietf/scrapper.rb, line 131
# Language of a reference, read from its +lang+ attribute.
#
# @param reference [#[]] reference element
# @return [String] the attribute value, or "en" when it is not set
def language(reference)
  lang = reference[:lang]
  lang || "en"
end
localized_string(content, lang) click to toggle source

@param content [String] @param lang [String] @return [RelatonBib::LocalizedString]

# File lib/relaton_ietf/scrapper.rb, line 219
# Wrap content in a localized string.
#
# @param content [String, nil]
# @param lang [String]
# @return [RelatonBib::LocalizedString, nil] nil when there is no content
def localized_string(content, lang)
  RelatonBib::LocalizedString.new(content, lang) if content
end
month(mon) click to toggle source
# File lib/relaton_ietf/scrapper.rb, line 284
# Normalize a month attribute.
#
# A missing or empty value yields 1 and an all-digit value is returned
# unchanged. An exact English month name yields its index in
# Date::MONTHNAMES. Otherwise the value is stripped, down-cased and a
# trailing "." is dropped, and the first month name starting with those
# letters (at least three) is used, so "Sept", "jan" or "Dec." resolve too.
#
# @param mon [String, nil]
# @return [Integer, String, nil] nil when the name is not recognized
def month(mon)
  return 1 if !mon || mon.empty?
  return mon if /^\d+$/.match? mon

  exact = Date::MONTHNAMES.index(mon)
  return exact if exact

  name = mon.strip.downcase.chomp(".")
  # Shorter prefixes are ambiguous ("ma", "ju"), so they stay unrecognized.
  return if name.length < 3

  Date::MONTHNAMES.index { |full| full&.downcase&.start_with?(name) }
end
new_org(name = "Internet Engineering Task Force", abbr = "IETF") click to toggle source

@param name [String] @param abbr [String] @return [RelatonBib::Organization]

# File lib/relaton_ietf/scrapper.rb, line 274
# Create an organization.
#
# @param name [String]
# @param abbr [String, nil]
# @return [RelatonBib::Organization]
def new_org(name = "Internet Engineering Task Force", abbr = "IETF")
  org_attrs = { name: name, abbreviation: abbr }
  RelatonBib::Organization.new(**org_attrs)
end
organizations(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [Array<Hash{Symbol=>RelatonBib::Organization,

Symbol=>Array<String>}>]
# File lib/relaton_ietf/scrapper.rb, line 190
# Collect the organizations contributing to a reference.
#
# The result starts with the publisher (the default organization of
# {new_org}), followed by an author entry for every seriesInfo element that
# has a +stream+ attribute and for every organization of an author element
# that has neither a surname nor a fullname.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash{Symbol=>RelatonBib::Organization,Symbol=>Array<Hash>}>]
def organizations(reference)
  publisher = { entity: new_org, role: [type: "publisher"] }
  # XPath name tests are case-sensitive and the other helpers of this module
  # query "seriesInfo"; the lower-case spelling is kept as an alternative so
  # anything that matched before still matches.
  streams = reference.xpath("./seriesInfo|./seriesinfo")
    .map { |si| si[:stream] }.compact
  stream_orgs = streams.map do |stream|
    { entity: new_org(stream, nil), role: [type: "author"] }
  end
  author_orgs = reference.xpath(
    "front/author[not(@surname)][not(@fullname)]/organization",
  ).map do |org|
    { entity: new_org(org.text, nil), role: [type: "author"] }
  end
  [publisher] + stream_orgs + author_orgs
end
persons(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [Array<Hash{Symbol=>RelatonBib::Person,Symbol=>Array<String>}>]

# File lib/relaton_ietf/scrapper.rb, line 175
# Collect the person contributors: every front/author element that has a
# surname or a fullname attribute.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash{Symbol=>RelatonBib::Person,Symbol=>Array<Hash>}>]
def persons(reference)
  authors = reference.xpath(
    "./front/author[@surname]|./front/author[@fullname]",
  )
  authors.map do |author|
    person = RelatonBib::Person.new(
      name: full_name(author, reference),
      affiliation: [affiliation(author)],
      contact: contacts(author.at("./address")),
    )
    { entity: person, role: [contributor_role(author)] }
  end
end
relations(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [Array<Hash>] one "includes" relation per nested reference element

# File lib/relaton_ietf/scrapper.rb, line 111
# Build an "includes" relation for every nested reference element.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def relations(reference)
  reference.xpath("reference").map do |child|
    item = fetch_rfc(child, is_relation: true)
    { type: "includes", bibitem: item }
  end
end
rfc_item(ref, is_relation) click to toggle source

@param ref [String] @param is_relation [Boolean, nil] @return [RelatonIetf::IetfBibliographicItem]

# File lib/relaton_ietf/scrapper.rb, line 96
# Download the BibXML document for a reference code and convert it.
#
# For Internet-Draft codes (starting with "I-D") the trailing two-digit
# version and the "draft-" prefix after "I-D." are removed before the URL is
# built; the version is passed on to {fetch_rfc}. Note that +ref+ is
# modified in place.
#
# @param ref [String] reference code
# @param is_relation [Boolean, nil]
# @return [RelatonIetf::IetfBibliographicItem, nil]
def rfc_item(ref, is_relation)
  /(?<=-)(?<ver>\d{2})$/ =~ ref
  if /^I-D/.match? ref
    # Anchored to the end so only the version suffix is stripped; an
    # unanchored pattern removes the first "-NN" in the name instead
    # (e.g. the "-16" of "draft-ietf-16ng-...-07").
    ref.sub!(/-\d{2}$/, "") if ver
    ref.sub!(/(?<=I-D\.)draft-/, "")
  end

  uri = "#{GH_URL}#{ref.sub(/\s|\u00a0/, '.')}.xml"
  doc = Nokogiri::XML(get_page(uri))
  r = doc.at("/referencegroup", "/reference")
  fetch_rfc r, is_relation: is_relation, url: uri, ver: ver
end
series(reference) click to toggle source

Extract series from reference @param reference [Nokogiri::XML::Element]

@return [Array<RelatonBib::Series>]

# File lib/relaton_ietf/scrapper.rb, line 340
# Extract series from a reference.
#
# seriesInfo elements named "DOI", or carrying a +stream+ or +status+
# attribute, are skipped; every other one becomes a "main" series.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<RelatonBib::Series>]
def series(reference)
  kept = reference.xpath("./seriesInfo").reject do |si|
    si[:name] == "DOI" || si[:stream] || si[:status]
  end
  kept.map do |si|
    title = RelatonBib::TypedTitleString.new(
      content: si[:name], language: language(reference), script: "Latn",
    )
    RelatonBib::Series.new(title: title, number: si[:value], type: "main")
  end
end
status(reference) click to toggle source

extract status @param reference [Nokogiri::XML::Element]

@return [RelatonBib::DocumentStatus]

# File lib/relaton_ietf/scrapper.rb, line 360
# Extract the document status from the first seriesInfo element that has a
# +status+ attribute.
#
# @param reference [Nokogiri::XML::Element]
# @return [RelatonBib::DocumentStatus, nil] nil when no such element exists
def status(reference)
  # XPath name tests are case-sensitive and the other helpers of this module
  # query "seriesInfo"; the lower-case spelling is kept as an alternative so
  # anything that matched before still matches.
  st = reference.at("./seriesInfo[@status]|./seriesinfo[@status]")
  return unless st

  RelatonBib::DocumentStatus.new(stage: st[:status])
end
titles(reference) click to toggle source

@param reference [Nokogiri::XML::Element] @return [Array<Hash>]

# File lib/relaton_ietf/scrapper.rb, line 137
# Collect the titles found under ./front/title.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>] hashes with :content, :language and :script keys
def titles(reference)
  reference.xpath("./front/title").map do |node|
    {
      content: node.text,
      language: language(reference),
      script: "Latn",
    }
  end
end