module RelatonIetf::Scrapper
Scrapper
module
Constants
- BCP_URI_PATTERN
- GH_URL
Public Class Methods
@param reference [Nokogiri::XML::Element, nil] @param is_relation [TrueClass, FalseClass] @param url [String, NilClass] @param ver [String, NilClass] Internet Draft version @return [RelatonIetf::IetfBibliographicItem]
# File lib/relaton_ietf/scrapper.rb, line 36
# Convert a parsed reference node into a bibliographic item by collecting
# every attribute through the helper extractors and handing them to ietf_item.
#
# @param reference [Nokogiri::XML::Element, nil] node to convert
# @param is_relation [TrueClass, FalseClass] forwarded to ietf_item
# @param url [String, NilClass] forwarded to link
# @param ver [String, NilClass] Internet Draft version, forwarded to docids and link
# @return [RelatonIetf::IetfBibliographicItem, nil] nil when no reference is given
def fetch_rfc(reference, is_relation: false, url: nil, ver: nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  return unless reference

  anchor = reference[:anchor]
  attrs = {
    is_relation: is_relation,
    id: anchor,
    type: "standard",
    docid: docids(reference, ver),
    status: status(reference),
    language: [language(reference)],
    link: link(reference, url, ver),
    title: titles(reference),
    formattedref: formattedref(reference),
    abstract: abstracts(reference),
    contributor: contributors(reference),
    relation: relations(reference),
    date: dates(reference),
    series: series(reference),
    place: ["Fremont, CA"],
    keyword: reference.xpath("front/keyword").map(&:text),
    doctype: doctype(anchor),
  }
  ietf_item(**attrs)
end
@param text [String] @param is_relation [TrueClass, FalseClass] @return [RelatonIetf::IetfBibliographicItem]
# File lib/relaton_ietf/scrapper.rb, line 19
# Normalise a textual reference and fetch the matching document.
#
# A leading "IETF " is dropped and the number following RFC/BCP/FYI/STD is
# left-padded with zeros to four digits before the lookup is delegated to
# rfc_item. Network-level failures are re-raised as RelatonBib::RequestError.
#
# @param text [String] reference such as "IETF RFC 791"
# @param is_relation [TrueClass, FalseClass] forwarded to rfc_item
# @return [RelatonIetf::IetfBibliographicItem]
# @raise [RelatonBib::RequestError] when one of the listed network errors occurs
def scrape_page(text, is_relation: false)
  ref = text.gsub(/^IETF /, "")
  series_match = /^(?:RFC|BCP|FYI|STD)\s(?<num>\d+)/.match(ref)
  if series_match
    padded = series_match[:num].rjust(4, "0")
    ref.sub!(/(?<=^(?:RFC|BCP|FYI|STD)\s)(\d+)/, padded)
  end
  rfc_item(ref, is_relation)
rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
       Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
       Net::ProtocolError, SocketError
  raise RelatonBib::RequestError, "No document found for #{ref} reference"
end
Private Class Methods
@param reference [Nokogiri::XML::Element] @return [Array<RelatonBib::FormattedString>]
# File lib/relaton_ietf/scrapper.rb, line 158
# Collect every ./front/abstract node as a formatted string.
#
# NOTE(review): the pattern below matches a literal backslash + "n" followed
# by a backslash and 2-4 "t" characters, not real newline/tab characters.
# Confirm that this is what the source data needs.
#
# @param ref [Nokogiri::XML::Element]
# @return [Array<RelatonBib::FormattedString>]
def abstracts(ref)
  abstract_nodes = ref.xpath("./front/abstract")
  abstract_nodes.map do |node|
    cleaned = node.text.gsub(/\\n\\t{2,4}/, " ").strip
    RelatonBib::FormattedString.new(
      content: cleaned, language: language(ref), script: "Latn"
    )
  end
end
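@param contacts [Array] list the new contact is appended to @param type [String] allowed “phone”, “email” or “uri” @param value [Nokogiri::XML::Element, nil] node whose text becomes the contact value; nothing is added when it is nil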
# File lib/relaton_ietf/scrapper.rb, line 253
# Append a contact built from the node's text to the given list.
#
# @param contacts [Array] list that receives the new contact
# @param type [String] contact type passed to RelatonBib::Contact
# @param value [#text, nil] node providing the value; nil is ignored
# @return [Array, nil] the list after appending, or nil when value is nil
def add_contact(contacts, type, value)
  if value
    contact = RelatonBib::Contact.new(type: type, value: value.text)
    contacts.push(contact)
  end
end
@param postal [Nokogiri::XML::Element] @return [RelatonBib::Address]
# File lib/relaton_ietf/scrapper.rb, line 241
# Build an address from a postal node.
#
# The street is taken from ./postalLine, falling back to ./street. When
# neither element exists the street list is empty rather than [nil], so
# consumers never see a nil entry among the street lines.
#
# @param postal [Nokogiri::XML::Element]
# @return [RelatonBib::Address]
def address(postal) # rubocop:disable Metrics/CyclomaticComplexity
  street = (postal.at("./postalLine") || postal.at("./street"))&.text
  RelatonBib::Address.new(
    street: [street].compact,
    city: postal.at("./city")&.text,
    postcode: postal.at("./code")&.text,
    country: postal.at("./country")&.text,
    state: postal.at("./region")&.text,
  )
end
@param author [Nokogiri::XML::Element] @return [RelatonBib::Affiliation]
# File lib/relaton_ietf/scrapper.rb, line 261
# Wrap the author's organization in an affiliation. When the author has no
# ./organization element, or its text is empty, new_org is called without
# arguments so its defaults apply.
#
# @param author [Nokogiri::XML::Element]
# @return [RelatonBib::Affiliation]
def affiliation(author)
  org_node = author.at("./organization")
  use_default = org_node.nil? || org_node.text&.empty?
  org = use_default ? new_org : new_org(org_node.text, org_node[:abbrev])
  RelatonBib::Affiliation.new(organization: org)
end
@param addr [Nokogiri::XML::Element, nil] address element @return [Array<RelatonBib::Address, RelatonBib::Contact>]
# File lib/relaton_ietf/scrapper.rb, line 227
# Gather the postal address and the phone, email and uri contacts found
# under an address node, in that order.
#
# @param addr [Nokogiri::XML::Element, nil]
# @return [Array] empty when addr is nil
def contacts(addr)
  return [] unless addr

  found = []
  postal = addr.at("./postal")
  found << address(postal) if postal
  [["phone", "./phone"], ["email", "./email"], ["uri", "./uri"]].each do |kind, path|
    add_contact(found, kind, addr.at(path))
  end
  found
end
@param author [Nokogiri::XML::Document] @return [Hash]
# File lib/relaton_ietf/scrapper.rb, line 280
# Read the author's role attribute, defaulting to "author".
#
# @param author [#[]] author node
# @return [Hash] hash with a single :type key
def contributor_role(author)
  role = author[:role]
  role = "author" unless role
  { type: role }
end
@param reference [Nokogiri::XML::Element] @return [Array<Hash>]
# File lib/relaton_ietf/scrapper.rb, line 169
# All contributors of a reference: persons first, then organizations.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def contributors(reference)
  people = persons(reference)
  orgs = organizations(reference)
  people + orgs
end
Extract date from reference.
@param reference [Nokogiri::XML::Element] @return [Array<RelatonBib::BibliographicDate>, nil] published date, or nil when the reference has no date element.
# File lib/relaton_ietf/scrapper.rb, line 297
# Extract the publication date from ./front/date.
#
# The date is assembled as "year-month-day". Missing month or day default
# to 1. Previously nil parts were simply dropped, so a missing year or an
# unrecognised month name shifted the remaining parts left and the month
# or day was parsed in the wrong position. Now a date without a year
# yields nil, and an unresolved month falls back to 1.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<RelatonBib::BibliographicDate>, nil] nil when there is no
#   date element or it carries no year
def dates(reference)
  date = reference.at("./front/date")
  return unless date

  year = date[:year]
  return if year.nil? || year.to_s.strip.empty?

  parts = [year, month(date[:month]) || 1, date[:day] || 1]
  on = Time.parse(parts.join("-")).strftime("%Y-%m-%d")
  [RelatonBib::BibliographicDate.new(type: "published", on: on)]
end
Extract document identifiers from reference
@param reference [Nokogiri::XML::Element] @param ver [String, NilClass] Internet Draft version
@return [Array<RelatonBib::DocumentIdentifier>]
# File lib/relaton_ietf/scrapper.rb, line 314
# Build the document identifiers of a reference:
#   * an "IETF" id from the anchor, docName or number attribute (first one
#     present), with a space inserted after a leading "RFC";
#   * an "rfc-anchor" id when the anchor attribute is present;
#   * one id per ./seriesInfo element named "DOI" or "Internet-Draft"; for
#     Internet-Drafts a trailing two-digit version is replaced with ver
#     when ver is given.
#
# @param reference [Nokogiri::XML::Element]
# @param ver [String, NilClass] Internet Draft version
# @return [Array<RelatonBib::DocumentIdentifier>]
def docids(reference, ver) # rubocop:disable Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity,Metrics/AbcSize
  anchor = reference[:anchor]
  primary = anchor || reference[:docName] || reference[:number]

  ids = []
  if primary
    ids << RelatonBib::DocumentIdentifier.new(type: "IETF", id: primary.sub(/^(RFC)/, "\\1 "))
  end
  ids << RelatonBib::DocumentIdentifier.new(type: "rfc-anchor", id: anchor) if anchor

  reference.xpath("./seriesInfo").each do |si|
    name = si[:name]
    next unless name == "DOI" || name == "Internet-Draft"

    value = si[:value]
    value.sub!(/(?<=-)\d{2}$/, ver) if ver && name == "Internet-Draft"
    ids << RelatonBib::DocumentIdentifier.new(id: value, type: name)
  end
  ids
end
@param anchor [String] @return [String]
# File lib/relaton_ietf/scrapper.rb, line 64
# Classify a document by its anchor: anchors containing "I-D" are
# Internet-Drafts, everything else (including a nil anchor) is an RFC.
#
# @param anchor [String, nil]
# @return [String] "internet-draft" or "rfc"
def doctype(anchor)
  return "internet-draft" if anchor&.include?("I-D")

  "rfc"
end
@param reference [Nokogiri::XML::Element] @return [RelatonBib::FormattedRef, nil]
# File lib/relaton_ietf/scrapper.rb, line 145
# Build a formatted reference for documents that have no title.
#
# The guard previously queried "./fornt/title" (typo), which can never
# match, so a formatted reference was produced even for titled documents.
# It now checks "./front/title", the same path titles reads.
#
# @param reference [Nokogiri::XML::Element]
# @return [RelatonBib::FormattedRef, nil] nil when the reference has a title
#   or none of the anchor, docName and number attributes is present
def formattedref(reference)
  return if reference.at("./front/title")

  cont = reference[:anchor] || reference[:docName] || reference[:number]
  return unless cont

  RelatonBib::FormattedRef.new(
    content: cont, language: language(reference), script: "Latn",
  )
end
@param author [Nokogiri::XML::Element] @param ref [Nokogiri::XML::Element] @return [RelatonBib::FullName]
# File lib/relaton_ietf/scrapper.rb, line 207
# Build a person's full name from the author's fullname, initials and
# surname attributes, each localized with the reference language.
#
# @param author [Nokogiri::XML::Element]
# @param ref [Nokogiri::XML::Element]
# @return [RelatonBib::FullName]
def full_name(author, ref)
  lang = language(ref)
  complete = localized_string(author[:fullname], lang)
  initials = localized_string(author[:initials], lang)
  family = localized_string(author[:surname], lang)
  RelatonBib::FullName.new(
    completename: complete, initial: [initials].compact, surname: family,
  )
end
@param uri [String] @return [String] HTTP response body
# File lib/relaton_ietf/scrapper.rb, line 119
# Fetch a URI over HTTP and return the body of a 200 response. Any other
# status code yields nil rather than raising.
#
# @param uri [String]
# @return [String, nil] HTTP response body
def get_page(uri)
  response = Net::HTTP.get_response(URI(uri))
  response.body if response.code == "200"
end
@param attrs [Hash] @return [RelatonIetf::IetfBibliographicItem]
# File lib/relaton_ietf/scrapper.rb, line 87
# Instantiate the bibliographic item from the collected attributes.
#
# The :is_relation key is removed before construction; unless it was
# truthy, :fetched is set to today's date. :script is always ["Latn"].
#
# @param attrs [Hash]
# @return [RelatonIetf::IetfBibliographicItem]
def ietf_item(**attrs)
  relation = attrs.delete(:is_relation)
  attrs[:fetched] = Date.today.to_s unless relation
  attrs[:script] = ["Latn"]
  RelatonIetf::IetfBibliographicItem.new(**attrs)
end
@param reference [Nokogiri::XML::Element] @return [String]
# File lib/relaton_ietf/scrapper.rb, line 131
# Language of a reference: its lang attribute, or "en" when absent.
#
# @param reference [#[]] reference node
# @return [String]
def language(reference)
  lang = reference[:lang]
  lang ? lang : "en"
end
@param reference [Nokogiri::XML::Element] @param url [String] @param ver [String, NilClass] Internet Draft version @return [Array<Hash>]
# File lib/relaton_ietf/scrapper.rb, line 72
# Collect the links of a reference:
#   * an "xml" link for url, when given;
#   * a "src" link for the target attribute, when present;
#   * for anchors starting with "I-D", one link per format child, with the
#     two-digit version before the file extension replaced by ver when ver
#     is given.
#
# @param reference [Nokogiri::XML::Element]
# @param url [String, nil]
# @param ver [String, NilClass] Internet Draft version
# @return [Array<Hash>]
def link(reference, url, ver)
  links = []
  links << { type: "xml", content: url } if url

  target = reference[:target]
  links << { type: "src", content: target } if target

  return links unless /^I-D/.match?(reference[:anchor])

  reference.xpath("format").each do |fmt|
    content = fmt[:target]
    content = content.sub(/(?<=-)\d{2}(?=\.)/, ver) if ver
    links << { type: fmt[:type], content: content }
  end
  links
end
@param content [String] @param lang [String] @return [RelatonBib::LocalizedString]
# File lib/relaton_ietf/scrapper.rb, line 219
# Wrap content in a localized string; nil/false content yields nil.
#
# @param content [String, nil]
# @param lang [String]
# @return [RelatonBib::LocalizedString, nil]
def localized_string(content, lang)
  RelatonBib::LocalizedString.new(content, lang) if content
end
# File lib/relaton_ietf/scrapper.rb, line 284
# Resolve the month attribute of a date element.
#
# A missing or empty value defaults to 1 and a purely numeric value is
# returned unchanged (as the String given). Month names are looked up in
# Date::MONTHNAMES; the lookup now also ignores case and surrounding
# whitespace and accepts abbreviated names ("Mar", "Sep."), which
# previously resolved to nil.
#
# @param mon [String, nil]
# @return [Integer, String, nil] nil when the name is not recognised
def month(mon)
  return 1 if !mon || mon.empty?
  return mon if /^\d+$/.match? mon

  name = mon.strip.chomp(".").capitalize
  Date::MONTHNAMES.index(name) || Date::ABBR_MONTHNAMES.index(name)
end
@param name [String] @param abbr [String] @return [RelatonBib::Organization]
# File lib/relaton_ietf/scrapper.rb, line 274
# Create an organization; with no arguments it is the IETF itself.
#
# @param name [String]
# @param abbr [String, nil]
# @return [RelatonBib::Organization]
def new_org(name = "Internet Engineering Task Force", abbr = "IETF")
  attrs = { name: name, abbreviation: abbr }
  RelatonBib::Organization.new(**attrs)
end
@param reference [Nokogiri::XML::Element] @return [Array<Hash{Symbol=>RelatonBib::Organization,
Symbol=>Array<String>}>]
# File lib/relaton_ietf/scrapper.rb, line 190
# Organizations contributing to a reference: the default organization as
# publisher, one author per seriesInfo element carrying a stream attribute,
# and one author per organization of authors that have neither a surname
# nor a fullname attribute.
#
# XPath name tests are case-sensitive. The stream lookup used to query only
# "./seriesinfo" while docids and series query "./seriesInfo", so stream
# authors were never found for camel-case elements. Both spellings are
# queried now; anything that matched before still matches.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash{Symbol=>RelatonBib::Organization,Symbol=>Array<Hash>}>]
def organizations(reference)
  orgs = [{ entity: new_org, role: [{ type: "publisher" }] }]

  reference.xpath("./seriesInfo|./seriesinfo").each do |si|
    stream = si[:stream]
    orgs << { entity: new_org(stream, nil), role: [{ type: "author" }] } if stream
  end

  author_orgs = reference.xpath(
    "front/author[not(@surname)][not(@fullname)]/organization",
  )
  orgs + author_orgs.map do |org|
    { entity: new_org(org.text, nil), role: [{ type: "author" }] }
  end
end
@param reference [Nokogiri::XML::Element] @return [Array<Hash{Symbol=>RelatonBib::Person,Symbol=>Array<String>}>]
# File lib/relaton_ietf/scrapper.rb, line 175
# Person contributors: every ./front/author that has a surname or fullname
# attribute, with name, a single affiliation and the contacts taken from
# its ./address child.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash{Symbol=>RelatonBib::Person,Symbol=>Array<Hash>}>]
def persons(reference)
  authors = reference.xpath("./front/author[@surname]|./front/author[@fullname]")
  authors.map do |author|
    name = full_name(author, reference)
    affiliations = [affiliation(author)]
    contact = contacts(author.at("./address"))
    person = RelatonBib::Person.new(
      name: name, affiliation: affiliations, contact: contact,
    )
    { entity: person, role: [contributor_role(author)] }
  end
end
@param reference [Nokogiri::XML::Element] @return [Array<Hash>]
# File lib/relaton_ietf/scrapper.rb, line 111
# Turn each nested reference child into an "includes" relation whose
# bibitem is built by fetch_rfc with is_relation set.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def relations(reference)
  reference.xpath("reference").map do |child|
    item = fetch_rfc(child, is_relation: true)
    { type: "includes", bibitem: item }
  end
end
@param ref [String] @param is_relation [Boolean, nil] @return [RelatonIetf::IetfBibliographicItem]
# File lib/relaton_ietf/scrapper.rb, line 96
# Download the XML for a reference and convert it with fetch_rfc.
#
# For Internet-Draft references ("I-D..."), a trailing two-digit version is
# captured, stripped from the name and passed on as ver, and a "draft-"
# prefix following "I-D." is dropped. The first whitespace (or
# non-breaking space) in the reference is replaced with "." to form the
# file name under GH_URL.
#
# The version strip is anchored to the end of the string. The previous
# unanchored pattern removed the first "-NN" anywhere in the name, which
# corrupted draft names containing a hyphen followed by two digits (for
# example "...-464xlat-...-03").
#
# ref is modified in place, as before.
#
# @param ref [String]
# @param is_relation [Boolean, nil]
# @return [RelatonIetf::IetfBibliographicItem, nil]
def rfc_item(ref, is_relation)
  /(?<=-)(?<ver>\d{2})$/ =~ ref
  if /^I-D/.match? ref
    ref.sub!(/-\d{2}$/, "") if ver
    ref.sub!(/(?<=I-D\.)draft-/, "")
  end

  uri = "#{GH_URL}#{ref.sub(/\s|\u00a0/, '.')}.xml"
  doc = Nokogiri::XML(get_page(uri))
  node = doc.at("/referencegroup", "/reference")
  fetch_rfc(node, is_relation: is_relation, url: uri, ver: ver)
end
Extract series from reference @param reference [Nokogiri::XML::Element]
@return [Array<RelatonBib::Series>]
# File lib/relaton_ietf/scrapper.rb, line 340
# Series entries of a reference: one per ./seriesInfo element, skipping
# those named "DOI" and those carrying a stream or status attribute.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<RelatonBib::Series>]
def series(reference)
  reference.xpath("./seriesInfo").each_with_object([]) do |si, list|
    next if si[:name] == "DOI" || si[:stream] || si[:status]

    title = RelatonBib::TypedTitleString.new(
      content: si[:name], language: language(reference), script: "Latn",
    )
    list << RelatonBib::Series.new(title: title, number: si[:value], type: "main")
  end
end
extract status @param reference [Nokogiri::XML::Element]
@return [RelatonBib::DocumentStatus]
# File lib/relaton_ietf/scrapper.rb, line 360
# Extract the document status from the first seriesInfo element that has a
# status attribute.
#
# XPath name tests are case-sensitive. The lookup used to query only
# "./seriesinfo" while docids and series query "./seriesInfo", so the
# status was never found for camel-case elements. Both spellings are
# queried now; anything that matched before still matches.
#
# @param reference [Nokogiri::XML::Element]
# @return [RelatonBib::DocumentStatus, nil] nil when no status is present
def status(reference)
  st = reference.at("./seriesInfo[@status]|./seriesinfo[@status]")
  return unless st

  RelatonBib::DocumentStatus.new(stage: st[:status])
end
@param reference [Nokogiri::XML::Element] @return [Array<Hash>]
# File lib/relaton_ietf/scrapper.rb, line 137
# Titles of a reference: one hash per ./front/title element, tagged with
# the reference language and the "Latn" script.
#
# @param reference [Nokogiri::XML::Element]
# @return [Array<Hash>]
def titles(reference)
  title_nodes = reference.xpath("./front/title")
  title_nodes.each_with_object([]) do |node, list|
    list << { content: node.text, language: language(reference), script: "Latn" }
  end
end