module RelatonCen::Scrapper

Scrapper.

Constants

COMMITTEES

Public Class Methods

parse_page(hit) click to toggle source

Parse page. @param hit [RelatonCen::Hit] @return [RelatonBib::BibliographicItem]

# File lib/relaton_cen/scrapper.rb, line 15
# Download the page a search hit points to and turn it into a bibliographic
# item.
#
# @param hit [RelatonCen::Hit] search hit; its `hit` hash is read for `:url`
#   and `:code`, and its collection's agent performs the request
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse_page(hit) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  agent = hit.hit_collection.agent
  doc = agent.get(hit.hit[:url])
  # Collected in a hash literal so the fetchers run in a fixed, readable order.
  attributes = {
    fetched: Date.today.to_s,
    type: "standard",
    docid: fetch_docid(hit.hit[:code]),
    language: ["en"],
    script: ["Latn"],
    title: fetch_titles(doc),
    doctype: "international-standard",
    docstatus: fetch_status(doc),
    ics: fetch_ics(doc),
    date: fetch_dates(doc),
    # contributor: fetch_contributors(doc),
    editorialgroup: fetch_editorialgroup(doc),
    structuredidentifier: fetch_structuredid(hit.hit),
    abstract: fetch_abstract(doc),
    copyright: fetch_copyright(doc),
    link: fetch_link(doc.uri.to_s),
    relation: fetch_relations(doc),
    place: ["London"],
  }
  RelatonIsoBib::IsoBibliographicItem.new(**attributes)
end

Private Class Methods

fetch_abstract(doc) click to toggle source

Fetch abstracts. @param doc [Mechanize::Page] @return [Array<Hash>]

# File lib/relaton_cen/scrapper.rb, line 52
# Fetch the abstract from the "Abstract/Scope" row of the page.
#
# @param doc [Mechanize::Page]
# @return [Array<Hash>] a single hash with `:content`, `:language` and
#   `:script`, or an empty array when the page has no "Abstract/Scope" row
def fetch_abstract(doc)
  content = doc.at("//tr[th[.='Abstract/Scope']]/td")
  # `at` yields nil when the row is absent; bail out instead of calling
  # #text on nil.
  return [] unless content

  [{ content: content.text, language: "en", script: "Latn" }]
end
fetch_dates(doc) click to toggle source

Fetch dates. @param doc [Mechanize::Page] @return [Array<Hash>]

# File lib/relaton_cen/scrapper.rb, line 143
# Collect the dates listed in the implementation-dates table of the page.
# Rows whose value cell is empty are skipped.
#
# @param doc [Mechanize::Page]
# @return [Array<Hash>] hashes with `:type` and `:on` keys
def fetch_dates(doc)
  # Checked in order; the first pattern found in the row label wins.
  stages = [
    [/DOR/, "adapted"],
    [/DAV/, "issued"],
    [/DOA/, "announced"],
    [/DOP/, "published"],
    [/DOW/, "obsoleted"],
  ]
  rows = doc.xpath("//div[@id='DASHBOARD_LISTIMPLEMENTATIONDATES']/table/tr")
  found = rows.map do |row|
    value = row.at("td").text
    next if value.empty?

    label = row.at("th").text
    _pattern, kind = stages.find { |pattern, _| pattern.match?(label) }
    # Labels without a known abbreviation are used as-is, lower-cased.
    { type: kind || label.downcase, on: value }
  end
  found.compact
end
fetch_docid(ref) click to toggle source

Fetch docid. @param ref [String] @return [Array<RelatonBib::DocumentIdentifier>]

# File lib/relaton_cen/scrapper.rb, line 60
# Wrap a document reference in a one-element list of "CEN" identifiers.
#
# @param ref [String] document reference used as the identifier
# @return [Array<RelatonBib::DocumentIdentifier>]
def fetch_docid(ref)
  identifier = RelatonBib::DocumentIdentifier.new(id: ref, type: "CEN")
  [identifier]
end
fetch_editorialgroup(doc) click to toggle source

Fetch workgroup. @param doc [Mechanize::Page] @return [RelatonIsoBib::EditorialGroup]

# File lib/relaton_cen/scrapper.rb, line 77
# Build the editorial group from the committee heading of the page.
#
# Every COMMITTEES key contained in the heading code becomes a technical
# committee. The group named by the heading itself is a subcommittee when
# at least one such committee was found, and the sole technical committee
# otherwise.
#
# @param doc [Mechanize::Page]
# @return [RelatonIsoBib::EditorialGroup]
def fetch_editorialgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  heading = doc.at("//tr/td/h1/text()").text
  group_name = doc.at("//tr/td[3]/h1").text
  # Last "/TYPE NUMBER" segment of the heading; NUMBER is optional.
  tail = %r{/(?<type>\w+)(?:\s(?<num>[^/]+))?$}.match(heading)
  group_type, group_number = tail ? [tail[:type], tail[:num]] : []

  known = COMMITTEES.each_with_object([]) do |(key, name), groups|
    next unless heading.include? key

    kind, number = key.split
    groups << RelatonBib::WorkGroup.new(name: name, type: kind, number: number)
  end

  own = RelatonBib::WorkGroup.new(
    name: group_name, type: group_type, number: group_number,
  )
  committees, subcommittees = known.empty? ? [[own], []] : [known, [own]]
  RelatonIsoBib::EditorialGroup.new(technical_committee: committees,
                                    subcommittee: subcommittees)
end
fetch_ics(doc) click to toggle source

Fetch ICS. @param doc [Mechanize::Page] @return [Array<RelatonIsoBib::Ics>]

# File lib/relaton_cen/scrapper.rb, line 43
# Build ICS entries from the text nodes of the "ICS" row.
#
# @param doc [Mechanize::Page]
# @return [Array<RelatonIsoBib::Ics>]
def fetch_ics(doc)
  entries = doc.xpath("//tr[th[.='ICS']]/td/text()")
  # Only the first run of non-whitespace characters of each node is used;
  # a node without any yields an empty string.
  entries.map { |entry| RelatonIsoBib::Ics.new(entry.text[/\S+/].to_s) }
end
fetch_relations(doc) click to toggle source

Fetch relations. @param doc [Mechanize::Page] @return [Array<Hash>]

# File lib/relaton_cen/scrapper.rb, line 110
# Collect related documents from the relations table, ignoring the
# "Sales Points" row.
#
# @param doc [Mechanize::Page]
# @return [Array<Hash>] one hash per linked document, with `:type` and
#   `:bibitem` keys
def fetch_relations(doc) # rubocop:disable Metrics/MethodLength
  rows = doc.xpath(
    "//div[@id='DASHBOARD_LISTRELATIONS']/table/tr[th[.!='Sales Points']]",
  )
  rows.flat_map do |row|
    label = row.at("th").text
    kind = if label == "Supersedes" then "obsoletes"
           elsif /Normative reference/.match?(label) then "cites"
           else label.downcase
           end
    # Every anchor in the row's cells is one related document.
    row.xpath("td/a").map do |anchor|
      ref = RelatonBib::FormattedRef.new(content: anchor.text, language: "en",
                                         script: "Latn")
      url = fetch_link(HitCollection::DOMAIN + anchor[:href])
      item = RelatonBib::BibliographicItem.new(
        formattedref: ref, type: "standard", link: url,
      )
      { type: kind, bibitem: item }
    end
  end
end
fetch_status(doc) click to toggle source

Fetch status. @param doc [Mechanize::Page] @return [RelatonBib::DocumentStatus, NilClass]

# File lib/relaton_cen/scrapper.rb, line 67
# Read the document status from the "Status" row of the page.
#
# @param doc [Mechanize::Page]
# @return [RelatonBib::DocumentStatus, NilClass] nil when the page has no
#   "Status" row
def fetch_status(doc)
  cell = doc.at("//tr[th[.='Status']]/td")
  RelatonBib::DocumentStatus.new(stage: cell.text.strip) if cell
end
fetch_structuredid(hit) click to toggle source

Fetch structured identifier. @param hit [Hash] hit data with the document code under the :code key @return [RelatonIsoBib::StructuredIdentifier]

# File lib/relaton_cen/scrapper.rb, line 100
# Derive the structured identifier from the digits of the document code.
#
# @param hit [Hash] hit data; the document code is read from `:code`
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredid(hit)
  # First digit run is the project number; up to two "-digits" suffixes
  # follow as part and subpart.
  pattern = %r{(?<pnum>\d+)(?:-(?<part>\d+))?(?:-(?<subpart>\d+))?}
  found = pattern.match(hit[:code])
  number, part_no, subpart_no = found ? found.captures : []
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: number, part: part_no, subpart: subpart_no, type: "CEN",
  )
end
fetch_titles(doc) click to toggle source

Fetch titles. @param doc [Mechanize::Page] @return [RelatonBib::TypedTitleStringCollection]

# File lib/relaton_cen/scrapper.rb, line 135
# Build the title collection from the "Title" row of the page.
#
# @param doc [Mechanize::Page]
# @return [RelatonBib::TypedTitleStringCollection]
def fetch_titles(doc)
  cell = doc.at("//tr[th[.='Title']]/td")
  RelatonBib::TypedTitleString.from_string(cell.text.strip, "en", "Latn")
end
owner_entity() click to toggle source

@return [Hash]

# File lib/relaton_cen/scrapper.rb, line 189
# Describe the organization used as the owner entity.
#
# @return [Hash] hash with `:abbreviation`, `:name` and `:url` keys
def owner_entity
  entity = { abbreviation: "CEN" }
  entity[:name] = "European Committee for Standardization"
  entity[:url] = "https://cen.eu"
  entity
end