class RelatonOgc::DataFetcher

Public Class Methods

fetch(output: "data", format: "yaml") click to toggle source
# File lib/relaton_ogc/data_fetcher.rb, line 56
# Run a complete fetch and report timing on standard output.
#
# Ensures the output directory exists, builds a fetcher instance with the
# given arguments and calls its #fetch. Start/stop timestamps and the
# elapsed number of seconds are printed with +puts+.
#
# @param output [String] directory to save the documents (default "data")
# @param format [String] output format passed to the new instance (default "yaml")
# @return [nil]
def self.fetch(output: "data", format: "yaml")
  started_at = Time.now
  # The duration is measured with the monotonic clock so that wall-clock
  # adjustments during a long run cannot distort the reported elapsed time.
  clock_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Started at: #{started_at}"
  # mkdir_p is a no-op when the directory already exists, so no guard is needed.
  FileUtils.mkdir_p output
  new(output, format).fetch
  clock_stop = Process.clock_gettime(Process::CLOCK_MONOTONIC)
  puts "Stopped at: #{Time.now}"
  puts "Done in: #{(clock_stop - clock_start).round} sec."
end
new(output, format) click to toggle source

Create DataFetcher instance

@param [String] output directory to save the documents
@param [String] format output format “yaml” or “xml”

# File lib/relaton_ogc/data_fetcher.rb, line 48
# Create a DataFetcher instance.
#
# Stores the target directory and format, derives the path of the
# "etag.txt" file inside the output directory, and starts with empty
# lists of seen and duplicated document identifiers.
#
# @param output [String] directory to save the documents
# @param format [String] output format, "yaml" or "xml"
def initialize(output, format)
  @docids = []
  @dupids = []
  @format = format
  @output = output
  @etagfile = File.join(output, "etag.txt")
end

Public Instance Methods

fetch() click to toggle source
# File lib/relaton_ogc/data_fetcher.rb, line 66
# Parse and write every hit yielded by get_data.
#
# get_data yields an etag together with a collection of hits. Hits whose
# "type" is "CC" are skipped; every other hit is parsed with
# Scrapper.parse_page and passed to write_document. A StandardError raised
# for one hit is reported with +warn+ and does not stop the remaining hits.
# Afterwards the duplicated identifiers (if any) are reported, and the etag
# is assigned through +self.etag=+ only when no hit failed.
def fetch # rubocop:disable Metrics/MethodLength, Metrics/AbcSize
  get_data do |etag, json|
    failed = false
    json.each do |_key, hit|
      begin
        next if hit["type"] == "CC"

        write_document Scrapper.parse_page(hit)
      rescue StandardError => e
        # Remember the failure so the etag is not updated for a partial run.
        failed = true
        warn "Fetching document: #{hit['identifier']}"
        warn "#{e.class} #{e.message}"
        warn e.backtrace
      end
    end
    if @dupids.any?
      warn "[relaton-ogc] WARNING Duplicated documents: #{@dupids.uniq.join(', ')}"
    end
    self.etag = etag unless failed
  end
end
write_document(bib) click to toggle source
# File lib/relaton_ogc/data_fetcher.rb, line 85
# Serialize one bibliographic item into the output directory.
#
# The identifier of the item's first docidentifier decides the file name:
# it is upcased and every whitespace character, colon and dot is replaced
# with an underscore. An identifier that was already written is recorded in
# @dupids and nothing is written for it.
#
# @param bib [#docidentifier, #to_xml, #to_hash] item to serialize
# @return [Integer, nil] result of File.write, or nil for a duplicate
def write_document(bib) # rubocop:disable Metrics/AbcSize
  docid = bib.docidentifier[0].id
  if @docids.include?(docid)
    @dupids << docid
    return
  end

  @docids << docid
  basename = docid.upcase.gsub(/[\s:.]/, "_")
  # Only "xml" selects XML output; any other format value is written as YAML.
  content =
    if @format == "xml"
      bib.to_xml(bibdata: true)
    else
      bib.to_hash.to_yaml
    end
  File.write "#{@output}/#{basename}.#{@format}", content, encoding: "UTF-8"
end