class Cacofonix::CodeListExtractor

A utility class that processes the code list XSD from the ONIX spec and creates one file per code list, in TSV format by default (a Ruby hash format is also supported). The generated files are used by this library to make hashes of the code lists available to users.

Constants

FORMATS

Public Class Methods

new(filename, format = :tsv) click to toggle source

Creates a new extractor. Expects the path to a copy of the code lists file from the spec (called ONIX_BookProduct_CodeLists.xsd on my system).

# File lib/cacofonix/utils/code_list_extractor.rb, line 16
# Builds an extractor for the code list XSD at +filename+.
#
# filename - path to the code lists file; must be an existing regular file.
# format   - output format, checked against FORMATS (defaults to :tsv).
#
# Raises ArgumentError when +filename+ is not a file, and a RuntimeError
# when +format+ is not listed in FORMATS.
def initialize(filename, format = :tsv)
  unless File.file?(filename)
    raise ArgumentError, "#{filename} not found"
  end
  @filename = filename

  unless FORMATS.include?(format)
    raise "Unknown format: #{format}"
  end
  @format = format
end

Public Instance Methods

run(dir) click to toggle source

Generate one file for each list in the specified format in the given directory. Creates the directory if it doesn't exist. This will overwrite any existing files.

# File lib/cacofonix/utils/code_list_extractor.rb, line 28
# Writes one file per code list into +dir+, creating the directory (and any
# missing parents) first. The writer method is chosen from @format, e.g.
# write_to_tsv_format for :tsv.
#
# dir - path of the output directory.
def run(dir)
  FileUtils.mkdir_p(dir)

  # Resolve the writer name once; it is the same for every list.
  writer = "write_to_#{@format}_format"
  each_list { |number, data| send(writer, dir, number, data) }
end

Private Instance Methods

data() click to toggle source
# File lib/cacofonix/utils/code_list_extractor.rb, line 39
# Returns the full contents of @filename as a String. The file is read on
# the first call only; later calls return the cached value.
def data
  return @data if @data

  @data = File.read(@filename)
end
document() click to toggle source
# File lib/cacofonix/utils/code_list_extractor.rb, line 43
# Returns the parsed XML document for the code list file, parsing it on the
# first call and caching the result.
#
# Namespaces are stripped once, right after parsing, so the XPath queries
# elsewhere in this class can use bare element names. Doing this inside the
# memoized step avoids re-checking the namespaces on every call.
def document
  @document ||= begin
    doc = Nokogiri::XML(data)
    doc.remove_namespaces! unless doc.namespaces.empty?
    doc
  end
end
each_list() { |list_number, list_data(list_number)| ... } click to toggle source
# File lib/cacofonix/utils/code_list_extractor.rb, line 49
# Yields each code list found in the document as (list_number, rows).
#
# Every simpleType element is inspected; those whose name attribute contains
# "List<digits>" with a number greater than zero are yielded together with
# the rows returned by list_data for that number.
#
# simpleType elements without a name attribute (anonymous types) are
# skipped instead of raising NoMethodError.
def each_list
  document.xpath("//simpleType").each do |node|
    list_name = node["name"]
    next unless list_name

    # A name that does not match yields nil, and nil.to_i is 0.
    list_number = list_name[/List(\d+)/, 1].to_i
    next unless list_number > 0

    yield list_number, list_data(list_number)
  end
end
list_data(num) click to toggle source
# File lib/cacofonix/utils/code_list_extractor.rb, line 59
# Returns the rows of code list +num+ as an array of
# [code, description, long description] triples.
#
# The code is the enumeration's value attribute. The description is the text
# of the first annotation/documentation element and the long description is
# the text of the last one (the same element when there is only one).
def list_data(num)
  enumerations = document.xpath("//simpleType[@name='List#{num}']/restriction/enumeration")
  enumerations.map do |entry|
    code = entry.xpath("./@value").first.value
    docs = entry.xpath("./annotation/documentation")
    [code, docs.first.text, docs.last.text]
  end
end
write_to_ruby_format(dir, number, data) click to toggle source
# File lib/cacofonix/utils/code_list_extractor.rb, line 76
# Writes code list +number+ to <dir>/NNN.rb as a Ruby source file that
# defines Cacofonix::CodeLists::LIST_<number>, a Hash mapping each code
# (row[0]) to its description (row[1]). The third field of each row is not
# written.
#
# dir    - directory the file is written into.
# number - list number; zero-padded to three digits for the file name.
# data   - array of rows, each [code, description, long description].
#
# An existing file at the same path is overwritten. The strings in +data+
# are not modified.
#
# An alternative output format would be a two-dimensional array holding all
# three fields of every row: more complete, but less addressable.
def write_to_ruby_format(dir, number, data)
  path = File.join(dir, "#{number.to_s.rjust(3, '0')}.rb")

  # Renders a value as a double-quoted Ruby string literal. Typographic
  # quotes are replaced by their ASCII equivalents. Backslashes, double
  # quotes and the interpolation openers (#{ #$ #@) are backslash-escaped so
  # the generated file loads back to exactly this text. Works on copies, so
  # the caller's strings are never mutated.
  str2str = lambda { |str|
    plain = str.tr("\u2018\u2019", "'").tr("\u201C\u201D", '"')
    escaped = plain.gsub(/[\\"]|#(?=[{$@])/) { |char| "\\#{char}" }
    "\"#{escaped}\""
  }

  entries = data.map do |row|
    "    #{str2str.call(row[0])} => #{str2str.call(row[1])}"
  end

  out = [
    "# coding: utf-8\n",
    "module Cacofonix; module CodeLists",
    "  LIST_#{number} = {",
    entries.join(",\n"),
    "  }",
    "end; end"
  ]
  File.open(path, "w") { |f| f.write out.join("\n") }
end
write_to_tsv_format(dir, number, data) click to toggle source
# File lib/cacofonix/utils/code_list_extractor.rb, line 69
# Writes code list +number+ to <dir>/NNN.tsv, one row per line with the
# fields separated by tabs. There is no trailing newline. An existing file
# at the same path is overwritten.
#
# dir    - directory the file is written into.
# number - list number; zero-padded to three digits for the file name.
# data   - array of rows, each an array of field values.
def write_to_tsv_format(dir, number, data)
  padded = number.to_s.rjust(3, "0")
  target = File.join(dir, "#{padded}.tsv")
  lines  = data.map { |fields| fields.join("\t") }
  File.write(target, lines.join("\n"))
end