class DataCollector::Input
Attributes
raw[R]
Public Class Methods
new()
click to toggle source
# File lib/data_collector/input.rb, line 21 def initialize @logger = Logger.new(STDOUT) end
Public Instance Methods
from_uri(source, options = {}) { |data| ... }
click to toggle source
# File lib/data_collector/input.rb, line 25
# Loads the resource that +source+ points to and returns its content.
#
# The URI scheme selects the loader: 'http' -> #from_http,
# 'https' -> #from_https, 'file' -> #from_file.
#
# @param source [String] URI of the resource; HTML entities in it
#   (e.g. "&amp;") are unescaped before it is parsed
# @param options [Hash] passed through unchanged to the selected loader
# @yield [data] optional; receives the loaded data
# @return [Object, nil] the block's result when a block is given, otherwise
#   the loaded data; the String 'no data found' replaces a nil result of the
#   loader; nil when loading (or the block) raised a StandardError
# @raise whatever CGI.unescapeHTML or URI() raise for an unusable +source+;
#   those two calls run before the rescue scope starts
def from_uri(source, options = {})
  source = CGI.unescapeHTML(source)
  @logger.info("Loading #{source}")
  uri = URI(source)

  begin
    data = case uri.scheme
           when 'http' then from_http(uri, options)
           when 'https' then from_https(uri, options)
           when 'file' then from_file(uri, options)
           else raise "Do not know how to process #{source}"
           end
    data = 'no data found' if data.nil?

    # The yield sits inside the rescue scope on purpose: an error raised by
    # the caller's block is logged and turned into nil like a loading error.
    block_given? ? yield(data) : data
  rescue => e
    # A failure is reported as an error (not as info) and the backtrace goes
    # through the logger as well, so log level and destination apply to it.
    @logger.error(e.message)
    @logger.debug(Array(e.backtrace).join("\n"))
    nil
  end
end
Private Instance Methods
csv_to_hash(data)
click to toggle source
# File lib/data_collector/input.rb, line 152
# Turns CSV text into one Hash per data row.
#
# The first line is read as the header row; every header is downcased and
# converted to a Symbol, which becomes the key in each row's Hash.
#
# @param data [String] CSV text including its header line
# @return [Array<Hash>] the rows in file order
def csv_to_hash(data)
  table = CSV.parse(data, headers: true, header_converters: %i[downcase symbol])
  table.map(&:to_hash)
end
escape_uri(uri)
click to toggle source
# File lib/data_collector/input.rb, line 160 def escape_uri(uri) #"#{uri.to_s.gsub(uri.query, '')}#{CGI.escape(CGI.unescape(uri.query))}" uri.to_s end
file_type_from(headers)
click to toggle source
# File lib/data_collector/input.rb, line 165
# Determines the media type of a response from its headers.
#
# Order of precedence:
# 1. the 'Content-Type' header, without parameters such as "; charset=..."
#    and without surrounding whitespace;
# 2. the type MIME::Types derives from the name returned by #filename_from;
# 3. 'application/octet-stream' when neither of the above yields a type.
#
# @param headers [#include?, #[]] response headers, looked up with the key
#   'Content-Type'
# @return [String] the media type, e.g. 'application/json'
def file_type_from(headers)
  if headers.include?('Content-Type')
    # Array() tolerates a header that is stored as a list of values.
    declared = Array(headers['Content-Type']).first.to_s.split(';').first.to_s.strip
    return declared unless declared.empty?
  end

  # MIME::Types.of answers with an empty list for an unknown name, so the
  # first element has to be checked before asking for its content type.
  guessed = MIME::Types.of(filename_from(headers)).first
  guessed ? guessed.content_type : 'application/octet-stream'
end
from_file(uri, options = {})
click to toggle source
# File lib/data_collector/input.rb, line 115
# Reads a local file and converts it according to its extension.
#
# The path is "#{uri.host}#{uri.path}" made absolute, so both
# file:///abs/path and file://relative/path are accepted. The unparsed
# file content is always stored in @raw.
#
# Conversions: .json/.jsonld -> JSON.parse, .xml -> #xml_to_hash,
# .csv -> #csv_to_hash, .gz -> treated as a gzipped tar archive whose entry
# content is passed to #xml_to_hash.
#
# @param uri [URI] a file URI
# @param options [Hash] set :raw (or 'raw') to true to get the file content
#   back as an unparsed String, whatever the extension
# @return [Object] the parsed data, or the raw String in raw mode
# @raise [RuntimeError] for an unsupported extension (not in raw mode)
def from_file(uri, options = {})
  absolute_path = File.absolute_path("#{uri.host}#{uri.path}")

  # Read before looking at the raw flag: raw mode has to return the content
  # too, it only skips the parsing step.
  @raw = data = File.read(absolute_path)
  return data if options[:raw] == true || options['raw'] == true

  case File.extname(absolute_path)
  when '.jsonld', '.json'
    JSON.parse(data)
  when '.xml'
    xml_to_hash(data)
  when '.gz'
    # Every entry overwrites +data+, so the content of the last entry the
    # archive yields is the one that gets parsed.
    Minitar.open(Zlib::GzipReader.new(File.open(absolute_path, 'rb'))) do |archive|
      archive.each { |entry| data = entry.read }
    end
    xml_to_hash(data)
  when '.csv'
    csv_to_hash(data)
  else
    raise "Do not know how to process #{uri.to_s}"
  end
end
from_http(uri, options = {})
click to toggle source
# File lib/data_collector/input.rb, line 58 def from_http(uri, options = {}) from_https(uri, options) end
from_https(uri, options = {})
click to toggle source
# File lib/data_collector/input.rb, line 62
# Fetches +uri+ with an HTTP GET and converts the response body.
#
# The body of a 200 response is stored unparsed in @raw and then converted
# according to its media type: JSON and JSON-LD -> JSON.parse,
# 'text/csv' -> #csv_to_hash, every other type -> #xml_to_hash.
#
# @param uri [URI] the resource to fetch
# @param options [Hash] keys may be Symbols or Strings:
#   user / password - sent as basic-auth credentials when both are present;
#   content_type    - media type to use instead of the one in the response;
#   raw             - true returns the body as an unparsed String
# @return [Object] the parsed body, or the raw String in raw mode
# @raise [RuntimeError] 'Unauthorized' (401), 'Not found' (404) or a message
#   naming the status code for any other non-200 response
def from_https(uri, options = {})
  # One indifferent view of the options, so every key - credentials
  # included - is found whether it was given as Symbol or String.
  opts = options.with_indifferent_access

  http = HTTP
  if opts.key?(:user) && opts.key?(:password)
    http = HTTP.basic_auth(user: opts[:user], pass: opts[:password])
  else
    @logger.warn('User or Password parameter not found')
  end

  http_response = http.get(escape_uri(uri))

  case http_response.code
  when 200
    @raw = data = http_response.body.to_s
    # Raw mode needs no media type, so it returns before the lookup.
    return data if opts[:raw] == true

    file_type = opts.key?(:content_type) ? opts[:content_type] : file_type_from(http_response.headers)
    case file_type
    when 'application/ld+json', 'application/json'
      JSON.parse(data)
    when 'text/csv'
      csv_to_hash(data)
    else
      # XML is the fallback: the explicit XML types ('application/xml',
      # 'text/xml', 'application/atom+xml') and anything unknown end up here.
      xml_to_hash(data)
    end
  when 401
    raise 'Unauthorized'
  when 404
    raise 'Not found'
  else
    raise "Unable to process received status code = #{http_response.code}"
  end
end
xml_to_hash(data)
click to toggle source
# File lib/data_collector/input.rb, line 144 def xml_to_hash(data) #gsub('<\/', '< /') outherwise wrong XML-parsing (see records lirias1729192 ) data = data.gsub /</, '< /' nori = Nori.new(parser: :nokogiri, strip_namespaces: true, convert_tags_to: lambda { |tag| tag.gsub(/^@/, '_') }) nori.parse(data) #JSON.parse(nori.parse(data).to_json) end