class Importer::HtmlReader
Public Class Methods
new(importer)
click to toggle source
Calls superclass method
Importer::DataReader::new
# File lib/iron/import/html_reader.rb, line 5 def initialize(importer) super(importer, :html) supports_file! supports_stream! @tables = nil end
Public Instance Methods
init_source(mode, source)
click to toggle source
# File lib/iron/import/html_reader.rb, line 12 def init_source(mode, source) if mode == :stream @html = Nokogiri::HTML(source) elsif mode == :file @html = File.open(source) {|f| Nokogiri::HTML(f) } else add_error("Unsupported HTML mode: #{mode}") return false end if @html true else add_error("Failed parsing of HTML") false end rescue Exception => e add_exception(e) false end
load_raw(scopes, &block)
click to toggle source
# File lib/iron/import/html_reader.rb, line 34 def load_raw(scopes, &block) # Default to searching all tables in the document if scopes.nil? || scopes.empty? scopes = ['table'] end # Catch here lets us break out of the nested loop cleanly catch(:found) do # Run each scope, which should be a valid css selector scopes.each do |scope| @html.css(scope).each do |table_node| rows = [] table_node.css('tr').each do |row_node| row = [] row_node.children.each do |cell_node| if ['th', 'td'].include?(cell_node.name) row << cell_node.text.strip # Handle col-span values appropriately span_count = cell_node.attr('colspan') (span_count.to_i - 1).times do row << nil end end end rows << row end found = block.call(rows) throw(:found, true) if found end end end rescue Exception => e # Not sure why we'd get here, but we strive for error-freedom here, yessir. add_exception(e) end