module RTesseract::Box
Public Class Methods
Source
# File lib/rtesseract/box.rb, line 19
# Converts a whole hOCR document into a list of word entries.
#
# The content is split into lines, each line is handed to +parse_line+, and
# lines for which +parse_line+ returns nil are dropped from the result.
#
# @param content [String] text to split into lines
# @return [Array] the non-nil results of +parse_line+, in line order
def parse(content)
  parsed_lines = content.lines.map { |hocr_line| parse_line(hocr_line) }
  parsed_lines.compact
end
Source
# File lib/rtesseract/box.rb, line 48
# Extracts the confidence tokens from a single hOCR line.
#
# The result is an array of string tokens whose LAST element is the
# confidence value (that is how +word_info+ consumes it).
#
# When the line carries an explicit +x_wconf+ property, that value is used
# directly, so properties that follow it inside the same title attribute
# cannot be mistaken for the confidence. Otherwise the method falls back to
# splitting whatever sits between the first ';' and the next single quote.
#
# @param line [#to_s] one line of hOCR markup (nil is treated as "")
# @return [Array<String>] tokens ending with the confidence, or [] when
#   nothing could be extracted
def parse_confidence(line)
  text = line.to_s

  # Prefer the explicitly labelled value; accepts negative and decimal numbers.
  explicit = text[/\bx_wconf\s+(-?\d+(?:\.\d+)?)/, 1]
  return ['x_wconf', explicit] if explicit

  # Fallback: everything after the first ';' up to the next single quote.
  text.match(/(?<=;)(.*?)(?=')/).to_s.split
end
Source
# File lib/rtesseract/box.rb, line 23
# Builds a word entry from one hOCR line, or returns nil when the line does
# not describe a word.
#
# A line is considered a word line when it contains +ocrx_word+ or
# +ocr_word+. The word text is whatever sits between the first '>' and the
# last '<' on the line, with any nested tags (for example <strong> or <em>
# wrappers) removed so that only the text itself is reported. Character
# entities are left untouched.
#
# @param line [#to_s] one line of hOCR markup (nil is treated as "")
# @return [Hash, nil] the result of +word_info+, or nil for non-word lines
#   and for words that are empty or whitespace only
def parse_line(line)
  # Coerce once up front so the class check and the extraction see the same text.
  text = line.to_s
  return unless text.match?(/oc(rx|r)_word/)

  raw_word = text.scan(/>(.*)</).flatten.first.to_s
  # The capture above is greedy, so inline markup inside the span ends up in
  # it; drop the tags and keep only the text.
  word = raw_word.gsub(/<[^>]*>/, '')
  return if word.strip.empty?

  word_info(word, parse_position(text), parse_confidence(text))
end
Source
# File lib/rtesseract/box.rb, line 44
# Extracts the position tokens from a single hOCR line.
#
# Takes the text between the first occurrence of "title" and the next ';'
# and splits it on whitespace. The first token is the leading attribute
# text; +word_info+ reads the coordinates from indexes 1 through 4.
#
# @param line [String] one line of hOCR markup
# @return [Array<String>] whitespace-separated tokens, or [] when the
#   pattern does not match
def parse_position(line)
  title_segment = line.match(/(?<=title)(.*?)(?=;)/)
  title_segment.to_s.split
end
Source
# File lib/rtesseract/box.rb, line 8
# Runs an RTesseract::Command with the +tessedit_create_hocr+ option set to 1
# and parses the "<output_path>.hocr" file it yields.
#
# Inside the command's block the .hocr file is read, handed to
# +remove_tmp_file+, and its content is passed to +parse+. The value
# returned to the caller is whatever RTesseract::Command#run returns
# (presumably the block's result, i.e. the parsed words; confirm against
# Command#run).
#
# @param source  passed through to RTesseract::Command.new
# @param errors  passed through to RTesseract::Command.new
# @param options object responding to +merge+; it is not mutated, a merged
#   copy is passed to the command
def run(source, errors, options)
  hocr_options = options.merge({ tessedit_create_hocr: 1 })
  command = RTesseract::Command.new(source, temp_file_path, errors, hocr_options)

  command.run do |output_path|
    hocr_path = "#{output_path}.hocr"
    # Read the file before removing it; only the in-memory copy is parsed.
    hocr = File.read(hocr_path)
    remove_tmp_file(hocr_path)
    parse(hocr)
  end
end
Source
# File lib/rtesseract/box.rb, line 33
# Assembles the hash describing one recognised word.
#
# @param word [String] the word text, stored as given
# @param positions [Array] tokens whose indexes 1..4 hold x_start, y_start,
#   x_end and y_end; each is converted with +to_i+
# @param confidence [Array] tokens whose last element is the confidence;
#   converted with +to_i+
# @return [Hash] keys :word, :confidence, :x_start, :y_start, :x_end, :y_end
def word_info(word, positions, confidence)
  info = { word: word, confidence: confidence[-1].to_i }

  # Index 0 of positions is skipped; the coordinates start at index 1.
  %i[x_start y_start x_end y_end].each.with_index(1) do |key, index|
    info[key] = positions[index].to_i
  end

  info
end