class PageExtractor
Attributes
image_path[RW]
items[RW]
page[RW]
pdf_path[RW]
results[RW]
Public Class Methods
new(page)
click to toggle source
# File lib/pdf_extract.rb, line 9
# Builds an extractor from a page description hash.
#
# @param page [Hash] reads the :image_path, :pdf_path, :items and :page keys;
#   :page falls back to 1 when it is missing or nil
def initialize(page)
  @image_path = page[:image_path]
  @pdf_path = page[:pdf_path]
  @items = page[:items]
  # Apply the default locally instead of `page[:page] ||= 1`, which wrote the
  # default back into the caller's hash and raised FrozenError on frozen input.
  @page_num = page[:page] || 1
  # NOTE(review): the attribute list exposes `page`, but only @page_num was
  # ever assigned, so the reader always returned nil. Both are populated here;
  # confirm `page` is meant to hold the page number.
  @page = @page_num
  @results = {}
end
Public Instance Methods
crop_image(d)
click to toggle source
# File lib/pdf_extract.rb, line 32
# Placeholder for the cropping step: the ImageVoodoo call is commented out, so
# no image is written and only the fixed output file name is returned.
#
# @param d [Hash] crop rectangle; the :x1, :x2, :y1 and :y2 keys are read
# @return [String] always "CR.png"
def crop_image(d)
  # The coordinates are still looked up (in the original x1, x2, y1, y2 order)
  # so that a nil or non-indexable `d` keeps failing at this point.
  _x1, _x2, _y1, _y2 = %i[x1 x2 y1 y2].map { |edge| d[edge] }

  # ImageVoodoo.with_image(image_path) do |img|
  #   img.with_crop(x1,y1,x2,y2) { |img2| img2.save new_image_name }
  # end
  "CR.png"
end
extract_ocr(item)
click to toggle source
# File lib/pdf_extract.rb, line 27
# Crops the item's region, runs OCR on the crop and files the text under the
# item's name in @results.
#
# @param item [Hash] reads the :dimensions and :name keys
# @return the value stored in @results (whatever ocr_text returned)
def extract_ocr(item)
  region = item[:dimensions]
  field = item[:name]
  @results[field] = ocr_text(crop_image(region))
end
extract_table(item)
click to toggle source
# File lib/pdf_extract.rb, line 44
# Runs tabula over the item's region and files the parsed rows under the
# item's name in @results.
#
# @param item [Hash] reads the :dimensions and :name keys
# @return the value stored in @results (whatever lines_to_array returned)
def extract_table(item)
  raw_output = run_tabula(item[:dimensions])
  field = item[:name]
  @results[field] = lines_to_array(raw_output)
end
lines_to_array(table)
click to toggle source
# File lib/pdf_extract.rb, line 55
# Turns comma-separated text into an array of rows, each an array of cells.
# This is a plain split on "," with no quoting rules, and String#split drops
# trailing empty cells.
#
# @param table [String] text with one row per line
# @return [Array<Array<String>>]
def lines_to_array(table)
  table.lines.map do |row|
    row.chomp.split(",")
  end
end
ocr_text(image_path,blacklist='|',language=:eng)
click to toggle source
# File lib/pdf_extract.rb, line 61
# Runs a Tesseract engine over an image file and returns the recognised text
# with surrounding whitespace stripped.
#
# @param image_path path handed to the engine's text_for
# @param blacklist value assigned to the engine's blacklist setting
# @param language value assigned to the engine's language setting
# @return [String] stripped OCR text
def ocr_text(image_path,blacklist='|',language=:eng)
  engine = Tesseract::Engine.new do |config|
    config.language = language
    config.blacklist = blacklist
  end
  engine.text_for(image_path).strip
end
process()
click to toggle source
# File lib/pdf_extract.rb, line 17
# Walks every configured item and dispatches on its :kind: 'ocr' items go to
# extract_ocr and 'table' items go to extract_table. Any other kind is skipped.
#
# @return the items collection (the result of #each)
def process
  items.each do |entry|
    case entry[:kind]
    when 'ocr'
      extract_ocr(entry)
    when 'table'
      extract_table(entry)
    end
  end
end
run_tabula(d)
click to toggle source
# File lib/pdf_extract.rb, line 49
# Runs the `tabula` command-line tool on pdf_path for one rectangular area of
# the current page and returns whatever it prints on standard output.
#
# @param d [Hash] rectangle; the :y1, :x1, :y2 and :x2 keys are read and
#   passed to tabula in that order
# @return [String] tabula's standard output
# @raise [Errno::ENOENT] when the `tabula` executable cannot be found
def run_tabula(d)
  area = [d[:y1], d[:x1], d[:y2], d[:x2]].join(", ")
  # The page number lives in @page_num (set by the constructor). The original
  # called `page_num`, which is not among the class's accessors.
  # Passing an argv array to IO.popen skips the shell, so a pdf_path that
  # contains spaces or shell metacharacters reaches tabula as one argument.
  command = ['tabula', "--area=#{area}", pdf_path.to_s, "--page=#{@page_num}"]
  IO.popen(command, &:read)
end