class PageExtractor

Attributes

image_path[RW]
items[RW]
page[RW]
pdf_path[RW]
results[RW]

Public Class Methods

new(page) click to toggle source
# File lib/pdf_extract.rb, line 9
# Builds an extractor from a page description.
#
# @param page [Hash] hash-like object read with the keys +:image_path+,
#   +:pdf_path+, +:items+ and (optionally) +:page+
#
# The page number falls back to 1 when +page[:page]+ is nil or false.
# The fallback is computed locally; the caller's hash is left untouched.
def initialize(page)
  @image_path = page[:image_path]
  @pdf_path = page[:pdf_path]
  @items = page[:items]
  # `||` rather than `||=`: do not write the default back into the argument.
  @page_num = page[:page] || 1
  @results = {}
end

Public Instance Methods

crop_image(d) click to toggle source
# File lib/pdf_extract.rb, line 32
# Returns the file name the cropped image is expected to have.
#
# The actual crop (via ImageVoodoo) is commented out below, so this method
# currently only reads the four coordinates from +d+ and hands back the
# fixed name "CR.png" without writing any file itself.
#
# @param d [Hash] read with the keys +:x1+, +:x2+, +:y1+, +:y2+
# @return [String] always "CR.png"
def crop_image(d)
  cropped_name = "CR.png"
  # ImageVoodoo.with_image(image_path) do |img|
  x1, x2 = d[:x1], d[:x2]
  y1, y2 = d[:y1], d[:y2]
  #   img.with_crop(x1, y1, x2, y2) { |img2| img2.save cropped_name }
  # end
  cropped_name
end
extract_ocr(item) click to toggle source
# File lib/pdf_extract.rb, line 27
# Crops the region described by +item[:dimensions]+, runs OCR on the
# resulting image and stores the text in +@results+ under +item[:name]+.
#
# @param item [Hash] read with the keys +:dimensions+ and +:name+
# @return the value stored in +@results+ (whatever +ocr_text+ returned)
def extract_ocr(item)
  cropped = crop_image(item[:dimensions])
  @results[item[:name]] = ocr_text(cropped)
end
extract_table(item) click to toggle source
# File lib/pdf_extract.rb, line 44
# Runs tabula over the region described by +item[:dimensions]+, converts
# its output to an array of rows and stores that in +@results+ under
# +item[:name]+.
#
# @param item [Hash] read with the keys +:dimensions+ and +:name+
# @return the value stored in +@results+ (whatever +lines_to_array+ returned)
def extract_table(item)
  rows = lines_to_array(run_tabula(item[:dimensions]))
  @results[item[:name]] = rows
end
lines_to_array(table) click to toggle source
# File lib/pdf_extract.rb, line 55
# Splits comma-separated text into an array of rows, one per line.
#
# Each line has its trailing line terminator removed and is then split on
# ",". The split uses a negative limit so trailing empty cells are kept;
# without it a row such as "a,b," would come back with only two cells.
#
# NOTE: this is a plain split, not a CSV parser; quoted fields containing
# commas are not treated specially.
#
# @param table [String] the text to split
# @return [Array<Array<String>>] one array of cells per input line
def lines_to_array(table)
  table.lines.map { |line| line.chomp.split(",", -1) }
end
ocr_text(image_path,blacklist='|',language=:eng) click to toggle source
# File lib/pdf_extract.rb, line 61
# Runs Tesseract on an image file and returns the recognised text with
# leading and trailing whitespace stripped.
#
# @param image_path [String] path handed to +Tesseract::Engine#text_for+
# @param blacklist value assigned to the engine's +blacklist+ setting
#   (defaults to '|')
# @param language value assigned to the engine's +language+ setting
#   (defaults to :eng)
# @return [String] the stripped result of +text_for+
def ocr_text(image_path,blacklist='|',language=:eng)
  engine = Tesseract::Engine.new do |config|
    config.language  = language
    config.blacklist = blacklist
  end
  engine.text_for(image_path).strip
end
process() click to toggle source
# File lib/pdf_extract.rb, line 17
# Walks every entry in +items+ and dispatches on its +:kind+:
# 'ocr' entries go to +extract_ocr+, 'table' entries go to
# +extract_table+, and any other kind is skipped.
#
# @return the +items+ collection (the result of +each+)
def process
  items.each do |entry|
    kind = entry[:kind]
    if kind == 'ocr'
      extract_ocr(entry)
    elsif kind == 'table'
      extract_table(entry)
    end
  end
end
run_tabula(d) click to toggle source
# File lib/pdf_extract.rb, line 49
# Runs the +tabula+ command-line tool on +pdf_path+ for one rectangular
# area and returns whatever it writes to standard output.
#
# The area is passed as "y1, x1, y2, x2", built from the matching keys of
# +d+. The page number comes from +@page_num+, which +initialize+ sets.
#
# The command is started with an argument array rather than a shell
# string, so a +pdf_path+ containing spaces or shell metacharacters is
# passed to tabula unchanged instead of being re-parsed by the shell.
#
# @param d [Hash] read with the keys +:x1+, +:y1+, +:x2+, +:y2+
# @return [String] tabula's standard output
def run_tabula(d)
  area = [d[:y1], d[:x1], d[:y2], d[:x2]].join(", ")
  argv = ["tabula", "--area=#{area}", pdf_path.to_s, "--page=#{@page_num}"]
  IO.popen(argv, &:read)
end