class PDFextract
Attributes
base_dir[RW]
file_path[RW]
image_dir[RW]
options[RW]
output_dir[RW]
pages[RW]
results[RW]
text_dir[RW]
Public Class Methods
example_schema()
click to toggle source
# File lib/pdf_extract.rb, line 203
# Returns a sample schema hash illustrating the input accepted by the
# constructor: a source file path, extraction options, and one page entry
# holding an OCR item and a table item, each with a bounding box.
#
# A new hash is built on every call.
#
# @return [Hash] the example schema
def self.example_schema
  extraction_options = {
    remove_protection: false,
    password: nil,
    extract_all_text: true,
    extract_text: []
  }

  title_item = {
    name: 'title',
    kind: 'ocr', # the alternative kind is 'table'
    dimensions: { x1: 10, x2: 282, y1: 50, y2: 100 }
  }

  units_table_item = {
    name: 'units_table',
    kind: 'table',
    dimensions: { x1: 0, x2: 265.73, y1: 184.94, y2: 233.84 }
  }

  first_page = {
    match: "page_num",
    page: 1,
    items: [title_item, units_table_item]
  }

  {
    file_path: "test_files/dream-may.pdf",
    options: extraction_options,
    pages: [first_page]
  }
end
extract_ocr(image_path,coords)
click to toggle source
# File lib/pdf_extract.rb, line 188
# Runs Tesseract OCR over a rectangular region of an image and returns the
# recognised text.
#
# The corners may be keyed by strings ("x1") or symbols (:x1). The original
# implementation only read string keys, so a symbol-keyed hash such as the
# `dimensions` entries built by example_schema failed with `nil - nil`.
#
# @param image_path [String] image handed to the Tesseract engine
# @param coords [Hash] region corners under the keys x1, y1, x2 and y2
# @return [String] the engine's text with surrounding whitespace stripped
# @raise [KeyError] when a corner is present under neither key form
def self.extract_ocr(image_path, coords)
  # String keys are looked up first so existing string-keyed callers see no change.
  corner = lambda do |key|
    coords.fetch(key.to_s) { coords.fetch(key.to_sym) }
  end

  x = corner.call(:x1)
  y = corner.call(:y1)
  # The engine selection is given as origin plus size, not as two corners.
  width = corner.call(:x2) - x
  height = corner.call(:y2) - y

  # Diagnostic output kept from the original implementation.
  puts image_path
  puts [x, y, width, height]

  engine = Tesseract::Engine.new(language: :eng)
  engine.image = image_path
  engine.select x, y, width, height
  engine.text.strip
end
new(schema)
click to toggle source
# File lib/pdf_extract.rb, line 84
# Prepares a working directory named after the current Unix timestamp, copies
# or downloads the source PDF into it, and stores the schema's options and
# page definitions.
#
# Fixes relative to the original:
# * the sub-directory paths are assigned BEFORE setup_folders runs, because
#   setup_folders reads them through the accessors and previously saw nil;
# * `pages` is taken from schema[:pages] on its own; it used to be guarded by
#   schema[:options], so pages were dropped whenever options were omitted;
# * missing options/pages default to an empty Hash/Array instead of nil, so
#   later `options[...]` and `pages.each` calls do not fail on nil.
#
# @param schema [Hash] extraction schema; its top-level keys are symbolized in
#   place. Reads :file_url (preferred) or :file_path, plus :options and :pages.
def initialize(schema)
  # NOTE(review): symbolize_keys! is not a core Hash method; presumably it is
  # supplied by ActiveSupport - confirm it is loaded. It mutates the caller's hash.
  schema.symbolize_keys!

  @base_dir = Time.now.to_i.to_s
  @text_dir = @base_dir + '/text_files'
  @image_dir = @base_dir + '/image_files'
  @output_dir = @base_dir + '/output'
  setup_folders(@base_dir)

  if schema[:file_url]
    @file_path = get_file_from_url(schema[:file_url])
  else
    @file_path = get_file_from_path(schema[:file_path])
    puts @file_path # diagnostic output kept from the original implementation
  end

  @options = schema[:options] || {}
  @pages = schema[:pages] || []
  @results = {}
end
Public Instance Methods
cleanup()
click to toggle source
# File lib/pdf_extract.rb, line 130
# Deletes the working directory (`base_dir`) and everything below it.
#
# Uses FileUtils instead of interpolating the path into an `rm -r` shell
# command, so directory names containing spaces or shell metacharacters are
# removed literally rather than being interpreted by the shell. A missing
# directory is not an error, and a nil or empty base_dir is a no-op.
#
# @return [Array<String>, nil] FileUtils.rm_rf's return value, or nil when
#   there was nothing to remove (the original returned the shell's stdout)
def cleanup
  # Required locally because the file's own require list is not visible from this block.
  require 'fileutils'

  target = base_dir
  return if target.nil? || target.to_s.empty?

  FileUtils.rm_rf(target)
end
convert_to_image(pages = "all")
click to toggle source
# File lib/pdf_extract.rb, line 165
# Renders the PDF to PNG files via pdf_to_image_files and returns the paths of
# every PNG found in `image_dir` afterwards.
#
# The original collected the paths into a local array that was never returned;
# it only produced the right value because `each` returns its receiver. The
# glob result is now returned directly.
#
# @param pages [String, Object] page selection forwarded to pdf_to_image_files
# @return [Array<String>] paths matching "<image_dir>/*.png"
def convert_to_image(pages = "all")
  pdf_to_image_files(pages)
  Dir.glob(File.join(image_dir, '*.png'))
end
convert_to_text(pages = "all")
click to toggle source
# File lib/pdf_extract.rb, line 154
# Extracts the PDF's text via pdf_to_text_files and loads every resulting
# "<text_dir>/*.txt" file into a hash keyed by page number.
#
# The page number is whatever follows the last underscore of the file's base
# name, up to the first dot, so "temp-file_3.txt" is stored under "3". It is
# parsed from the base name rather than the whole path, so underscores or dots
# in directory names cannot leak into the key.
#
# Files are read with File.read, which closes the handle; the original
# `File.open(file).read` left one open handle per page.
#
# @param pages [String, Object] page selection forwarded to pdf_to_text_files
# @return [Hash{String => String}] page number => page text
def convert_to_text(pages = "all")
  pdf_to_text_files(pages)

  text = Dir.glob(File.join(text_dir, '*.txt')).each_with_object({}) do |file, by_page|
    page_num = File.basename(file).split('_').last.split('.').first
    by_page[page_num] = File.read(file)
  end

  puts text # diagnostic output kept from the original implementation
  text
end
extract_with_ocr(page_path,dimensions)
click to toggle source
# File lib/pdf_extract.rb, line 180
# Runs Tesseract OCR on a page image, stores the recognised text under
# dimensions[:result], and returns it.
#
# The original ignored the coordinates in `dimensions` and always selected the
# fixed region 1,34,59,281. The region is now derived from the x1/y1/x2/y2
# corners (symbol or string keys) in the same way as the class-level
# extract_ocr. When any corner is missing, the previous fixed region is used,
# so callers that pass no coordinates behave as before.
#
# NOTE(review): assumes `dimensions` carries the corners directly, as the
# `dimensions` entries of example_schema do - confirm against the callers.
#
# @param page_path [String] image handed to the Tesseract engine
# @param dimensions [Hash] region corners; receives the text under :result
# @return [String] the engine's text with surrounding whitespace stripped
def extract_with_ocr(page_path, dimensions)
  legacy_region = [1, 34, 59, 281]

  corners = %i[x1 y1 x2 y2].map do |key|
    value = dimensions[key]
    value.nil? ? dimensions[key.to_s] : value
  end

  region =
    if corners.none?(&:nil?)
      left, top, right, bottom = corners
      # The engine selection is given as origin plus size, not as two corners.
      [left, top, right - left, bottom - top]
    else
      legacy_region
    end

  engine = Tesseract::Engine.new(language: :eng)
  engine.image = page_path
  engine.select(*region)

  text = engine.text.strip
  dimensions[:result] = text
  text
end
get_file_from_path(path)
click to toggle source
# File lib/pdf_extract.rb, line 117
# Copies a local PDF into the working directory as "temp-file.pdf" and returns
# the path of the copy.
#
# The copy is done with FileUtils.cp instead of interpolating both paths into
# a `cp` shell command, so paths containing spaces or shell metacharacters are
# handled literally and can no longer be interpreted by the shell. A failed
# copy now raises instead of being silently ignored.
#
# @param path [String] location of the source PDF
# @return [String] "<@base_dir>/temp-file.pdf"
# @raise [SystemCallError] e.g. Errno::ENOENT when the source does not exist
def get_file_from_path(path)
  # Required locally because the file's own require list is not visible from this block.
  require 'fileutils'

  new_path = @base_dir + "/temp-file.pdf"
  FileUtils.cp(path, new_path)
  new_path
end
get_file_from_url(file_url)
click to toggle source
# File lib/pdf_extract.rb, line 110
# Downloads a remote PDF into the working directory as "temp-file.pdf" and
# returns the local path.
#
# Changes relative to the original:
# * The URL is parsed and opened through open-uri's URI#open rather than
#   Kernel#open. Kernel#open on a caller-supplied string can run a command
#   (strings starting with "|") or read a local file, and from Ruby 3.0 on it
#   no longer opens URLs at all.
# * Anything open-uri cannot open (no scheme, local paths, unsupported
#   schemes) is rejected with ArgumentError.
# * The payload is written in binary mode with File.binwrite, which also
#   closes the file even when the write fails.
#
# @param file_url [String] URL of the PDF
# @return [String] "<@base_dir>/temp-file.pdf"
# @raise [ArgumentError] when file_url is malformed or not an openable URL
def get_file_from_url(file_url)
  # Required locally because the file's own require list is not visible from this block.
  require 'open-uri'

  begin
    uri = URI.parse(file_url.to_s)
  rescue URI::InvalidURIError => e
    raise ArgumentError, "invalid file_url: #{e.message}"
  end

  # Only URI classes that open-uri extends expose a public #open.
  unless uri.respond_to?(:open)
    raise ArgumentError, "unsupported file_url: #{file_url.inspect}"
  end

  local_path = @base_dir + "/temp-file.pdf"
  file_data = uri.open { |remote| remote.read }
  File.binwrite(local_path, file_data)
  local_path
end
pdf_to_image_files(pages)
click to toggle source
# File lib/pdf_extract.rb, line 173
# Renders the PDF at `file_path` to PNG files inside `image_dir` using Docsplit.
#
# The original accepted `pages` but never used it. A specific page selection
# is now forwarded as Docsplit's :pages option, matching pdf_to_text_files.
# "all" and nil send no :pages option, which is exactly the call the original
# always made.
#
# NOTE(review): relies on Docsplit.extract_images honouring :pages - confirm
# against the Docsplit version in use.
#
# @param pages [String, Array, Range, nil] "all"/nil for every page, otherwise
#   the selection to hand to Docsplit
# @return [Object] whatever Docsplit.extract_images returns
def pdf_to_image_files(pages)
  image_options = { :output => image_dir, :format => [:png] }
  image_options[:pages] = pages unless pages.nil? || pages == "all"

  Docsplit.extract_images(file_path, **image_options)
end
pdf_to_text_files(pages)
click to toggle source
# File lib/pdf_extract.rb, line 177
# Asks Docsplit to extract the text of the PDF at `file_path` into `text_dir`
# for the requested pages.
#
# @param pages [Object] page selection passed through as Docsplit's :pages option
# @return [Object] whatever Docsplit.extract_text returns
def pdf_to_text_files(pages)
  source_pdf = file_path
  destination = text_dir

  Docsplit.extract_text(source_pdf, output: destination, pages: pages)
end
process()
click to toggle source
# File lib/pdf_extract.rb, line 123
# Runs the whole extraction: optional protection removal, page images, optional
# full-text extraction, per-page extraction, then removal of the working
# directory.
#
# `cleanup` now runs in an `ensure` clause. In the original it was the last
# statement, so any exception raised by an earlier step skipped it and left the
# working directory behind. The exception itself still propagates.
#
# The method returns `results`; the original returned the shell output of the
# `rm` command issued by cleanup.
#
# @return [Hash] the accumulated results (:images, optionally :text, plus
#   whatever process_pages stores)
def process
  # Both flags must be exactly `true`, as in the original.
  remove_protection if options[:remove_protection] == true

  results[:images] = pdf_to_image_files("all")
  results[:text] = convert_to_text if options[:extract_all_text] == true

  process_pages
  results
ensure
  cleanup
end
process_pages()
click to toggle source
# File lib/pdf_extract.rb, line 138
# Runs a PageExtractor over every entry in `pages` and stores each extractor's
# results in `results`.
#
# Entries whose :match is "page_num" are first annotated in place with the
# page's image path ("<image_dir>/temp-file_<page>.png") and the PDF path, and
# their results are stored under the page number.
#
# NOTE(review): for any other :match value the key stays nil, so all such
# entries are written to results[nil] and overwrite one another - confirm
# which key is intended before relying on non-"page_num" matches.
#
# @return [Object] the `pages` collection (the value of `each`)
def process_pages
  pages.each do |page_spec|
    result_key = nil

    if page_spec[:match] == "page_num"
      result_key = page_spec[:page]
      page_spec[:image_path] = image_dir + "/temp-file_#{result_key}.png"
      page_spec[:pdf_path] = file_path
    end

    extractor = PageExtractor.new(page_spec)
    extractor.process
    results[result_key] = extractor.results
  end
end
remove_protection()
click to toggle source
# File lib/pdf_extract.rb, line 133
# Placeholder for stripping protection from the PDF; nothing is implemented,
# so calling it has no effect.
#
# @return [nil]
def remove_protection
  # TODO: not implemented yet.
  nil
end
setup_folders(folder_name)
click to toggle source
# File lib/pdf_extract.rb, line 103
# Recreates the working directory from scratch together with its text, image
# and output sub-directories.
#
# Changes relative to the original:
# * Dir.exists? (removed in Ruby 3.2) and the interpolated `rm`/`mkdir` shell
#   commands are replaced with FileUtils calls.
# * When the text_dir/image_dir/output_dir accessors are still nil, the
#   sub-directories default to "<folder_name>/text_files", ".../image_files"
#   and ".../output" - the same paths the constructor assigns - instead of nil
#   being interpolated into `mkdir`.
# * The image directory is created as well; the original never created it.
#
# @param folder_name [String] working directory to (re)create
# @return [Array<String>] the directories passed to FileUtils.mkdir_p
def setup_folders(folder_name)
  # Required locally because the file's own require list is not visible from this block.
  require 'fileutils'

  # Start from an empty directory; a missing one is not an error.
  FileUtils.rm_rf(folder_name)

  subfolders = [
    text_dir || File.join(folder_name, 'text_files'),
    image_dir || File.join(folder_name, 'image_files'),
    output_dir || File.join(folder_name, 'output')
  ]

  FileUtils.mkdir_p([folder_name, *subfolders])
end