class PDFextract

Attributes

base_dir[RW]
file_path[RW]
image_dir[RW]
options[RW]
output_dir[RW]
pages[RW]
results[RW]
text_dir[RW]

Public Class Methods

example_schema() click to toggle source
# File lib/pdf_extract.rb, line 203
# Returns a sample schema Hash showing the structure this class works with:
# a source file path, processing options, and per-page extraction items.
def self.example_schema
  # Item read with OCR ('table' is the other item kind).
  title_item = {
    name: 'title',
    kind: 'ocr',
    dimensions: { x1: 10, x2: 282, y1: 50, y2: 100 }
  }

  # Item read as a table.
  units_table_item = {
    name: 'units_table',
    kind: 'table',
    dimensions: { x1: 0, x2: 265.73, y1: 184.94, y2: 233.84 }
  }

  processing_options = {
    remove_protection: false,
    password: nil,
    extract_all_text: true,
    extract_text: []
  }

  first_page = {
    match: "page_num",
    page: 1,
    items: [title_item, units_table_item]
  }

  {
    file_path: "test_files/dream-may.pdf",
    options: processing_options,
    pages: [first_page]
  }
end
extract_ocr(image_path,coords) click to toggle source
# File lib/pdf_extract.rb, line 188
# Runs Tesseract OCR over a rectangular region of an image.
#
# image_path - image handed to the OCR engine.
# coords     - Hash holding x1, y1, x2 and y2. Each entry may be keyed by a
#              String ("x1") or a Symbol (:x1); the String key wins when both
#              are present. The example schema in this class uses Symbols.
#
# Prints the image path and the selected region to standard output.
# Returns the recognised text with leading/trailing whitespace stripped.
def self.extract_ocr(image_path, coords)
  x, y, x2, y2 = %w[x1 y1 x2 y2].map do |key|
    # Fall back to the Symbol form only when the String key is absent.
    coords.fetch(key) { coords[key.to_sym] }
  end
  width = x2 - x
  height = y2 - y

  puts image_path
  puts [x, y, width, height]

  engine = Tesseract::Engine.new(language: :eng)
  engine.image = image_path
  engine.select x, y, width, height
  engine.text.strip
end
new(schema) click to toggle source
# File lib/pdf_extract.rb, line 84
# Builds an extractor from a schema Hash and stages the PDF inside a fresh
# working directory.
#
# schema - Hash whose keys are symbolized in place via symbolize_keys!
#          (presumably ActiveSupport's Hash extension - confirm). Keys read:
#          :file_url  - source to download; takes precedence over :file_path
#          :file_path - local file to copy when :file_url is absent
#          :options   - processing options (defaults to an empty Hash)
#          :pages     - page definitions (defaults to an empty Array)
def initialize(schema)
  schema.symbolize_keys!

  # The working directory is named after the current Unix timestamp.
  @base_dir = Time.now.to_i.to_s
  @text_dir = "#{@base_dir}/text_files"
  @image_dir = "#{@base_dir}/image_files"
  @output_dir = "#{@base_dir}/output"
  # setup_folders reads text_dir and output_dir, so the paths above must be
  # assigned before it runs.
  setup_folders(@base_dir)

  if schema[:file_url]
    @file_path = get_file_from_url(schema[:file_url])
  else
    @file_path = get_file_from_path(schema[:file_path])
    puts @file_path
  end

  # Each attribute depends only on its own schema key; empty defaults keep
  # later lookups and iteration from hitting nil.
  @options = schema[:options] || {}
  @pages = schema[:pages] || []
  @results = {}
end

Public Instance Methods

cleanup() click to toggle source
# File lib/pdf_extract.rb, line 130
# Removes the working directory and everything under it by shelling out to
# `rm -r`. Returns whatever the command wrote to standard output.
def cleanup
  target = base_dir
  %x(rm -r #{target})
end
convert_to_image(pages = "all") click to toggle source
# File lib/pdf_extract.rb, line 165
# Renders the PDF to PNG files and lists what ended up in the image directory.
#
# pages - forwarded unchanged to pdf_to_image_files (default: "all").
#
# Returns an Array with the path of every *.png file found in image_dir.
def convert_to_image(pages = "all")
  pdf_to_image_files(pages)
  # Return the listing itself rather than copying it into a throwaway array.
  Dir.glob(image_dir + "/*.png")
end
convert_to_text(pages = "all") click to toggle source
# File lib/pdf_extract.rb, line 154
# Extracts text from the PDF and loads the generated files into memory.
#
# pages - forwarded unchanged to pdf_to_text_files (default: "all").
#
# Prints the collected Hash to standard output.
# Returns a Hash mapping a page key to that file's contents. The key is the
# part of the file name after its last underscore, up to the first dot
# ("temp-file_3.txt" => "3").
def convert_to_text(pages = "all")
  pdf_to_text_files(pages)

  text = Dir.glob(text_dir + "/*.txt").each_with_object({}) do |file, by_page|
    # Parse the file name only, so underscores in directory names
    # (e.g. "text_files") cannot leak into the key.
    page_num = File.basename(file).split("_").last.split(".").first
    # File.read closes the handle once the contents are loaded.
    by_page[page_num] = File.read(file)
  end

  puts text
  text
end
extract_with_ocr(page_path,dimensions) click to toggle source
# File lib/pdf_extract.rb, line 180
# Runs Tesseract OCR over a region of a page image and records the text.
#
# page_path  - image handed to the OCR engine.
# dimensions - Hash; when it holds :x1, :y1, :x2 and :y2 they define the
#              region to read. The recognised text is stored back into it
#              under :result.
#
# Returns the recognised text with leading/trailing whitespace stripped.
def extract_with_ocr(page_path, dimensions)
  corners = %i[x1 y1 x2 y2].map { |key| dimensions[key] }
  region =
    if corners.all?
      x1, y1, x2, y2 = corners
      # Same (x, y, width, height) form that extract_ocr passes to select.
      [x1, y1, x2 - x1, y2 - y1]
    else
      # NOTE(review): rectangle this method used unconditionally before it
      # honoured `dimensions`; kept as the fallback for callers that pass no
      # coordinates. Its origin is not evident from this file.
      [1, 34, 59, 281]
    end

  engine = Tesseract::Engine.new(language: :eng)
  engine.image = page_path
  engine.select(*region)
  text = engine.text.strip
  dimensions[:result] = text
  text
end
get_file_from_path(path) click to toggle source
# File lib/pdf_extract.rb, line 117
# Copies a local PDF into the working directory as "temp-file.pdf".
#
# path - location of the source file; "~" and relative paths are expanded
#        before copying.
#
# Returns the path of the copy. As before, a failed copy is not raised here:
# `cp` reports the problem on standard error and the path is still returned.
def get_file_from_path(path)
  new_path = @base_dir + "/temp-file.pdf"
  # Passing arguments separately bypasses the shell, so spaces and shell
  # metacharacters in the path are taken literally.
  system("cp", File.expand_path(path.to_s), new_path)
  new_path
end
get_file_from_url(file_url) click to toggle source
# File lib/pdf_extract.rb, line 110
# Downloads a PDF into the working directory as "temp-file.pdf".
#
# file_url - location to read from.
#
# NOTE(review): values that do not look like URLs are handed to Kernel#open,
# which on many Ruby versions runs a command when the string starts with
# "|". Only pass trusted values.
#
# Returns the path of the downloaded file.
def get_file_from_url(file_url)
  # Loaded here so the method works even if the file's requires change;
  # a no-op when open-uri is already loaded.
  require "open-uri"

  destination = @base_dir + "/temp-file.pdf"
  # Newer Rubies expose URL fetching as URI.open and no longer route URLs
  # through the bare `open`; older ones only patch Kernel's open.
  opener = URI.respond_to?(:open) ? URI : Kernel
  # Block forms close both handles even when reading or writing fails.
  file_data = opener.open(file_url) { |remote| remote.read }
  # Binary mode keeps the PDF bytes untouched on every platform.
  File.open(destination, "wb") { |file| file.write(file_data) }
  destination
end
pdf_to_image_files(pages) click to toggle source
# File lib/pdf_extract.rb, line 173
# Asks Docsplit to render the PDF at file_path as PNG images in image_dir.
#
# pages - accepted but not used by this method.
#
# Returns whatever Docsplit.extract_images returns.
def pdf_to_image_files(pages)
  source_pdf = file_path
  Docsplit.extract_images(source_pdf, output: image_dir, format: [:png])
end
pdf_to_text_files(pages) click to toggle source
# File lib/pdf_extract.rb, line 177
# Asks Docsplit to extract text from the PDF at file_path into text_dir.
#
# pages - passed to Docsplit as its :pages option.
#
# Returns whatever Docsplit.extract_text returns.
def pdf_to_text_files(pages)
  source_pdf = file_path
  Docsplit.extract_text(source_pdf, output: text_dir, pages: pages)
end
process() click to toggle source
# File lib/pdf_extract.rb, line 123
# Runs the whole extraction: optional protection removal, image rendering,
# optional full-text extraction, then per-page processing.
#
# The working directory is removed afterwards even when a step raises, so a
# failed run does not leave temporary files behind.
#
# Returns the results Hash (:images, optionally :text, plus whatever
# process_pages stores).
def process
  settings = options || {}

  # Both flags must be exactly `true` to take effect.
  remove_protection if settings[:remove_protection] == true
  results[:images] = pdf_to_image_files("all")
  results[:text] = convert_to_text if settings[:extract_all_text] == true
  process_pages
  results
ensure
  cleanup
end
process_pages() click to toggle source
# File lib/pdf_extract.rb, line 138
# Runs a PageExtractor over every configured page and stores each outcome in
# results, keyed by the page's :page value.
#
# Pages whose :match is "page_num" are first annotated with :image_path (the
# rendered PNG for that page number) and :pdf_path. Does nothing when no
# pages are configured.
def process_pages
  (pages || []).each do |page|
    # Read the key up front so pages with another :match value are stored
    # under their own :page rather than all under nil.
    page_num = page[:page]

    if page[:match] == "page_num"
      page[:image_path] = image_dir + "/temp-file_#{page_num}.png"
      page[:pdf_path] = file_path
    end

    page_extractor = PageExtractor.new(page)
    page_extractor.process
    results[page_num] = page_extractor.results
  end
end
remove_protection() click to toggle source
# File lib/pdf_extract.rb, line 133
# Placeholder for stripping protection from the PDF; currently does nothing.
def remove_protection
  # TODO: not implemented yet
  nil
end
setup_folders(folder_name) click to toggle source
# File lib/pdf_extract.rb, line 103
def setup_folders(folder_name)
                `rm -r #{folder_name}` if Dir.exists? folder_name
                `mkdir #{folder_name}`
                `mkdir #{text_dir}`
                `mkdir #{output_dir}`
end