module Pdftohtml

PDF To HTML Module: Root Module for Pdftohtml.

PDF To HTML Module

Constants

VERSION

Version

Public Class Methods

convert(pdf_file) click to toggle source

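Convert PDF to HTML: Converts the file pointed to by pdf_file into a hash of HTML pages. Each key is the page number taken from the generated output-N.html file name, and each value is the array of lines of that page. @param [String] pdf_file Path to a PDF file. @return [Hash] A hash of HTML pages of the form { 0 => ['Line0', 'Line1', …], 1 => ['Line0', 'Line1', …], … }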

# File lib/pdftohtml.rb, line 15
# Converts a PDF into HTML by running the external `pdftohtml` tool in a
# private scratch directory and reading back the per-page files it writes.
#
# @param [String] pdf_file Path to a PDF file
# @return [Hash] page number (Integer, taken from the `output-N.html` file
#   name) => Array of that page's lines with line endings removed
# @raise [Errno::ENOENT] when `pdftohtml` did not write its wrapper files,
#   i.e. the conversion did not run or failed
def self.convert(pdf_file)
  # Required here so this method is self-contained; repeated calls are a no-op.
  require 'tmpdir'

  # The block form creates a uniquely named, owner-only directory and removes
  # it when the block exits, including when an exception is raised.
  Dir.mktmpdir('pdftohtml-') do |out_path|
    # Argument-vector form: no shell is involved, so quotes, `$(...)` or
    # backticks inside the file name are passed through as plain text.
    # stdout is discarded, as it was when the command output was ignored.
    system('pdftohtml', '-c', '-i', pdf_file.to_s, File.join(out_path, 'output'),
           :out => File::NULL)

    # This method expects the tool to write these two wrapper files next to
    # the pages; a missing one is treated as a failed conversion.
    %w[output.html output_ind.html].each do |wrapper|
      wrapper_path = File.join(out_path, wrapper)
      raise Errno::ENOENT, wrapper_path unless File.exist?(wrapper_path)
    end

    # Only files named exactly output-<digits>.html are pages; the wrapper
    # files and anything else the tool emits are ignored.
    numbered = Dir.entries(out_path).map do |name|
      digits = name[/\Aoutput-([0-9]+)\.html\z/, 1]
      [digits.to_i, name] if digits
    end.compact

    # Insert in numeric page order so page 10 follows page 9, not page 1.
    numbered.sort.each_with_object({}) do |(number, name), pages|
      pages[number] = File.readlines(File.join(out_path, name)).map(&:chomp)
    end
  end
end