class PdfSearch::PdfIndex

Attributes

properties[R]
search_input_fields[R]
daemon[RW]

Public Class Methods

create_index() click to toggle source
# File lib/pdf_index.rb, line 12
# Creates the 'pdf_pages' Elasticsearch index.
#
# The mapping for the 'document' type always contains a 'text' field of type
# 'text'; any properties collected in @properties are merged on top of it.
#
# Returns true when the index was created, and false when Elasticsearch
# answered with a BadRequest whose message reports a
# resource_already_exists_exception / index_already_exists_exception.
# Any other BadRequest is re-raised.
def self.create_index
  indices = ::PdfSearch::ElasticSearchClient.indices
  page_properties = { text: { type: 'text' } }.merge(@properties ||= {})

  indices.create(
    index: 'pdf_pages',
    body: { mappings: { document: { properties: page_properties } } }
  )

  true
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => error
  already_exists = /"type":"(?:resource|index)_already_exists_exception"/.match?(error.message)
  # Only the "index already exists" case is tolerated; everything else bubbles up.
  raise error unless already_exists

  false
end
new(pdf_dir) click to toggle source
# File lib/pdf_index.rb, line 65
# Builds an index bound to +pdf_dir+ (the object whose #pages are walked by
# #reindex) and to the shared ::PdfSearch::ElasticSearchClient.
def initialize(pdf_dir)
  @pdf_dir, @els_client = pdf_dir, ::PdfSearch::ElasticSearchClient
end
property(property_name, type, options = {}) click to toggle source
# File lib/pdf_index.rb, line 38
# Declares an additional property for the index mapping.
#
# property_name - key of the property; stored in @properties as
#                 { type: type }.
# type          - mapping type of the property (e.g. 'text').
# options       - optional Hash. When it contains a truthy :search value,
#                 the property is also registered as a search input field
#                 of that type, both in @search_input_fields
#                 (name => search type) and in @search_input_fields_by_type
#                 (search type => [names]).
#
# The options hash is only read, never modified, so callers may pass a
# shared or frozen hash.
def self.property(property_name, type, options = {})
  @properties ||= {}
  @properties[property_name] = { type: type }

  # Read (rather than delete) the option so the caller's hash stays intact.
  search_input_type = options[:search]
  return unless search_input_type

  @search_input_fields ||= {}
  @search_input_fields[property_name] = search_input_type

  @search_input_fields_by_type ||= Hash.new { |hash, key| hash[key] = [] }
  @search_input_fields_by_type[search_input_type].push(property_name)
end
start_daemon(dir) click to toggle source
# File lib/pdf_index.rb, line 70
    # Builds an index for the PDFs under +dir+ and starts its indexing loop.
    #
    # When the DEBUG_PDF_INDEXING environment variable is set, #index_loop is
    # run directly in the calling process. Otherwise #index_loop is handed to
    # Daemons.call(multiple: true) and the returned handle is stored in the
    # index's +daemon+ attribute.
    #
    # Returns the index instance.
    def self.start_daemon(dir)
      new(::PdfSearch::PdfDir.new(dir)).tap do |index|
        if ENV['DEBUG_PDF_INDEXING']
          index.index_loop
        else
          index.daemon = Daemons.call(multiple: true, &index.method(:index_loop))
        end
      end
    end

Public Instance Methods

additional_document_data(page, reader, pdf_id) click to toggle source

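Override additional_document_data in your custom index class to add extra fields to every indexed page document. The hash it returns is merged into the document body. For example: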

# Example subclass that adds an organisation_id field to every page document.
class CustomIndex < PdfSearch::PdfIndex
  # Every attribute returned by additional_document_data (here:
  # organisation_id) has to be declared with `property`, so that it is part of
  # the mapping when the index is created.
  #
  #        Name of property  Type
  property :organisation_id, 'string' # or 'text' etc.

  # Placeholder: resolve the organisation id for the given PDF page.
  # Takes the same two arguments it is called with below.
  def get_organisation_id(pdf_id, page)
    # ...
  end

  def additional_document_data(page, reader, pdf_id)
    {
      organisation_id: get_organisation_id(pdf_id, page)
    }
  end
end

# File lib/pdf_index.rb, line 109
# Hook for subclasses: extra fields to store alongside a page's text.
#
# #reindex calls this with the page, its reader and the PDF id, and passes
# the result on to #create_page_document. This default implementation
# ignores all arguments and contributes no extra fields.
#
# Returns an empty Hash.
def additional_document_data(page, reader, pdf_id)
  {}
end
combined_pdf_page_id(pdf_id, text) click to toggle source
# File lib/pdf_index.rb, line 127
# Builds the document id for one page: the PDF id, a dash, and the SHA-256
# hex digest of the page text.
#
# Returns a String of the form "<pdf_id>-<sha256 hex>".
def combined_pdf_page_id(pdf_id, text)
  text_digest = Digest::SHA256.hexdigest(text)
  "#{pdf_id}-#{text_digest}"
end
create_page_document(pdf_id, text, additional_data) click to toggle source
# File lib/pdf_index.rb, line 116
# Sends a single page to the 'pdf_pages' index as a 'document'.
#
# The document id comes from #combined_pdf_page_id; the body is
# { text: text } with +additional_data+ merged on top (so keys in
# additional_data win on collision).
#
# Returns whatever @els_client.create returns.
def create_page_document(pdf_id, text, additional_data)
  document_id = combined_pdf_page_id(pdf_id, text)
  document_body = { text: text }.merge(additional_data)

  @els_client.create(index: 'pdf_pages', type: 'document', id: document_id, body: document_body)
end
index_loop() click to toggle source
# File lib/pdf_index.rb, line 80
# Runs #reindex over and over, with no pause between passes and no exit
# condition of its own.
def index_loop
  loop { reindex }
end
pid() click to toggle source
# File lib/pdf_index.rb, line 86
   # Process id reported by the daemon handle stored in +daemon+.
   #
   # Reads daemon.pid and returns that object's own #pid.
   # NOTE(review): presumably +daemon+ is only set by start_daemon outside
   # debug mode; this raises NoMethodError when it is nil - confirm callers.
   def pid
     pid_holder = daemon.pid
     pid_holder.pid
   end
properties() click to toggle source
# File lib/pdf_index.rb, line 61
# Instance-level shortcut for the +properties+ reader of this object's class.
def properties
  index_class = self.class
  index_class.properties
end
reindex() click to toggle source
# File lib/pdf_index.rb, line 132
        # Walks every entry of @pdf_dir.pages and indexes it once.
        #
        # Each entry is destructured into (page, reader, pdf_id). The page's
        # text is sent to #create_page_document together with whatever
        # #additional_document_data returns for it. A Conflict error from
        # Elasticsearch is treated as "already indexed": the page is skipped
        # and a notice is printed to standard output.
        def reindex
          @pdf_dir.pages.each do |page, reader, pdf_id|
            extra_fields = additional_document_data(page, reader, pdf_id)

            begin
              create_page_document(pdf_id, page.text, extra_fields)
            rescue Elasticsearch::Transport::Transport::Errors::Conflict
              puts "Skipping document #{combined_pdf_page_id(pdf_id, page.text)}, already indexed"
            end
          end
        end
search_input_fields() click to toggle source
# File lib/pdf_index.rb, line 57
# Instance-level shortcut for the +search_input_fields+ reader of this
# object's class.
def search_input_fields
  index_class = self.class
  index_class.search_input_fields
end
search_input_fields_by_type() click to toggle source
# File lib/pdf_index.rb, line 53
# Reads the class-level @search_input_fields_by_type registry directly from
# this object's class. Returns nil when that instance variable has not been
# set on the class.
def search_input_fields_by_type
  index_class = self.class
  index_class.instance_variable_get(:@search_input_fields_by_type)
end
update_page_document(pdf_id, text, additional_data) click to toggle source
# File lib/pdf_index.rb, line 113
# Placeholder with the same signature as #create_page_document.
# It currently does nothing with its arguments and returns nil.
def update_page_document(pdf_id, text, additional_data)
  nil
end