class PdfSearch::PdfIndex
Attributes
properties[R]
search_input_fields[R]
daemon[RW]
Public Class Methods
create_index()
click to toggle source
# File lib/pdf_index.rb, line 12
# Creates the 'pdf_pages' Elasticsearch index. The mapping always contains a
# `text` field of type 'text', merged with every property registered in
# @properties (an empty hash is used, and stored, when none is set yet).
#
# Returns true when the index was created and false when Elasticsearch
# rejects the request because the index already exists. Any other
# BadRequest error is re-raised unchanged.
def self.create_index
  ::PdfSearch::ElasticSearchClient.indices.create(
    index: 'pdf_pages',
    body: {
      mappings: {
        document: {
          properties: { text: { type: 'text' } }.merge(@properties ||= {})
        }
      }
    }
  )
  true
rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e
  # Only the "already exists" rejection is an expected outcome.
  raise unless /"type":"(?:resource|index)_already_exists_exception"/.match?(e.message)

  false
end
new(pdf_dir)
click to toggle source
# File lib/pdf_index.rb, line 65
# Binds the index to one PDF source and to the shared Elasticsearch client.
#
# pdf_dir - object whose `pages` are iterated by `reindex`.
def initialize(pdf_dir)
  @pdf_dir = pdf_dir
  @els_client = ::PdfSearch::ElasticSearchClient
end
property(property_name, type, options = {})
click to toggle source
# File lib/pdf_index.rb, line 38
# Registers an additional document property and, optionally, a search input
# field for it.
#
# property_name - key of the property; stored in @properties as {type: type}.
# type          - mapping type for the property.
# options       - optional hash; when it has a truthy :search value, that
#                 value is recorded as the property's search input type in
#                 @search_input_fields and the property name is appended to
#                 @search_input_fields_by_type[search input type].
#
# The options hash is only read, never modified, so callers may pass a
# shared or frozen hash.
def self.property(property_name, type, options = {})
  @properties ||= {}
  @properties[property_name] = { type: type }

  search_input_type = options[:search]
  return unless search_input_type

  @search_input_fields ||= {}
  @search_input_fields[property_name] = search_input_type
  @search_input_fields_by_type ||= Hash.new { |h, k| h[k] = [] }
  @search_input_fields_by_type[search_input_type].push(property_name)
end
start_daemon(dir)
click to toggle source
# File lib/pdf_index.rb, line 70
# Builds an index for the PDF directory `dir` and starts its indexing loop.
#
# When the DEBUG_PDF_INDEXING environment variable is set, index_loop is
# called directly in the current process. Otherwise index_loop is handed to
# Daemons.call and the result is stored in the index's `daemon` attribute.
#
# Returns the index instance.
def self.start_daemon(dir)
  new(::PdfSearch::PdfDir.new(dir)).tap do |pdf_index|
    if ENV['DEBUG_PDF_INDEXING']
      pdf_index.index_loop
    else
      pdf_index.daemon = Daemons.call(multiple: true, &pdf_index.method(:index_loop))
    end
  end
end
Public Instance Methods
additional_document_data(page, reader, pdf_id)
click to toggle source
additional_document_data
can be overridden by your custom index class to add extra fields to every indexed page document, for example:
class CustomIndex < PdfSearch::PdfIndex
  # Every attribute returned by additional_document_data (organisation_id
  # here) has to be declared, so that it is created when the index is
  # created or updated:
  #
  #        Name of property  TYPE
  property :organisation_id, 'string' # or 'text' etc.

  # Takes the same arguments it is called with below.
  def get_organisation_id(pdf_id, page)
    # ...
  end

  def additional_document_data(page, reader, pdf_id)
    { organisation_id: get_organisation_id(pdf_id, page) }
  end
end
# File lib/pdf_index.rb, line 109
# Hook for subclasses: returns extra fields for one page. `reindex` passes
# the result to create_page_document, which merges it into the document body.
# The base implementation ignores its arguments and contributes no fields.
def additional_document_data(page, reader, pdf_id)
  {}
end
combined_pdf_page_id(pdf_id, text)
click to toggle source
# File lib/pdf_index.rb, line 127
# Builds the document id for a page: the pdf id, a dash, and the SHA-256 hex
# digest of the page text.
def combined_pdf_page_id(pdf_id, text)
  text_digest = Digest::SHA256.hexdigest(text)
  "#{pdf_id}-#{text_digest}"
end
create_page_document(pdf_id, text, additional_data)
click to toggle source
# File lib/pdf_index.rb, line 116
# Creates one page document in the 'pdf_pages' index through @els_client.
#
# The document id comes from combined_pdf_page_id(pdf_id, text). The body is
# {text: text} merged with additional_data, so a :text key in
# additional_data takes precedence.
#
# Returns whatever the client's `create` call returns.
def create_page_document(pdf_id, text, additional_data)
  document_id = combined_pdf_page_id(pdf_id, text)
  document_body = { text: text }.merge(additional_data)

  @els_client.create(
    index: 'pdf_pages',
    type: 'document',
    id: document_id,
    body: document_body
  )
end
index_loop()
click to toggle source
# File lib/pdf_index.rb, line 80
# Calls reindex over and over. The loop has no exit condition of its own.
def index_loop
  loop { reindex }
end
pid()
click to toggle source
# File lib/pdf_index.rb, line 86
# Returns `daemon.pid.pid`.
# NOTE(review): presumably `daemon.pid` is the daemon's pid-file object and
# its `pid` the numeric process id - confirm against the Daemons gem.
def pid
  daemon_pid = daemon.pid
  daemon_pid.pid
end
properties()
click to toggle source
# File lib/pdf_index.rb, line 61
# Instance-level shortcut for the class-level `properties` reader.
def properties
  index_class = self.class
  index_class.properties
end
reindex()
click to toggle source
# File lib/pdf_index.rb, line 132
# Indexes every page of @pdf_dir.
#
# Each entry of @pdf_dir.pages is destructured into (page, reader, pdf_id).
# For each page, additional_document_data is merged into the document created
# by create_page_document. A Conflict error from Elasticsearch is reported on
# stdout as an already-indexed page and skipped; other errors propagate.
def reindex
  @pdf_dir.pages.each do |page, reader, pdf_id|
    additional_data = additional_document_data(page, reader, pdf_id)
    # Read the page text once; it is needed for the document and the skip message.
    text = page.text
    begin
      create_page_document(pdf_id, text, additional_data)
    rescue Elasticsearch::Transport::Transport::Errors::Conflict
      puts "Skipping document #{combined_pdf_page_id(pdf_id, text)}, already indexed"
    end
  end
end
search_input_fields()
click to toggle source
# File lib/pdf_index.rb, line 57
# Instance-level shortcut for the class-level `search_input_fields` reader.
def search_input_fields
  index_class = self.class
  index_class.search_input_fields
end
search_input_fields_by_type()
click to toggle source
# File lib/pdf_index.rb, line 53
# Reads the class's @search_input_fields_by_type instance variable directly.
# Returns nil when the class has never set it.
def search_input_fields_by_type
  index_class = self.class
  index_class.instance_variable_get(:@search_input_fields_by_type)
end
update_page_document(pdf_id, text, additional_data)
click to toggle source
# File lib/pdf_index.rb, line 113
# Currently a no-op: accepts the same arguments as create_page_document,
# does nothing with them and returns nil.
def update_page_document(pdf_id, text, additional_data)
  nil
end