class RSemantic::VectorSpace::Builder

An algebraic model for representing text documents as vectors of identifiers. A document is represented as a vector. Each dimension of the vector corresponds to a separate term. If a term occurs in the document, then the value in the vector is non-zero.

Attributes

parsed_document_cache[R]

Public Class Methods

new(options = {}) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 9
# Sets up a builder with an empty parsed-document cache and a parser.
#
# options - Hash; only :filter_stop_words and :locale are read, and both are
#           forwarded unchanged to Parser.new (nil when absent).
def initialize(options = {})
  @parsed_document_cache = []

  stop_word_filtering = options[:filter_stop_words]
  locale = options[:locale]
  @parser = Parser.new(:filter_stop_words => stop_word_filtering, :locale => locale)
end

Public Instance Methods

build_document_matrix(documents) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 17
# Builds the term/document model for a collection of documents.
#
# Rebuilds @vector_keyword_index from +documents+, creates one vector per
# document via build_vector, and passes the transposed vectors to
# GSL::Matrix.alloc.
#
# documents - a collection responding to +each_with_index+.
#
# Returns a Model built from the document matrix and the keyword index.
# Raises ArgumentError when +documents+ yields no elements.
def build_document_matrix(documents)
  @vector_keyword_index = build_vector_keyword_index(documents)

  document_vectors = documents.each_with_index.map do |document, document_id|
    build_vector(document, document_id)
  end

  # Fail with a clear message: an empty collection used to surface as a
  # NoMethodError on nil, raised by an otherwise unused size computation.
  raise ArgumentError, "documents must not be empty" if document_vectors.empty?

  # TODO check where else we use document_vectors and if we can directly use column based ones
  document_matrix = GSL::Matrix.alloc(*document_vectors.map(&:transpose))

  Model.new(document_matrix, @vector_keyword_index)
end
build_query_vector(term_list) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 31
# Builds a vector for a list of query terms.
#
# The terms are joined with single spaces and passed to build_vector without
# a document id, so the joined text goes through the parser rather than the
# parsed-document cache.
#
# term_list - a collection responding to +join+ (e.g. an Array of Strings).
#
# Returns whatever build_vector returns for the joined text.
def build_query_vector(term_list)
  query_text = term_list.join(" ")
  build_vector(query_text)
end

Private Instance Methods

build_vector(word_string, document_id = nil) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 62
# Builds a vector over @vector_keyword_index for one piece of text.
#
# word_string - text to tokenise; only used when +document_id+ is nil.
# document_id - index into @parsed_document_cache; when given, the cached
#               word list is used and +word_string+ is not parsed again.
#
# Every word found in the keyword index adds 1 to that word's position;
# words missing from the index are skipped. The starting values are whatever
# GSL::Vector.alloc yields (presumably zeros — confirm against rb-gsl).
#
# Returns the vector, converted with +to_v+ when the vector supports it.
def build_vector(word_string, document_id = nil)
  word_list =
    if document_id.nil?
      @parser.tokenise_and_filter(word_string)
    else
      @parsed_document_cache[document_id]
    end

  vector = GSL::Vector.alloc(@vector_keyword_index.length)
  word_list.each do |word|
    next unless @vector_keyword_index.key?(word)

    position = @vector_keyword_index[word]
    vector[position] += 1
  end

  vector.respond_to?(:to_v) ? vector.to_v : vector
end
build_vector_keyword_index(documents) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 36
# Parses and caches +documents+, then maps the resulting unique vocabulary
# to vector positions.
#
# documents - the collection handed on to parse_and_cache.
#
# Returns the word => position mapping produced by
# map_vocabulary_to_vector_positions.
def build_vector_keyword_index(documents)
  parse_and_cache(documents)
  map_vocabulary_to_vector_positions(find_unique_vocabulary)
end
find_unique_vocabulary() click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 48
# Collects every distinct word held in @parsed_document_cache.
#
# The flattened word list is reversed before duplicates are dropped, so the
# result is ordered by each word's last occurrence, latest first. That order
# matters to callers that turn list positions into vector columns.
#
# Returns a new Array; the cache itself is not modified.
def find_unique_vocabulary
  words = @parsed_document_cache.flatten # flatten returns a fresh array
  words.reverse!
  words.uniq
end
map_vocabulary_to_vector_positions(vocabulary_list) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 52
# Assigns each word its position in +vocabulary_list+.
#
# vocabulary_list - ordered collection of words.
#
# Returns a Hash of word => zero-based position. If a word appears more than
# once, the position of its last occurrence is kept.
def map_vocabulary_to_vector_positions(vocabulary_list)
  vocabulary_list.each_with_index.each_with_object({}) do |(word, column), vector_index|
    vector_index[word] = column
  end
end
parse_and_cache(documents) click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 42
# Tokenises every document and stores the result in @parsed_document_cache,
# at the same index the document has in +documents+.
#
# The cache is emptied first. Without that, a call with fewer documents than
# a previous call left the old trailing entries in place, and their words
# leaked into the vocabulary built from the cache.
#
# The cache Array is cleared in place rather than replaced, so references
# obtained through the +parsed_document_cache+ reader stay valid.
#
# documents - a collection responding to +each_with_index+.
#
# Returns the result of +each_with_index+ (the receiver for standard
# collections).
def parse_and_cache(documents)
  @parsed_document_cache.clear
  documents.each_with_index do |document, index|
    @parsed_document_cache[index] = @parser.tokenise_and_filter(document)
  end
end