class RSemantic::VectorSpace::Builder
An algebraic model for representing text documents as vectors of identifiers. A document is represented as a vector. Each dimension of the vector corresponds to a separate term. If a term occurs in the document, then the value in the vector is non-zero.
Attributes
parsed_document_cache[R]
Public Class Methods
new(options = {})
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 9
# Creates a builder with its own parser and an empty parsed-document cache.
#
# options - Hash of settings; only the :filter_stop_words and :locale entries
#           are read, and both are forwarded to Parser.new.
def initialize(options = {})
  filter_stop_words = options[:filter_stop_words]
  locale = options[:locale]

  @parser = Parser.new(:filter_stop_words => filter_stop_words, :locale => locale)
  @parsed_document_cache = []
end
Public Instance Methods
build_document_matrix(documents)
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 17
# Builds a Model from a collection of documents.
#
# The keyword index is rebuilt from +documents+ (which also refreshes the
# parsed-document cache), then one vector per document is built from the
# cached tokens, and the transposed vectors are handed to GSL::Matrix.alloc.
#
# documents - collection of documents; must respond to each_with_index.
#
# Returns a Model wrapping the document matrix and the keyword index.
# Raises ArgumentError if +documents+ yields no documents.
def build_document_matrix(documents)
  @vector_keyword_index = build_vector_keyword_index(documents)

  document_vectors = documents.enum_for(:each_with_index).map do |document, document_id|
    build_vector(document, document_id)
  end

  # An empty collection previously failed with an opaque NoMethodError on nil;
  # fail with a descriptive error instead.
  raise ArgumentError, "documents must not be empty" if document_vectors.empty?

  # TODO check where else we use document_vectors and if we can directly use column based ones
  document_matrix = GSL::Matrix.alloc(*document_vectors.map(&:transpose))
  Model.new(document_matrix, @vector_keyword_index)
end
build_query_vector(term_list)
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 31
# Builds a vector for a query supplied as a list of terms.
#
# The terms are joined with single spaces and the resulting string is passed
# to build_vector without a document id.
#
# term_list - list of query terms; must respond to join.
def build_query_vector(term_list)
  query_string = term_list.join(" ")
  build_vector(query_string)
end
Private Instance Methods
build_vector(word_string, document_id = nil)
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 62
# Builds a term-count vector over the current keyword index.
#
# word_string - text to tokenise; only used when +document_id+ is nil.
# document_id - optional index into the parsed-document cache; when given,
#               the cached word list is used instead of parsing +word_string+.
#
# Each word present in the keyword index increments that word's position in
# the vector; words not in the index are skipped.
#
# Returns the vector, converted with to_v when the vector supports it.
def build_vector(word_string, document_id = nil)
  word_list =
    if document_id.nil?
      @parser.tokenise_and_filter(word_string)
    else
      @parsed_document_cache[document_id]
    end

  vector = GSL::Vector.alloc(@vector_keyword_index.length)

  word_list.each do |word|
    next unless @vector_keyword_index.key?(word)

    position = @vector_keyword_index[word]
    vector[position] += 1
  end

  return vector unless vector.respond_to?(:to_v)

  vector.to_v
end
build_vector_keyword_index(documents)
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 36
# Builds the word => vector position index for a collection of documents.
#
# Parses and caches the documents first, then maps the unique vocabulary
# gathered from the cache to vector positions.
def build_vector_keyword_index(documents)
  parse_and_cache(documents)
  map_vocabulary_to_vector_positions(find_unique_vocabulary)
end
find_unique_vocabulary()
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 48
# Collects the distinct words found in the parsed-document cache.
#
# The cached word lists are flattened into one list, which is reversed before
# duplicates are dropped, so each word keeps the position of its last
# occurrence counted from the end of the flattened list.
#
# Returns an Array of unique words.
def find_unique_vocabulary
  all_words = @parsed_document_cache.flatten
  all_words.reverse.uniq
end
map_vocabulary_to_vector_positions(vocabulary_list)
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 52
# Maps each word of the vocabulary to its position in the list.
#
# vocabulary_list - ordered list of words; positions start at 0.
#
# Returns a Hash of word => position. If a word appears more than once, its
# last position wins.
def map_vocabulary_to_vector_positions(vocabulary_list)
  vocabulary_list.each_with_index.to_h
end
parse_and_cache(documents)
click to toggle source
# File lib/rsemantic/vector_space/builder.rb, line 42
# Tokenises every document with the parser and stores the result in the
# parsed-document cache under the document's index.
#
# The cache is emptied first: entries left over from an earlier, longer
# document set would otherwise survive and be picked up when the vocabulary
# is gathered from the cache. It is cleared in place so references obtained
# through the parsed_document_cache reader stay valid.
#
# documents - collection of documents; must respond to each_with_index.
#
# Returns whatever documents.each_with_index returns.
def parse_and_cache(documents)
  @parsed_document_cache.clear

  documents.each_with_index do |document, index|
    @parsed_document_cache[index] = @parser.tokenise_and_filter(document)
  end
end