class RSemantic::Parser
Public Class Methods
new(options = {})
click to toggle source
# File lib/rsemantic/parser.rb, line 6
# Builds a parser from an options hash.
#
# Keys read from +options+:
#   :filter_stop_words - when truthy, the stop-word list is loaded from disk
#   :stem_words        - stored as-is in @stem_words
#   :locale            - picks the "<locale>.stop" resource file; defaults to 'en'
#
# @stopwords is only assigned when :filter_stop_words is truthy.
def initialize(options = {})
  @filter_stop_words = options[:filter_stop_words]
  @stem_words = options[:stem_words]
  locale = options[:locale] || 'en'

  # Nothing to load when stop-word filtering is disabled.
  return unless @filter_stop_words

  # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
  # TODO: nicer way to reference stop file location?
  stop_file = "#{File.dirname(__FILE__)}/../../resources/#{locale}.stop"

  # The file is split on whitespace, so each whitespace-separated token
  # becomes one entry of the set.
  File.open(stop_file, 'r') { |io| @stopwords = Set.new(io.read.split) }
end
Public Instance Methods
clean(string)
click to toggle source
Removes periods, collapses runs of whitespace into single spaces, and downcases the string.
# File lib/rsemantic/parser.rb, line 26
# Normalises +string+ for tokenising and returns the result as a new string:
# every "." is removed, each run of whitespace is collapsed to a single
# space, and the text is lower-cased. The argument itself is not modified.
def clean(string)
  string.gsub(".", "").gsub(/\s+/, " ").downcase
end
remove_stop_words(list)
click to toggle source
Stop words are common words that have no search value; they are removed from the list only when stop-word filtering is enabled.
# File lib/rsemantic/parser.rb, line 34
# Returns +list+ without the words found in @stopwords.
# When @filter_stop_words is falsy, +list+ itself is returned untouched.
def remove_stop_words(list)
  return list unless @filter_stop_words

  list.reject { |token| @stopwords.include?(token) }
end
tokenise_and_filter(string)
click to toggle source
# File lib/rsemantic/parser.rb, line 20
# Tokenises +string+ via tokenise_and_stem, then passes the resulting
# word list through remove_stop_words and returns what that yields.
def tokenise_and_filter(string)
  remove_stop_words(tokenise_and_stem(string))
end
tokenise_and_stem(string)
click to toggle source
# File lib/rsemantic/parser.rb, line 42
# Cleans +string+ with #clean, splits it on spaces, and returns the words.
# When @stem_words is truthy each word is replaced by the result of calling
# #stem on it; otherwise the words are returned as split.
# NOTE(review): String#stem is not defined in the visible source; presumably
# it is provided by a stemmer library loaded elsewhere. Confirm.
def tokenise_and_stem(string)
  tokens = clean(string).split(" ")
  return tokens unless @stem_words

  tokens.map { |token| token.stem }
end