class RSemantic::Parser

Public Class Methods

new(options = {}) click to toggle source
# File lib/rsemantic/parser.rb, line 6
# Builds a parser configured by +options+.
#
# options - Hash of settings read by this constructor:
#           :filter_stop_words - truthy to load a stop list and filter with it.
#           :stem_words        - truthy to stem tokens.
#           :locale            - name of the stop list to load (default 'en').
#
# The stop list is only read from disk when :filter_stop_words is truthy;
# otherwise @stopwords is never assigned.
def initialize(options = {})
  @filter_stop_words = options[:filter_stop_words]
  @stem_words        = options[:stem_words]
  locale             = options[:locale] || 'en'
  return unless @filter_stop_words

  # English stopwords from ftp://ftp.cs.cornell.edu/pub/smart/english.stop
  # TODO: nicer way to reference stop file location?
  stop_file = "#{File.dirname(__FILE__)}/../../resources/#{locale}.stop"

  # The file is split on whitespace, one set entry per token.
  @stopwords = Set.new(File.read(stop_file).split)
end

Public Instance Methods

clean(string) click to toggle source

Normalises a string for tokenising: removes full stops, collapses runs of whitespace into a single space, and converts the text to lower case.

# File lib/rsemantic/parser.rb, line 26
# Normalises +string+ ahead of tokenising.
#
# Every "." is deleted, each run of whitespace becomes one space, and the
# result is lower-cased. Leading/trailing whitespace is collapsed but not
# stripped. The argument itself is not modified.
#
# Returns the normalised String.
def clean(string)
  string
    .gsub(".", "")
    .gsub(/\s+/, " ")
    .downcase
end
remove_stop_words(list) click to toggle source

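Stop words are common words that have no search value. They are removed from the list only when the parser was created with the :filter_stop_words option.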

# File lib/rsemantic/parser.rb, line 34
# Drops every entry of +list+ that appears in @stopwords.
#
# When @filter_stop_words is not set, +list+ is handed back untouched
# (the very same object) and @stopwords is never consulted.
#
# Returns the filtered collection.
def remove_stop_words(list)
  return list unless @filter_stop_words

  list.reject { |candidate| @stopwords.include?(candidate) }
end
tokenise_and_filter(string) click to toggle source
# File lib/rsemantic/parser.rb, line 20
# Tokenises +string+, removes stop words, then stems whatever is left.
#
# Stop words are removed BEFORE stemming. The stop list is presumably made
# of whole, unstemmed words (TODO confirm against the resource files), so a
# token that has already been stemmed may no longer equal its stop-list
# entry and would slip through the filter.
#
# string - the raw text to tokenise.
#
# Returns an Array of tokens. Filtering is applied by remove_stop_words
# (only when @filter_stop_words is set); stemming is applied only when
# @stem_words is set.
def tokenise_and_filter(string)
  words = remove_stop_words(clean(string).split(" "))
  @stem_words ? words.map(&:stem) : words
end
tokenise_and_stem(string) click to toggle source
# File lib/rsemantic/parser.rb, line 42
# Splits +string+ into tokens and, when enabled, stems each one.
#
# The text is first passed through clean, then split on spaces. Each token
# is replaced by the result of calling #stem on it only when @stem_words is
# set; otherwise the tokens are returned as split.
#
# Returns an Array of String tokens.
def tokenise_and_stem(string)
  tokens = clean(string).split(" ")
  return tokens unless @stem_words

  tokens.map { |token| token.stem }
end