class KeywordFinder::Keywords

Public Instance Methods

clean_sentence(sentence) click to toggle source
# File lib/keyword_finder/keywords.rb, line 27
# Normalises a sentence so the keyword pattern can be scanned against it.
#
# 1. The punctuation marks . ? , ; are surrounded by spaces so they no
#    longer cling to the neighbouring word.
# 2. Every whitespace character is replaced by two spaces (to_regex doubles
#    the spaces inside keywords in the same way).
# 3. Any whitespace left over that `\s` does not match (e.g. a non-breaking
#    space) is turned into a single plain space.
#
# @param sentence [String] the raw text
# @return [String] the normalised text
def clean_sentence(sentence)
  # '\1' is gsub's back-reference syntax. The previous " $1 " was inserted
  # literally, putting the text "$1" into the sentence in place of the
  # punctuation mark.
  spaced = sentence.gsub(/([.?,;])/, ' \1 ')
  doubled = spaced.gsub(/\s/, '  ')
  doubled.gsub(/[[:space:]]/, ' ')
end
combine_more_specifics(sentence) click to toggle source
# File lib/keyword_finder/keywords.rb, line 31
# Merges words that carry an optional, parenthesised part into one word by
# dropping the parentheses, e.g. "(sub)word" -> "subword" and
# "word(s)" -> "words". Parenthesised text that is not glued to letters is
# left untouched.
#
# @param sentence [String]
# @return [String] the sentence with such words merged
def combine_more_specifics(sentence)
  # parenthesised part followed by letters (optionally preceded by letters)
  followed_by_letters = /([A-Za-z]*\([A-Za-z]*\)[A-Za-z]+)/
  # parenthesised part preceded by letters (optionally followed by letters)
  preceded_by_letters = /([A-Za-z]+\([A-Za-z]*\)[A-Za-z]*)/

  first_pass = sentence.gsub(followed_by_letters) { |word| word.delete('()') }
  first_pass.gsub(preceded_by_letters) { |word| word.delete('()') }
end
escape_regex_chars(string) click to toggle source
# File lib/keyword_finder/keywords.rb, line 6
# Escapes every character that has a special meaning in a regular
# expression and lower-cases the result.
#
# @param string [String]
# @return [String] escaped, lower-cased text usable as a regexp source
def escape_regex_chars(string)
  escaped = Regexp.escape(string)
  escaped.downcase
end
find_in(sentence, options={}) click to toggle source

Finds the keywords in a sentence.

@param [String] sentence that might contain the keywords this instance was initialized with
@param [Hash] options; notably :subsentences_strategy, which can be one of :none, :ignore_if_found_in_main or :always_ignore, and :entire_words_only, which can be true, false or :when_short

# File lib/keyword_finder/keywords.rb, line 55
# Finds the keywords in a sentence.
#
# The lower-cased sentence is scanned twice: once as a whole, and once split
# into its main part and its parenthesised sub-sentences (after
# combine_more_specifics has merged words such as "(sub)word"). Both result
# sets are then reconciled by select_the_best_results.
#
# @param sentence [String] text that might contain the keywords
# @param options [Hash]
#   :subsentences_strategy - :none (default) scans the sub-sentences too,
#     :always_ignore never scans them, :ignore_if_found_in_main scans them
#     only when the main part produced no results
#   :entire_words_only - defaults to true; passed on to scan_part
# @return [Array<String>] the keywords found
def find_in(sentence, options={})
  settings = {
    subsentences_strategy: :none, # :none, :ignore_if_found_in_main, :always_ignore
    entire_words_only: true
  }.merge(options)

  lowered = sentence.downcase.gsub(/\n/, " ")
  whole_sentence_hits = scan_part(lowered, settings)

  parts = separate_main_and_sub_sentences(combine_more_specifics(lowered))
  main_hits = scan_part(parts[:main], settings)

  strategy = settings[:subsentences_strategy]
  skip_subs = strategy == :always_ignore ||
              (main_hits.count > 0 && strategy == :ignore_if_found_in_main)

  sub_hits =
    if skip_subs
      []
    else
      parts[:subs].collect { |sub| scan_part(sub, settings) }.flatten
    end

  select_the_best_results(main_hits + sub_hits, whole_sentence_hits)
end
ordered_by_length() click to toggle source
# File lib/keyword_finder/keywords.rb, line 3
# Returns the elements of this collection sorted from longest to shortest.
#
# @return [Array] a new, sorted array; the receiver is not modified
def ordered_by_length
  sort do |left, right|
    right.length <=> left.length
  end
end
scan_in(sentence, options={}) click to toggle source
# File lib/keyword_finder/keywords.rb, line 23
# Scans a sentence with the keyword pattern built by to_regex.
#
# The sentence is wrapped in one leading and one trailing space before
# scanning, so a pattern that expects whitespace around a keyword can also
# match at the very start or end.
#
# @param sentence [String]
# @param options [Hash] passed on to to_regex
# @return [Array] the raw result of String#scan
def scan_in(sentence, options={})
  padded = " #{sentence} "
  pattern = to_regex(options)
  padded.scan(pattern)
end
scan_part(sentence, options={}) click to toggle source
# File lib/keyword_finder/keywords.rb, line 37
# Scans one piece of text for keywords and tidies up the matches.
#
# The text is normalised with clean_sentence and scanned with scan_in. The
# matches are flattened, de-duplicated and stripped, blank matches are
# dropped, and doubled spaces inside a match are collapsed to single ones.
#
# @param sentence [String]
# @param options [Hash] passed on to scan_in
# @return [Array<String>] the keywords found, in order of first appearance
def scan_part(sentence, options={})
  raw_matches = scan_in(clean_sentence(sentence), options)

  raw_matches.
    flatten.
    uniq.
    compact.
    map { |match| match.strip }.
    reject { |match| match.empty? }.
    map { |match| match.gsub('  ', ' ') }
end
select_the_best_results(result_set_a, result_set_b) click to toggle source
# File lib/keyword_finder/keywords.rb, line 82
# Reconciles two result sets by dropping entries that are contained in an
# entry of the other set.
#
# For every pair (a, b): when a contains b, b is dropped from set b;
# otherwise, when b contains a, a is dropped from set a. Entries present in
# both sets are therefore kept only in set a.
#
# Note that the dropped entries are deleted from the arrays passed in.
#
# @param result_set_a [Array<String>]
# @param result_set_b [Array<String>]
# @return [Array<String>] what is left of set a followed by what is left of set b
def select_the_best_results(result_set_a, result_set_b)
  redundant_in_a = []
  redundant_in_b = []

  # Collect first and delete afterwards, so every pair is compared against
  # the complete, unmodified sets.
  result_set_a.product(result_set_b).each do |from_a, from_b|
    if from_a.match(escape_regex_chars(from_b))
      redundant_in_b << from_b
    elsif from_b.match(escape_regex_chars(from_a))
      redundant_in_a << from_a
    end
  end

  redundant_in_a.each { |entry| result_set_a.delete(entry) }
  redundant_in_b.each { |entry| result_set_b.delete(entry) }

  result_set_a + result_set_b
end
separate_main_and_sub_sentences(sentence) click to toggle source
# File lib/keyword_finder/keywords.rb, line 107
# Splits a sentence into its main part and its parenthesised sub-sentences.
#
# Each balanced "( ... )" group is removed from the sentence and returned,
# without its outer parentheses, as a sub-sentence. Nested parentheses stay
# part of the enclosing group.
#
# @param sentence [String]
# @return [Hash] { main: String, subs: Array<String> }; all values stripped
def separate_main_and_sub_sentences(sentence)
  # Matches one balanced group. The previous greedy /\(.*\)/ ran from the
  # first "(" to the last ")", so "a (b) c (d) e" produced the single
  # sub-sentence "b) c (d" and lost "c" from the main sentence.
  balanced_group = /(?<group>\((?:[^()]|\g<group>)*\))/

  subs = []
  main = sentence.gsub(balanced_group) do |group|
    subs << group[1..-2].strip # drop the enclosing parentheses
    ""
  end

  { main: main.strip, subs: subs }
end
to_regex(options={}) click to toggle source
# File lib/keyword_finder/keywords.rb, line 10
# Builds (and caches) the regular expression that matches any keyword.
#
# Keywords are tried longest first. Spaces inside a keyword are doubled
# before escaping. Depending on :entire_words_only each keyword is wrapped
# in `\s` on both sides:
#   true (default) - every keyword is wrapped
#   false          - no keyword is wrapped
#   :when_short    - only keywords of at most 3 characters are wrapped
#
# The result is cached per :entire_words_only value; the cache does not
# take the current keyword list into account.
#
# @param options [Hash] only :entire_words_only is used
# @return [Regexp] one capture group holding all keyword alternatives
def to_regex(options={})
  mode = { entire_words_only: true }.merge(options)[:entire_words_only]

  @to_regex = {} unless defined?(@to_regex)
  return @to_regex[mode] if @to_regex[mode]

  boundary = mode ? "\\s" : ""
  alternatives = ordered_by_length.collect do |keyword|
    edge = (mode == :when_short && keyword.length > 3) ? "" : boundary
    "#{edge}#{escape_regex_chars(keyword.gsub(' ', '  '))}#{edge}"
  end

  @to_regex[mode] = Regexp.new("(#{alternatives.join('|')})")
end