class Attentive::Tokenizer

Constants

CHARACTER_SUBSTITIONS
CONDITIONAL_NUMBER
CONDITIONAL_NUMBER_START
EMOJI_END
EMOJI_START
ENTITY
ENTITY_START
NUMBER
NUMBER_START
PUNCTUATION
REGEXP_START
WHITESPACE
WORD

Attributes

chars[R]
message[R]
options[R]
tokens[R]

Public Class Methods

new(message, options={})
# File lib/attentive/tokenizer.rb, line 18
def initialize(message, options={})
  @message = message.downcase
  @chars = self.message.each_char.to_a
  @options = options
end
tokenize(message, options={})
# File lib/attentive/tokenizer.rb, line 12
def self.tokenize(message, options={})
  self.new(message, options).tokenize
end

Public Instance Methods

match_entities?()
# File lib/attentive/tokenizer.rb, line 24
def match_entities?
  options.fetch(:entities, false)
end
match_regexps?()
# File lib/attentive/tokenizer.rb, line 28
def match_regexps?
  options.fetch(:regexps, false)
end
perform_substitutions?()
# File lib/attentive/tokenizer.rb, line 32
def perform_substitutions?
  options.fetch(:substitutions, true)
end
tokenize()
# File lib/attentive/tokenizer.rb, line 38
def tokenize
  i = 0
  @tokens = []
  @leaves = []

  while i < chars.length
    char = chars[i]
    char = CHARACTER_SUBSTITIONS.fetch(char, char)
    pos = tokens.any? ? tokens.last.end : 0

    if WHITESPACE === char && string = match_whitespace_at(i)
      add_token whitespace(string, pos: pos)
      i += string.length

    elsif ENTITY_START === char && string = match_entity_at(i)
      add_token entity(string, pos: pos)
      i += string.length + 4

    elsif NUMBER_START === char && string = match_number_at(i)
      add_token word(string, pos: pos)
      i += string.length

    elsif EMOJI_START === char && string = match_emoji_at(i)
      add_token emoji(string, pos: pos)
      i += string.length + 2

    elsif REGEXP_START === char && string = match_regexp_at(i)
      add_token regexp(string, pos: pos)
      i += string.length

    elsif PUNCTUATION === char
      add_token punctuation(char, pos: pos)
      i += 1

    else string = match_word_at(i)
      add_token word(string, pos: pos)
      i += string.length

    end
  end

  Attentive::Phrase.new(tokens)
end

Private Instance Methods

add_token(token)
# File lib/attentive/tokenizer.rb, line 162
def add_token(token)
  @tokens << token
  return unless perform_substitutions?
  @leaves = add_token_to_leaves token, @leaves
end
add_token_to_leaves(token, leaves)
# File lib/attentive/tokenizer.rb, line 168
def add_token_to_leaves(token, leaves)
  (leaves + [Attentive.substitutions]).each_with_object([]) do |leaf, new_leaves|
    if new_leaf = leaf[token]
      if new_leaf.fin?
        i = -1 - leaf.depth
        offset = tokens[i].begin
        replacement = new_leaf.fin.dup.each { |token| token.begin += offset }
        tokens[i..-1] = replacement
        return add_token_to_leaves replacement.last, []
      else
        new_leaves.push new_leaf
      end
    end
  end
end
if_present?(string)
# File lib/attentive/tokenizer.rb, line 156
def if_present?(string)
  string.empty? ? false : string
end
match_emoji_at(i)
# File lib/attentive/tokenizer.rb, line 87
def match_emoji_at(i)
  emoji = ""
  while (i += 1) < chars.length
    return if_present?(emoji) if EMOJI_END === chars[i]
    return false if WHITESPACE === chars[i]
    emoji << chars[i]
  end
  false
end
match_entity_at(i)
# File lib/attentive/tokenizer.rb, line 97
def match_entity_at(i)
  return false unless match_entities?
  return false unless chars[i += 1] == "{"
  entity = ""
  while (i += 1) < chars.length
    return if_present?(entity) if ["}", "}"] == chars[i, 2]
    return false unless ENTITY === chars[i]
    entity << chars[i]
  end
  false
end
match_number_at(i)
# File lib/attentive/tokenizer.rb, line 137
def match_number_at(i)
  return false if CONDITIONAL_NUMBER_START === chars[i] && !(NUMBER === chars[i + 1])
  number = chars[i].dup
  while (i += 1) < chars.length
    break unless NUMBER === chars[i] || (CONDITIONAL_NUMBER === chars[i] && NUMBER === chars[i + 1])
    number << chars[i]
  end
  number
end
match_regexp_at(i)
# File lib/attentive/tokenizer.rb, line 109
def match_regexp_at(i)
  return false unless match_regexps?
  return false unless chars[i += 1] == "?"
  regexp = "(?"
  parens = 1
  inside_square_bracket = false
  while (i += 1) < chars.length
    regexp << chars[i]
    next if chars[i - 1] == "\\"
    inside_square_bracket = true if chars[i] == "["
    inside_square_bracket = false if chars[i] == "]"
    next if inside_square_bracket
    parens += 1 if chars[i] == "("
    parens -= 1 if chars[i] == ")"
    return if_present?(regexp) if parens == 0
  end
  false
end
match_whitespace_at(i)
# File lib/attentive/tokenizer.rb, line 128
def match_whitespace_at(i)
  whitespace = chars[i].dup
  while (i += 1) < chars.length
    break unless WHITESPACE === chars[i]
    whitespace << chars[i]
  end
  whitespace
end
match_word_at(i)
# File lib/attentive/tokenizer.rb, line 147
def match_word_at(i)
  word = chars[i].dup
  while (i += 1) < chars.length
    break unless WORD === chars[i]
    word << chars[i]
  end
  word
end