class Attentive::Tokenizer
Constants
- CHARACTER_SUBSTITIONS
- CONDITIONAL_NUMBER
- CONDITIONAL_NUMBER_START
- EMOJI_END
- EMOJI_START
- ENTITY
- ENTITY_START
- NUMBER
- NUMBER_START
- PUNCTUATION
- REGEXP_START
- WHITESPACE
- WORD
Attributes
chars[R]
message[R]
options[R]
tokens[R]
Public Class Methods
new(message, options={})
click to toggle source
# File lib/attentive/tokenizer.rb, line 18
# Builds a tokenizer for +message+.
#
# The message is lower-cased before it is stored, and an array of its
# individual characters is kept alongside it for index-based scanning.
#
# message - object responding to #downcase (a String in practice)
# options - Hash of tokenizer switches, read via the :entities, :regexps
#           and :substitutions keys by the predicate methods
def initialize(message, options={})
  @options = options
  @message = message.downcase
  @chars = self.message.chars
end
tokenize(message, options={})
click to toggle source
# File lib/attentive/tokenizer.rb, line 12
# Convenience entry point: instantiates a tokenizer for +message+ with
# +options+ and immediately returns the result of its #tokenize call.
def self.tokenize(message, options={})
  tokenizer = new(message, options)
  tokenizer.tokenize
end
Public Instance Methods
match_entities?()
click to toggle source
# File lib/attentive/tokenizer.rb, line 24
# Value of the :entities option; false when the option was not given.
def match_entities?
  options.fetch(:entities) { false }
end
match_regexps?()
click to toggle source
# File lib/attentive/tokenizer.rb, line 28
# Value of the :regexps option; false when the option was not given.
def match_regexps?
  options.fetch(:regexps) { false }
end
perform_substitutions?()
click to toggle source
# File lib/attentive/tokenizer.rb, line 32
# Value of the :substitutions option; true when the option was not given.
def perform_substitutions?
  options.fetch(:substitutions) { true }
end
tokenize()
click to toggle source
# File lib/attentive/tokenizer.rb, line 38
# Walks the message's characters from left to right, emitting one token
# per recognised span, and wraps the collected tokens in an
# Attentive::Phrase.
#
# The branch taken is decided by the (substituted) character under the
# cursor; each specialised matcher may decline by returning a falsy value,
# in which case the next branch is tried. The final branch always consumes
# at least one character as a word.
def tokenize
  @tokens = []
  @leaves = []
  cursor = 0

  while cursor < chars.length
    # Classification uses the substituted character; matchers read `chars`.
    current = CHARACTER_SUBSTITIONS.fetch(chars[cursor], chars[cursor])
    # A new token starts where the most recent token ended.
    start = tokens.any? ? tokens.last.end : 0

    if WHITESPACE === current && (text = match_whitespace_at(cursor))
      add_token whitespace(text, pos: start)
      cursor += text.length
    elsif ENTITY_START === current && (text = match_entity_at(cursor))
      add_token entity(text, pos: start)
      # Skip the four delimiter characters the matcher leaves out of `text`.
      cursor += text.length + 4
    elsif NUMBER_START === current && (text = match_number_at(cursor))
      add_token word(text, pos: start)
      cursor += text.length
    elsif EMOJI_START === current && (text = match_emoji_at(cursor))
      add_token emoji(text, pos: start)
      # Skip the opening and closing emoji delimiters as well.
      cursor += text.length + 2
    elsif REGEXP_START === current && (text = match_regexp_at(cursor))
      add_token regexp(text, pos: start)
      cursor += text.length
    elsif PUNCTUATION === current
      add_token punctuation(current, pos: start)
      cursor += 1
    else
      text = match_word_at(cursor)
      add_token word(text, pos: start)
      cursor += text.length
    end
  end

  Attentive::Phrase.new(tokens)
end
Private Instance Methods
add_token(token)
click to toggle source
# File lib/attentive/tokenizer.rb, line 162
# Appends +token+ to the token list and, when substitutions are enabled,
# advances the set of partially matched substitution leaves with it.
def add_token(token)
  @tokens.push(token)
  @leaves = add_token_to_leaves(token, @leaves) if perform_substitutions?
end
add_token_to_leaves(token, leaves)
click to toggle source
# File lib/attentive/tokenizer.rb, line 168
# Advances every in-progress substitution match by +token+ and returns the
# leaves that are still in progress afterwards.
#
# Each entry of +leaves+, plus the root returned by
# Attentive.substitutions, is asked for the child reached by +token+.
# When a child reports +fin?+, the tokens that led to it (the last
# +leaf.depth + 1+ entries of +tokens+) are replaced by that child's
# +fin+ tokens, shifted so they begin where the replaced span began, and
# matching restarts from the last replacement token with no leaves.
#
# token  - the token that was just appended to +tokens+
# leaves - Array of partially matched leaves from the previous step
#
# Returns an Array of leaves.
def add_token_to_leaves(token, leaves)
  (leaves + [Attentive.substitutions]).each_with_object([]) do |leaf, new_leaves|
    new_leaf = leaf[token]
    next unless new_leaf

    unless new_leaf.fin?
      new_leaves.push new_leaf
      next
    end

    i = -1 - leaf.depth
    offset = tokens[i].begin
    # Copy each token before shifting it: duplicating only the container
    # would leave the token objects shared with +fin+, so the offset would
    # be written into (and accumulate on) the stored substitution.
    replacement = new_leaf.fin.map do |substitute|
      substitute.dup.tap { |copy| copy.begin += offset }
    end
    tokens[i..-1] = replacement
    return add_token_to_leaves replacement.last, []
  end
end
if_present?(string)
click to toggle source
# File lib/attentive/tokenizer.rb, line 156
# Returns +string+ itself unless it is empty, in which case false is
# returned so callers can treat "nothing matched" as a failed match.
def if_present?(string)
  return false if string.empty?

  string
end
match_emoji_at(i)
click to toggle source
# File lib/attentive/tokenizer.rb, line 87
# Tries to read an emoji name starting just after index +i+.
#
# Characters are collected until one matches EMOJI_END, at which point the
# collected name is returned (or false if nothing was collected). Hitting
# whitespace, or running out of characters first, yields false.
def match_emoji_at(i)
  name = ""
  ((i + 1)...chars.length).each do |j|
    current = chars[j]
    return if_present?(name) if EMOJI_END === current
    return false if WHITESPACE === current

    name << current
  end
  false
end
match_entity_at(i)
click to toggle source
# File lib/attentive/tokenizer.rb, line 97
# Tries to read an entity name from a span that has "{" at index +i+ + 1
# and is closed by "}}".
#
# Returns the name between the delimiters, or false when entity matching
# is disabled, the character after +i+ is not "{", a character not matched
# by ENTITY appears before the closing "}}", the name is empty, or the
# input ends first.
def match_entity_at(i)
  return false unless match_entities?
  return false unless chars[i + 1] == "{"

  name = ""
  ((i + 2)...chars.length).each do |j|
    return if_present?(name) if chars[j, 2] == ["}", "}"]
    return false unless ENTITY === chars[j]

    name << chars[j]
  end
  false
end
match_number_at(i)
click to toggle source
# File lib/attentive/tokenizer.rb, line 137
# Reads a number beginning at index +i+.
#
# Returns false when the first character matches CONDITIONAL_NUMBER_START
# but is not followed by a NUMBER character. Otherwise returns the first
# character plus every following character that matches NUMBER, or that
# matches CONDITIONAL_NUMBER and is itself followed by a NUMBER character.
def match_number_at(i)
  lead = chars[i]
  return false if CONDITIONAL_NUMBER_START === lead && !(NUMBER === chars[i + 1])

  digits = lead.dup
  ((i + 1)...chars.length).each do |j|
    current = chars[j]
    numeric = NUMBER === current
    # A conditional character only counts when a number character follows it.
    numeric ||= CONDITIONAL_NUMBER === current && NUMBER === chars[j + 1]
    break unless numeric

    digits << current
  end
  digits
end
match_regexp_at(i)
click to toggle source
# File lib/attentive/tokenizer.rb, line 109
# Tries to read a parenthesised regular-expression group that starts at
# index +i+ and whose second character is "?".
#
# Characters are copied verbatim while parentheses are balanced; the text
# up to and including the parenthesis that closes the opening group is
# returned. Parentheses that are escaped with a backslash or that appear
# inside a square-bracket character class do not affect the balance.
#
# Returns the matched source String, or false when regexp matching is
# disabled, the character after +i+ is not "?", or the group is never
# closed.
def match_regexp_at(i)
  return false unless match_regexps?
  return false unless chars[i + 1] == "?"

  regexp = "(?"
  parens = 1
  inside_square_bracket = false
  # Track escaping explicitly rather than peeking at the previous
  # character, so an escaped backslash ("\\\\") does not also escape the
  # character that follows it.
  escaped = false

  ((i + 2)...chars.length).each do |j|
    current = chars[j]
    regexp << current

    if escaped
      escaped = false
      next
    end
    if current == "\\"
      escaped = true
      next
    end

    inside_square_bracket = true if current == "["
    inside_square_bracket = false if current == "]"
    next if inside_square_bracket

    parens += 1 if current == "("
    parens -= 1 if current == ")"
    return if_present?(regexp) if parens == 0
  end
  false
end
match_whitespace_at(i)
click to toggle source
# File lib/attentive/tokenizer.rb, line 128
# Returns the character at index +i+ followed by every directly
# subsequent character that matches WHITESPACE.
def match_whitespace_at(i)
  run = chars[i].dup
  ((i + 1)...chars.length).each do |j|
    break unless WHITESPACE === chars[j]

    run << chars[j]
  end
  run
end
match_word_at(i)
click to toggle source
# File lib/attentive/tokenizer.rb, line 147
# Returns the character at index +i+ followed by every directly
# subsequent character that matches WORD. The first character is taken
# unconditionally, so the result always covers at least one character
# when +i+ is a valid index.
def match_word_at(i)
  collected = chars[i].dup
  j = i + 1
  while j < chars.length && WORD === chars[j]
    collected << chars[j]
    j += 1
  end
  collected
end