class Arboretum::Scandent::Tokenizer
A class with class methods used to tokenize a Scandent string. It holds the tables that map character patterns to token types, and the tables that determine which tokens push or pop tokenizer states.
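The pattern and trigger tables themselves are class-level data and are not documented on this page. Judging only from how tokenize (below) reads them, they plausibly have the following shape; apart from :STATE_ROOT_PATH, every key and value in this sketch is hypothetical:

  # Hypothetical sketch of the table shapes, inferred from tokenize below.
  # @@tokens maps a tokenizer state to {pattern string => token type}.
  @@tokens = {
    :STATE_ROOT_PATH => { '[' => :TOKEN_OPEN_BRACKET },
    :STATE_BRACKET   => { ']' => :TOKEN_CLOSE_BRACKET }
  }
  # @@triggers maps a state to the tokens that push (:open) or pop (:close) a state.
  @@triggers = {
    :STATE_ROOT_PATH => { :open => { :TOKEN_OPEN_BRACKET => :STATE_BRACKET }, :close => [] },
    :STATE_BRACKET   => { :open => {}, :close => [:TOKEN_CLOSE_BRACKET] }
  }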
Public Class Methods
literal_type(literal)
Returns the literal token type of a string of characters that did not match any token pattern
# File lib/arboretum/scandent.rb, line 779
def self.literal_type(literal)
  return :LITERAL_IDENT  if literal =~ /^[[:alpha:]][[:alnum:]-_]*$/
  return :LITERAL_INT    if literal =~ /^\d+$/
  return :LITERAL_FLOAT  if literal =~ /^\d*\.?\d+$/
  return :LITERAL_STRING if literal =~ /^.+$/
  return :LITERAL_UNKNOWN
end
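For example, the return values below follow directly from the patterns above (the inputs themselves are arbitrary):

  Tokenizer.literal_type('width')  # => :LITERAL_IDENT  (letter, then letters/digits/-/_)
  Tokenizer.literal_type('42')     # => :LITERAL_INT
  Tokenizer.literal_type('3.14')   # => :LITERAL_FLOAT
  Tokenizer.literal_type('a b')    # => :LITERAL_STRING (any other non-empty string)
  Tokenizer.literal_type('')       # => :LITERAL_UNKNOWN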
tokenize(input)
Tokenizes input into a list of [token type, matched text, tokenizer state] entries. If no pattern fully matches at a character, that character is added to an unmatched buffer and matching resumes at the next character, even if the candidate patterns could have matched more input.
# File lib/arboretum/scandent.rb, line 788
def self.tokenize(input)
  state = [:STATE_ROOT_PATH]  # State stack for the tokenizer, state.last will return the current state
  match_start = 0
  match_end = 0
  unmatched_buffer = ''
  largest_full_match = nil
  token_list = []  # The list of tokens in the input, each item is an Array in the form:
                   # [Token type, Pattern that matched, State of the tokenizer after the token]

  # Until we have checked and matched at or on every single character
  while match_start < input.length
    # Start building a substring from a single character
    match_end = match_start
    # Start with all patterns for the current state as candidates
    candidates = @@tokens[state.last].keys
    # Start with no full match detected
    largest_full_match = nil

    # Until nothing can match the substring or the end of input has been reached
    until candidates.empty? or match_end >= input.length
      # String that candidate patterns will have to match
      matched_string = input[match_start..match_end]

      # Check to see if each remaining candidate pattern matches
      # - If a full match, set as the largest full match
      # - Delete if the pattern does not match the string
      candidates.delete_if do |pattern|
        largest_full_match = [pattern, match_start, match_end] if pattern.eql?(matched_string)
        !pattern.start_with?(matched_string)  # Element deleted if true is the last statement in block
      end

      # Increase size of match by one if further matching is to be done
      match_end += 1 if not candidates.empty?
    end
    # Substring is now one character too long to match

    # If no full match was found, add the last checked character as unmatched
    # Otherwise:
    # - Parse the unmatched_buffer as a literal and store it
    # - Activate state triggers associated with the largest fully matched token
    # - Store the largest fully matched token
    # - Start again where the matched token completes
    if largest_full_match.nil?
      # Add the last checked character to the unmatched buffer
      unmatched_buffer << input[match_start]
      # Start matching again on the next character
      match_start += 1
    else
      # Do not activate state triggers associated with the parsed literal
      # Is there a use case?
      # Parse the unmatched_buffer as a literal and store it (if it exists), then clear the buffer
      token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
      unmatched_buffer = ''

      # Info from the largest fully matched token
      matched_pattern, pattern_start, pattern_end = largest_full_match
      matched_token_type = @@tokens[state.last][matched_pattern]

      # Activate state triggers associated with the largest fully matched token
      current_state_triggers = @@triggers[state.last]
      if current_state_triggers[:open].has_key?(matched_token_type)
        state.push(current_state_triggers[:open][matched_token_type])
      elsif current_state_triggers[:close].include?(matched_token_type)
        state.pop
      end

      # Store the largest fully matched token
      token_list << [matched_token_type, matched_pattern, state.last]

      # Start again where the matched token completes
      match_start = pattern_end + 1
    end
    # Next token has been added to the list
  end
  # Input has been fully tokenized

  # Parse and store the unmatched_buffer one last time
  token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
  token_list
end
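Because the pattern and trigger tables are internal, the exact tokens produced depend on the grammar Scandent defines. As a hedged usage sketch only (the input string here is invented, not Scandent syntax), the return shape is:

  token_list = Arboretum::Scandent::Tokenizer.tokenize('a[b]')
  token_list.each do |type, text, state|
    # type  - token type symbol, e.g. :LITERAL_IDENT
    # text  - the matched pattern, or the literal text for unmatched runs
    # state - the tokenizer state after this token, e.g. :STATE_ROOT_PATH
  end

Note that matching is greedy at each position: a candidate survives as long as it still starts with the current substring, so if both a shorter and a longer pattern (say, '=' and '==') are defined for the current state, the longer full match wins.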