class Arboretum::Scandent::Tokenizer

A class with class methods used to tokenize a Scandent string. It holds information about which character patterns match which tokens, and about which tokens trigger which states in the tokenizer.
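The pattern and trigger tables themselves are not reproduced on this page. From the way tokenize reads them below, each state's entry in @@tokens maps a pattern string to a token type, and each state's entry in @@triggers maps token types to states to open (push) or close (pop). A minimal sketch of that inferred shape, with hypothetical pattern and token names:

  @@tokens = {
    :STATE_ROOT_PATH => {'/' => :TOKEN_SLASH, '[' => :TOKEN_LBRACKET},
    :STATE_BRACKET   => {']' => :TOKEN_RBRACKET}
  }
  @@triggers = {
    :STATE_ROOT_PATH => {
      :open  => {:TOKEN_LBRACKET => :STATE_BRACKET}, # matching '[' pushes :STATE_BRACKET
      :close => []
    },
    :STATE_BRACKET => {
      :open  => {},
      :close => [:TOKEN_RBRACKET]                    # matching ']' pops back to the previous state
    }
  }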

Public Class Methods

literal_type(literal)

Returns the literal type of a string of characters that did not match any token pattern.

# File lib/arboretum/scandent.rb, line 779
def self.literal_type(literal)
  return :LITERAL_IDENT if literal =~ /^[[:alpha:]][[:alnum:]_-]*$/
  return :LITERAL_INT if literal =~ /^\d+$/
  return :LITERAL_FLOAT if literal =~ /^\d*\.?\d+$/
  return :LITERAL_STRING if literal =~ /^.+$/
  return :LITERAL_UNKNOWN
end
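A few illustrative calls, derived directly from the regular expressions above:

  Arboretum::Scandent::Tokenizer.literal_type('node-name') # => :LITERAL_IDENT
  Arboretum::Scandent::Tokenizer.literal_type('42')        # => :LITERAL_INT
  Arboretum::Scandent::Tokenizer.literal_type('3.14')      # => :LITERAL_FLOAT
  Arboretum::Scandent::Tokenizer.literal_type('@#!')       # => :LITERAL_STRING
  Arboretum::Scandent::Tokenizer.literal_type('')          # => :LITERAL_UNKNOWN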
tokenize(input)

Tokenizes the input string into a list of [token type, matched pattern, tokenizer state] entries. If no full match is found starting at a character, that character is buffered as part of a literal and matching moves on to the next character, even if some candidate patterns could still have matched a longer substring.

# File lib/arboretum/scandent.rb, line 788
def self.tokenize(input)
  state = [:STATE_ROOT_PATH] # State stack for the tokenizer, state.last will return the current state
  match_start = 0
  match_end = 0
  unmatched_buffer = ''
  largest_full_match = nil
  token_list = [] # The list of tokens in the input, each item is an Array in the form:
                  # [Token type, Pattern that matched, State of the tokenizer after the token]

  # Until every character has been checked or matched
  while match_start < input.length
    # Start building a substring from a single character
    match_end = match_start
    # Start with all patterns for current state as candidates
    candidates = @@tokens[state.last].keys
    # Start with no full match detected
    largest_full_match = nil

    # Until nothing can match substring or end of input has been reached
    until candidates.empty? or match_end >= input.length
      # String that candidate patterns will have to match
      matched_string = input[match_start..match_end]
      # Check to see if each remaining candidate pattern matches
      #  - If a full match, set as largest full match
      #  - Delete if the pattern does not match the string
      candidates.delete_if do |pattern|
        largest_full_match = [pattern, match_start, match_end] if pattern.eql?(matched_string)
        !pattern.start_with?(matched_string) # Element is deleted when this, the block's last expression, is true
      end
      # Increase size of match by one if further matching is to be done
      match_end += 1 unless candidates.empty?
    end # Substring no longer matches any candidate, or end of input reached

    # If no full match found, add the last checked character as unmatched
    # Otherwise:
    #  - Parse the unmatched_buffer as a literal and store
    #  - Activate state triggers associated with the largest fully matched token
    #  - Store the largest fully matched token
    #  - Start again where the matched token completes
    if largest_full_match.nil?
      # Add last checked character to the unmatched buffer
      unmatched_buffer << input[match_start]
      # Start matching again on the next letter
      match_start += 1
    else

      # Do not activate state triggers associated with the parsed literal
      # (is there a use case for doing so?)

      # Parse the unmatched_buffer as a literal and store (if it exists), then clear the buffer
      token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0
      unmatched_buffer = ''

      # Info from the largest fully matched token
      matched_pattern, pattern_start, pattern_end = largest_full_match
      matched_token_type = @@tokens[state.last][matched_pattern]

      # Activate state triggers associated with the largest fully matched token
      current_state_triggers = @@triggers[state.last]
      if current_state_triggers[:open].has_key?(matched_token_type)
        state.push(current_state_triggers[:open][matched_token_type])
      elsif current_state_triggers[:close].include?(matched_token_type)
        state.pop
      end

      # Store the largest fully matched token
      token_list << [matched_token_type, matched_pattern, state.last]

      # Start again where the matched token completes
      match_start = pattern_end + 1

    end # Next token has been added to list
  end # Input has been fully tokenized

  # Parse and store the unmatched_buffer one last time
  token_list << [Tokenizer.literal_type(unmatched_buffer), unmatched_buffer, state.last] if unmatched_buffer.length > 0

  token_list
end
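A sketch of a call; since the real Scandent pattern tables are not shown on this page, the input and output below are hypothetical, but each entry follows the [token type, matched pattern, tokenizer state] shape built above:

  tokens = Arboretum::Scandent::Tokenizer.tokenize('/branch[3]')
  tokens.each do |type, pattern, state|
    puts "#{type}: #{pattern.inspect} (#{state})"
  end
  # Hypothetical output, assuming the toy tables sketched in the class
  # description, where '[' opens a bracket state and ']' closes it:
  #   TOKEN_SLASH: "/" (STATE_ROOT_PATH)
  #   LITERAL_IDENT: "branch" (STATE_ROOT_PATH)
  #   TOKEN_LBRACKET: "[" (STATE_BRACKET)
  #   LITERAL_INT: "3" (STATE_BRACKET)
  #   TOKEN_RBRACKET: "]" (STATE_ROOT_PATH)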