class RKelly::Tokenizer
Constants
- KEYWORDS
- KEYWORDS_THAT_IMPLY_DIVISION
Some keywords can be followed by a regular expression literal (e.g., return and throw); others can only be followed by division. See the sketch after this list.
- KEYWORDS_THAT_IMPLY_REGEX
- LITERALS
- RESERVED
- SINGLE_CHARS_THAT_IMPLY_DIVISION
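JavaScript overloads /: depending on context it starts a regular expression literal or is the division operator, and these keyword and single-character tables are what the tokenizer consults to decide. A minimal sketch of the two readings, using the tokenize method documented below (output shapes are illustrative):

  require 'rkelly'

  tokenizer = RKelly::Tokenizer.new

  # `return` appears in KEYWORDS_THAT_IMPLY_REGEX, so the following
  # `/\d+/g` is lexed as a single regular expression literal.
  tokenizer.tokenize("return /\\d+/g;")

  # An identifier can end an expression, so here `/` is division.
  tokenizer.tokenize("total / count;")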
Public Class Methods
new(&block)
# File lib/rkelly/tokenizer.rb, line 58
def initialize(&block)
  @lexemes = []

  token(:COMMENT, /\A\/(?:\*(?:.)*?\*\/|\/[^\n]*)/m)
  token(:STRING, /\A"(?:[^"\\]*(?:\\.[^"\\]*)*)"|\A'(?:[^'\\]*(?:\\.[^'\\]*)*)'/m)

  # A regexp to match floating point literals (but not integer literals).
  token(:NUMBER, /\A\d+\.\d*(?:[eE][-+]?\d+)?|\A\d+(?:\.\d*)?[eE][-+]?\d+|\A\.\d+(?:[eE][-+]?\d+)?/m) do |type, value|
    value.gsub!(/\.(\D)/, '.0\1') if value =~ /\.\w/
    value.gsub!(/\.$/, '.0') if value =~ /\.$/
    value.gsub!(/^\./, '0.') if value =~ /^\./
    [type, eval(value)]
  end
  token(:NUMBER, /\A0[xX][\da-fA-F]+|\A0[0-7]*|\A\d+/) do |type, value|
    [type, eval(value)]
  end
  token(:LITERALS,
        Regexp.new(LITERALS.keys.sort_by { |x|
          x.length
        }.reverse.map { |x|
          "\\A#{x.gsub(/([|+*^])/, '\\\\\1')}"
        }.join('|')
  )) do |type, value|
    [LITERALS[value], value]
  end

  token(:RAW_IDENT, /\A([_\$A-Za-z][_\$0-9A-Za-z]*)/) do |type,value|
    if KEYWORDS.include?(value)
      [value.upcase.to_sym, value]
    elsif RESERVED.include?(value)
      [:RESERVED, value]
    else
      [:IDENT, value]
    end
  end

  # To distinguish regular expressions from comments, we require that
  # regular expressions start with a non * character (ie, not look like
  # /*foo*/). Note that we can't depend on the length of the match to
  # correctly distinguish, since `/**/i` is longer if matched as a regular
  # expression than as matched as a comment.
  # Incidentally, we're also not matching empty regular expressions
  # (eg, // and //g). Here we could depend on match length and priority to
  # determine that these are actually comments, but it turns out to be
  # easier to not match them in the first place.
  token(:REGEXP, /\A\/(?:[^\/\r\n\\*]|\\[^\r\n])[^\/\r\n\\]*(?:\\[^\r\n][^\/\r\n\\]*)*\/[gim]*/)

  token(:S, /\A[\s\r\n]*/m)

  token(:SINGLE_CHAR, /\A./) do |type, value|
    [value, value]
  end
end
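The two :NUMBER lexemes eval the matched literal, after first rewriting forms Ruby's eval would reject: a trailing dot gains a zero (1. becomes 1.0) and a leading dot gains one (.5 becomes 0.5). A small sketch of the effect through tokenize; the numeric results are assumptions based on the eval in the blocks above:

  require 'rkelly'

  # The number pairs should carry Ruby numerics rather than raw strings,
  # e.g. 0.5 for `.5` and 31 for `0x1F`.
  RKelly::Tokenizer.new.tokenize(".5 + 0x1F")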
Public Instance Methods
raw_tokens(string)
# File lib/rkelly/tokenizer.rb, line 114
def raw_tokens(string)
  tokens = []
  line_number = 1
  accepting_regexp = true
  while string.length > 0
    longest_token = nil

    @lexemes.each { |lexeme|
      next if lexeme.name == :REGEXP && !accepting_regexp

      match = lexeme.match(string)
      next if match.nil?
      longest_token = match if longest_token.nil?
      next if longest_token.value.length >= match.value.length
      longest_token = match
    }

    if longest_token.name != :S
      accepting_regexp = followable_by_regex(longest_token)
    end

    longest_token.line = line_number
    line_number += longest_token.value.scan(/\n/).length
    string = string.slice(Range.new(longest_token.value.length, -1))

    tokens << longest_token
  end
  tokens
end
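The returned tokens expose the same name, value, and line accessors this method reads and writes, which makes the raw stream (whitespace tokens included) easy to inspect; a minimal sketch:

  require 'rkelly'

  RKelly::Tokenizer.new.raw_tokens("var x = 1;\nvar y = 2;").each do |t|
    puts format('%-10s %-10s line %d', t.name, t.value.inspect, t.line)
  end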
tokenize(string)
# File lib/rkelly/tokenizer.rb, line 110
def tokenize(string)
  raw_tokens(string).map { |x| x.to_racc_token }
end
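This is the parser-facing entry point: each raw token is converted with to_racc_token into the [type, value] pair shape that racc-generated parsers consume (the pair shape follows racc convention; the exact value shown is an assumption):

  require 'rkelly'

  RKelly::Tokenizer.new.tokenize("a / b").first
  # => a pair such as [:IDENT, "a"]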
Private Instance Methods
followable_by_regex(current_token)
# File lib/rkelly/tokenizer.rb, line 148
def followable_by_regex(current_token)
  case current_token.name
  when :RAW_IDENT
    KEYWORDS_THAT_IMPLY_REGEX.include?(current_token.value)
  when :NUMBER
    false
  when :SINGLE_CHAR
    !SINGLE_CHARS_THAT_IMPLY_DIVISION.include?(current_token.value)
  else
    true
  end
end
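The decision keys off the previous significant token: a RAW_IDENT is followable by a regex only if it is in KEYWORDS_THAT_IMPLY_REGEX, a number always implies division, and single characters consult SINGLE_CHARS_THAT_IMPLY_DIVISION (a closing ) can end an expression, so a slash after it is division; an opening ( cannot). A sketch of the three cases:

  require 'rkelly'

  t = RKelly::Tokenizer.new
  t.tokenize("return /x/;") # keyword implies regex: /x/ is one REGEXP token
  t.tokenize("n / 2")       # plain identifier: the slash is division
  t.tokenize("(x) / 2")     # `)` implies division, per the single-char table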
token(name, pattern = nil, &block)
# File lib/rkelly/tokenizer.rb, line 144
def token(name, pattern = nil, &block)
  @lexemes << Lexeme.new(name, pattern, &block)
end
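Registration order matters: raw_tokens keeps the earlier lexeme when two matches tie on length (the >= comparison above), so the order of the token calls in initialize acts as an implicit priority among lexemes.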