class RsPathTokenizer::Tokenizer
Constants
- PT_DEBUG = true
Public Class Methods
new(tokens = nil)
# File lib/rs_path_tokenizer/tokenizer.rb, line 6
def initialize(tokens = nil)
  return if tokens.nil?
  @single_tokens = {}
  tokens.keys.each do |t|
    parts = url2token(t)
    st = parts[0]
    raise Error.new('Token cant starts with asterisk') if st == '*'
    @single_tokens[st] = [] if @single_tokens[st].nil?
    @single_tokens[st].push parts
  end
  @token_map = tokens
end
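A minimal construction sketch, assuming the gem is required as rs_path_tokenizer and that the token map's keys are dash-separated patterns (with '*' standing for exactly one segment) whose values are arbitrary caller data; the map below is hypothetical:

require 'rs_path_tokenizer'

# Hypothetical token map: keys are dash-separated patterns,
# '*' matches exactly one segment of the input string.
tokenizer = RsPathTokenizer::Tokenizer.new(
  'red-shoes' => :category,
  'size-*'    => :size_filter
)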
Public Instance Methods
marshal_dump()
# File lib/rs_path_tokenizer/tokenizer.rb, line 19
def marshal_dump
  [@single_tokens, @token_map]
end
marshal_load(array)
# File lib/rs_path_tokenizer/tokenizer.rb, line 23
def marshal_load(array)
  @single_tokens, @token_map = array
end
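Because the two hooks carry only @single_tokens and @token_map, a prepared tokenizer can be cached and restored with Ruby's Marshal; a minimal sketch using the tokenizer from the constructor example:

data     = Marshal.dump(tokenizer)  # serializes [@single_tokens, @token_map]
restored = Marshal.load(data)       # rebuilds an equivalent Tokenizer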
tokenize(string)
Returns the best (longest) result as a hash, or nil when nothing matches.
# File lib/rs_path_tokenizer/tokenizer.rb, line 28
def tokenize(string)
  tokens = tokenize_all(string).first
  return if tokens.nil?
  result_to_hash(tokens)
end
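A rough sketch of the expected result for the hypothetical map from the constructor example, based on a reading of the source above:

tokenizer.tokenize('red-shoes-size-42')
# => { 'red-shoes' => :category, 'size-*' => ['size', '42'] }
# Exact-match tokens keep their mapped value; for wildcard tokens the
# segments that filled the '*' positions are stored back into the map.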
Protected Instance Methods
merge_results(results, found, other)
# File lib/rs_path_tokenizer/tokenizer.rb, line 113
def merge_results(results, found, other)
  if other.empty?
    unless found.nil?
      results.push [found]
    end
  else
    if found.nil?
      other.each do |o|
        results.push o
      end
    else
      other.each do |o|
        results.push [found] + o
      end
    end
  end
  results.map(&:uniq).uniq
end
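An illustration of how one level's matched token is combined with the parses found further down (a sketch of the call, not gem test output; send is used because the helper is protected):

tokenizer.send(:merge_results, [], ['red', 'shoes'], [[['size', '*']], [['sale']]])
# => [[['red', 'shoes'], ['size', '*']], [['red', 'shoes'], ['sale']]]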
recursive_parse(array, possible_tokens, limiter = 1)
# File lib/rs_path_tokenizer/tokenizer.rb, line 66
def recursive_parse(array, possible_tokens, limiter = 1)
  if limiter > 30
    raise Error.new('Too deep recursion')
  end
  st = array.first
  return [] if st.to_s.strip == ''
  tokens = possible_tokens[st]
  if tokens.nil?
    puts "#{" " * limiter}NO tokens for #{st}" if PT_DEBUG
    return recursive_parse(array.slice(1..-1), possible_tokens)
  end
  results = []
  puts "#{" " * limiter}possible tokens for #{st} are: #{tokens.inspect}" if PT_DEBUG
  tokens.each do |token|
    found, out, rest = try_match(token, array)
    puts "#{" " * limiter}matching #{token.inspect}" if PT_DEBUG
    if found
      if out != token
        @out_token_map[token2url(token)] = out
      end
      puts "#{" " * limiter}found a token: #{token.inspect}, parsing rest: #{rest.inspect}" if PT_DEBUG
      more = recursive_parse(rest.dup, possible_tokens, limiter + 1)
      results = merge_results(results, token, more)
    else
      puts "#{" " * limiter}found none on this level, NOT parsing rest: #{rest.inspect}" if PT_DEBUG
      more = recursive_parse(array.dup.slice(1..-1), possible_tokens, limiter + 1)
      results = merge_results(results, nil, more)
    end
  end
  if PT_DEBUG
    puts "#{" " * limiter}results:"
    results.each do |r|
      puts "#{" " * limiter} #{r.inspect}"
    end
  end
  results
end
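A sketch of the argument and result shapes for the hypothetical 'red-shoes-size-42' example (assumed, not gem output):

array           = ['red', 'shoes', 'size', '42']
possible_tokens = { 'red' => [['red', 'shoes']], 'size' => [['size', '*']] }
# recursive_parse returns an array of candidate parses, each a list of
# matched token patterns, e.g. [[['red', 'shoes'], ['size', '*']]]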
result_to_hash(array)
# File lib/rs_path_tokenizer/tokenizer.rb, line 59
def result_to_hash(array)
  Hash[array.map do |e|
    k = token2url(e)
    [k, @out_token_map[k]]
  end]
end
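Assuming tokenize_all has already run for 'red-shoes-size-42' (so @out_token_map holds the wildcard substitutions), a sketch of the conversion for one parse:

tokenizer.send(:result_to_hash, [['red', 'shoes'], ['size', '*']])
# => { 'red-shoes' => :category, 'size-*' => ['size', '42'] }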
sort_results(results)
# File lib/rs_path_tokenizer/tokenizer.rb, line 50
def sort_results(results)
  results.sort do |a, b|
    result = b.flatten.length <=> a.flatten.length
    result = b.length <=> a.length if result == 0
    puts "sorting: #{a.inspect} #{b.inspect} #{result}" if PT_DEBUG
    result
  end
end
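Results are ordered by total matched segments (flattened length), then by number of tokens, both descending, so the most specific parse comes first. A sketch:

tokenizer.send(:sort_results, [[['sale']], [['red', 'shoes'], ['size', '*']]])
# => [[['red', 'shoes'], ['size', '*']], [['sale']]]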
token2url(token)
# File lib/rs_path_tokenizer/tokenizer.rb, line 157
def token2url(token)
  token.join('-')
end
tokenize_all(string)
Returns all candidate results, sorted best first.
# File lib/rs_path_tokenizer/tokenizer.rb, line 38
def tokenize_all(string)
  array = url2token(string)
  raise Error.new('Too long URL') if array.length > 500
  possible_tokens = Hash[@single_tokens.keys.select do |st|
    array.include?(st)
  end.map do |st|
    [st, @single_tokens[st]]
  end]
  @out_token_map = @token_map
  sort_results(recursive_parse(array, possible_tokens))
end
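Unlike tokenize, this returns every candidate parse, best first; a sketch with the hypothetical map from the constructor example:

tokenizer.send(:tokenize_all, 'red-shoes-size-42')  # protected; send only for illustration
# => [[['red', 'shoes'], ['size', '*']]]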
try_match(token, array)
# File lib/rs_path_tokenizer/tokenizer.rb, line 132
def try_match(token, array)
  found, out = [], []
  rest = array.dup
  token.each do |token_part|
    url_part = rest.shift
    break if url_part.nil?
    if token_part == '*'
      out.push url_part
      found.push token_part
    elsif token_part == url_part
      found.push token_part
      out.push token_part
    end
  end
  if found == token
    [true, out, rest]
  else
    [false, out, array]
  end
end
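A sketch of the three return values; '*' in the token consumes exactly one input segment:

tokenizer.send(:try_match, ['size', '*'], ['size', '42', 'red'])
# => [true, ['size', '42'], ['red']]            # matched, wildcard filled, remaining segments
tokenizer.send(:try_match, ['red', 'shoes'], ['red', 'socks', 'blue'])
# => [false, ['red'], ['red', 'socks', 'blue']] # no match, original array returned untouched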
url2token(url)
# File lib/rs_path_tokenizer/tokenizer.rb, line 160
def url2token(url)
  url.split("-")
end
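The two helpers url2token and token2url are simple inverses over dash-separated strings; a quick sketch (via send, since they are protected):

tokenizer.send(:url2token, 'red-shoes-size-42')  # => ["red", "shoes", "size", "42"]
tokenizer.send(:token2url, ['size', '*'])        # => "size-*"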