class RMMSeg::ComplexAlgorithm
Constants
- MATCH_CACHE_MAX_LENGTH
Public Class Methods
new(text, token=Token)
click to toggle source
Create a new ComplexAlgorithm. Rules used by this algorithm include MMRule, LAWLRule, SVWLRule and LSDMFOCWRule.
Calls superclass method
RMMSeg::Algorithm::new
# File lib/rmmseg/complex_algorithm.rb, line 15
#
# Create a new ComplexAlgorithm.
#
# +text+ and +token+ are handed unchanged to the superclass initializer.
# On top of that this sets up:
# * an empty match cache with MATCH_CACHE_MAX_LENGTH slots (all nil) and
#   a write cursor pointing at slot 0 -- both consumed by find_match_words;
# * the ordered list of disambiguation rules applied by get_cjk_word:
#   MMRule, LAWLRule, SVWLRule, LSDMFOCWRule.
def initialize(text, token=Token)
  super(text, token)

  @match_cache_idx = 0
  @match_cache = Array.new(MATCH_CACHE_MAX_LENGTH)

  @rules = [
    MMRule,
    LAWLRule,
    SVWLRule,
    LSDMFOCWRule
  ]
end
Public Instance Methods
create_chunks()
click to toggle source
Create all possible three-word (or fewer) chunks starting from +@index+.
# File lib/rmmseg/complex_algorithm.rb, line 54
#
# Build every candidate chunk (a run of one to three consecutive words)
# that starts at +@index+.
#
# Words are obtained from find_match_words. A chunk is shorter than three
# words when the text ends after its first or second word, or when the
# third word is of type Word::TYPES[:unrecognized], in which case that
# third word is left out of the chunk.
#
# Returns an Array of chunks, each chunk being an Array of words.
def create_chunks
  text_length = @chars.length
  chunks = []

  find_match_words(@index).each do |first|
    second_start = @index + first.length
    if second_start >= text_length
      # Only a word ending exactly at the end of the text forms a chunk.
      chunks << [first] if second_start == text_length
      next
    end

    find_match_words(second_start).each do |second|
      third_start = second_start + second.length
      if third_start >= text_length
        chunks << [first, second] if third_start == text_length
        next
      end

      find_match_words(third_start).each do |third|
        chunks << if third.type == Word::TYPES[:unrecognized]
                    [first, second]
                  else
                    [first, second, third]
                  end
      end
    end
  end

  chunks
end
find_match_words(index)
click to toggle source
Find all words occurring in the dictionary starting from index. The maximum word length is determined by Config.max_word_length.
# File lib/rmmseg/complex_algorithm.rb, line 84
#
# Collect every dictionary word that begins at position +index+ of
# +@chars+.
#
# Characters are appended one at a time to a growing candidate string;
# each time the candidate is a dictionary entry, that entry is recorded.
# Scanning stops at the end of +@chars+, at the first character for which
# basic_latin? is true, or once Config.max_word_length characters have
# been consumed. If nothing matched, the result is a single Word built
# from the character at +index+ with type Word::TYPES[:unrecognized].
#
# Results are stored as [index, words] pairs in +@match_cache+; once
# MATCH_CACHE_MAX_LENGTH entries have been written, the write cursor wraps
# to slot 0 and older entries are overwritten.
#
# Returns an Array of words (never empty).
def find_match_words(index)
  # Unused cache slots are nil, which destructures to (nil, nil).
  @match_cache.each do |cached_index, cached_words|
    return cached_words if cached_index == index
  end

  dictionary = Dictionary.instance
  candidate = String.new
  matches = []

  pos = index
  until pos >= @chars.length ||
        basic_latin?(@chars[pos]) ||
        pos - index >= Config.max_word_length
    candidate << @chars[pos]
    matches << dictionary.get_word(candidate) if dictionary.has_word?(candidate)
    pos += 1
  end

  matches << Word.new(@chars[index], Word::TYPES[:unrecognized]) if matches.empty?

  # Remember the result, then advance the write cursor with wrap-around.
  @match_cache[@match_cache_idx] = [index, matches]
  next_slot = @match_cache_idx + 1
  @match_cache_idx = next_slot == MATCH_CACHE_MAX_LENGTH ? 0 : next_slot

  matches
end
get_cjk_word()
click to toggle source
Get the most proper CJK word.
# File lib/rmmseg/complex_algorithm.rb, line 28
#
# Pick the next CJK word at the current position and return it as a token.
#
# All candidate chunks from create_chunks are passed through each rule in
# +@rules+, in order, until fewer than two chunks remain or the rules are
# exhausted. If more than one chunk is still left and
# Config.on_ambiguity is +:raise_exception+, Ambiguity is raised;
# otherwise the first remaining chunk wins.
#
# The first word of the winning chunk becomes a new +@token+ instance
# spanning +@byte_index+ up to +@byte_index+ plus the word's byte size.
# +@index+ and +@byte_index+ are then advanced past that word.
#
# Returns the newly created token.
def get_cjk_word
  candidates = create_chunks

  @rules.each do |rule|
    break if candidates.length < 2
    candidates = rule.filter(candidates)
  end

  if candidates.length > 1 && Config.on_ambiguity == :raise_exception
    raise Ambiguity, "Can't solve ambiguity on #{candidates}"
  end

  chosen = candidates[0][0]
  token = @token.new(chosen.text, @byte_index, @byte_index + chosen.byte_size)

  # Move both the character cursor and the byte cursor past the chosen word.
  @index += chosen.length
  @byte_index += chosen.byte_size

  token
end