class PinYin::Backend::MMSeg
Public Class Methods
new(override_files=[])
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 8
# Builds the backend: creates the Simple backend used for the base
# romanization, then replaces every :words entry registered with
# RMMSeg::Dictionary by the words.dic file found under ../../data
# (relative to this file) and reloads the dictionaries.
#
# override_files - passed through unchanged to Simple.new.
def initialize(override_files=[])
  @simple = Simple.new(override_files)

  # Drop any previously registered :words dictionaries before adding ours.
  RMMSeg::Dictionary.dictionaries.delete_if { |type, _path| type == :words }

  words_dic = File.expand_path('../../data/words.dic', __FILE__)
  RMMSeg::Dictionary.dictionaries.push([:words, words_dic])

  RMMSeg::Dictionary.load_dictionaries
end
Public Instance Methods
romanize(str, tone=nil, include_punctuations=false)
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 16
# Romanizes +str+ by overlaying word-level readings on top of the
# result produced by the Simple backend.
#
# str                  - text to romanize; nil or empty input yields [].
# tone                 - forwarded to the Simple backend and to #format.
# include_punctuations - forwarded to the Simple backend.
#
# Returns the list built by #apply from the two reading lists.
def romanize(str, tone=nil, include_punctuations=false)
  return [] if !str || str.length.zero?

  words = segment(str)
  char_level = @simple.romanize(str, tone, include_punctuations)
  word_level = words.map { |word| format(word, tone) }.flatten

  # When the two lists disagree in length, drop the nil placeholders
  # from both before merging them.
  unless char_level.size == word_level.size
    char_level.compact!
    word_level.compact!
  end

  apply(char_level, word_level)
end
segment(str)
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 32
# Splits +str+ into tokens with RMMSeg::Algorithm and returns their
# texts (tagged as UTF-8) as an array, leaving out every token that
# matches Punctuation.chinese_regexp.
def segment(str)
  tokenizer = RMMSeg::Algorithm.new(str)
  collected = []

  # next_token returns a falsy value once the input is exhausted.
  while (token = tokenizer.next_token)
    text = token.text.force_encoding("UTF-8")
    next if text =~ Punctuation.chinese_regexp

    collected << text
  end

  collected
end
Private Instance Methods
apply(base, patch)
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 93
# Merges +patch+ into +base+ position by position.
#
# For each element of +base+:
# * if +patch+ has nil at that index, the base element is kept as is;
# * if the base element matches Punctuation.regexp, a Value is built
#   from the patch entry followed by the regexp's first capture group;
# * otherwise a Value is built from the patch entry alone.
#
# Every new Value carries the base element's +english?+ flag.
# Returns the merged array.
def apply(base, patch)
  base.each_with_index.map do |char, index|
    replacement = patch[index]

    if replacement.nil?
      char
    elsif char =~ Punctuation.regexp
      Value.new("#{replacement}#{Regexp.last_match(1)}", char.english?)
    else
      Value.new(replacement, char.english?)
    end
  end
end
dictionary()
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 45
# Lazily loads and memoizes the word table stored in ../../data/words.dat
# (relative to this file).
#
# Each line is stripped and split on ','; the first field becomes the key
# and the second field the value.
#
# The table is built in a local hash and only stored in @dict after the
# whole file has been read. If reading fails, the error is raised again on
# the next call instead of an empty table being memoized and returned
# silently.
#
# Returns the Hash of word => reading.
# Raises the underlying SystemCallError (e.g. Errno::ENOENT) when the data
# file cannot be read.
def dictionary
  return @dict if @dict

  src = File.expand_path('../../data/words.dat', __FILE__)
  table = {}
  File.foreach(src) do |line|
    word, unicode = line.strip.split(',')
    table[word] = unicode
  end

  @dict = table
end
format(word, tone)
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 77
# Turns one segmented word into the entries used to patch the base
# romanization.
#
# word - the segmented word.
# tone - forwarded to #get_pinyin.
#
# Returns:
# * the reading from #get_pinyin split on spaces, when one is found;
# * +word+ itself when it consists only of ASCII letters, digits,
#   underscores and whitespace;
# * otherwise an array of nil with one slot per character of +word+.
def format(word, tone)
  pinyin = get_pinyin(word, tone)
  return pinyin.split(' ') if pinyin

  # If the word is an English word, return it as is; otherwise return an
  # array of nil as long as the word.
  # \A and \z anchor the whole string. With ^ and $ a multi-line word that
  # had a single ASCII-only line was wrongly treated as English.
  if word =~ /\A[_0-9a-zA-Z\s]*\z/
    word
  elsif word.respond_to? :force_encoding
    # word has been encoded in UTF-8 already
    [nil] * word.size
  else
    # For ruby 1.8, there is no native utf-8 support
    [nil] * word.unpack('U*').size
  end
end
get_pinyin(word, tone)
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 58
# Looks +word+ up in #dictionary and renders its reading for +tone+.
#
# Returns nil when the dictionary has no entry for +word+.
# With tone :unicode the stored reading is returned unchanged.
# With tone :ascii or true it is passed to #to_ascii with tones enabled;
# any other tone passes it to #to_ascii with tones disabled.
def get_pinyin(word, tone)
  reading = dictionary[word]
  return unless reading

  case tone
  when :unicode then reading
  else to_ascii(reading, [:ascii, true].include?(tone))
  end
end
to_ascii(word, with_tone)
click to toggle source
# File lib/ruby-pinyin/backend/mmseg.rb, line 71
# Converts a space-separated string of readings by running each one
# through PinYin::Util.to_ascii and joining the results with single spaces.
#
# word      - space-separated readings.
# with_tone - forwarded unchanged to PinYin::Util.to_ascii.
def to_ascii(word, with_tone)
  readings = word.split(' ')
  converted = readings.map { |reading| PinYin::Util.to_ascii(reading, with_tone) }
  converted.join(' ')
end