class CBETA::CharFrequency
Public Class Methods
new(xml_root, opts={})
click to toggle source
@option opts [Integer] :top number of most frequent characters to return (default: 10)
# File lib/cbeta/char_freq.rb, line 3
# Sets up an empty tally for the XML tree rooted at +xml_root+.
#
# @param xml_root [String] folder that holds one sub-folder per canon
# @param opts [Hash] settings layered over the defaults
# @option opts [Integer] :top (10) how many of the most frequent characters to report
# @option opts [String] :group_by 'canon' or 'work' to keep a separate tally per canon / per work
def initialize(xml_root, opts={})
  @xml_root = xml_root
  @config = { top: 10 }.merge(opts)
  # Until a grouping kicks in, counts are written straight into the top-level result.
  @current = @result = {}
end
Public Instance Methods
char_freq(canon=nil)
click to toggle source
# File lib/cbeta/char_freq.rb, line 13
# Tallies characters and returns the most frequent ones.
#
# @param canon [String, nil] name of the canon folder to scan; nil scans every canon
# @return [Array<Array>] at most +:top+ [character, count] pairs, highest count first
#
# NOTE(review): when +:group_by+ is set, @result maps group names to nested
# tallies rather than to counts; ranking those by value looks unsupported —
# confirm the intended usage.
def char_freq(canon=nil)
  stat_all if canon.nil?
  stat_canon(canon) # returns immediately when canon is nil
  ranked = @result.sort_by { |_char, n| n }
  # Array#last yields whatever is available when fewer than :top characters
  # were seen; slicing with a negative range start returned nil in that case
  # and the following #reverse raised NoMethodError.
  ranked.last(@config[:top]).reverse
end
Private Instance Methods
count(c)
click to toggle source
# File lib/cbeta/char_freq.rb, line 22
# Adds one occurrence of +c+ to the tally currently being filled.
#
# @param c [Object] key to count (a character, or whatever the caller passes)
# @return [Integer] the updated count for +c+
def count(c)
  @current[c] = @current.fetch(c, 0) + 1
end
handle_node(e)
click to toggle source
# File lib/cbeta/char_freq.rb, line 30
# Dispatches one XML node to the handler that matches its kind.
#
# Comments are ignored, text nodes go to handle_text, and elements are routed
# by tag name; anything without a dedicated rule is descended into.
#
# @param e [Object] node responding to comment?, text?, name and []
def handle_node(e)
  return if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'foreign', 'mulu', 'rdg', 'reg', 'sic'
    nil # these elements, and everything below them, are left out of the tally
  when 'g'
    # The element is counted under its ref attribute instead of its text.
    count(e['ref'])
  when 'note'
    handle_note(e)
  when 't'
    handle_t(e)
  else
    traverse(e)
  end
end
handle_note(e)
click to toggle source
# File lib/cbeta/char_freq.rb, line 43
# Descends into a note only when its place attribute is 'inline' or
# 'interlinear'; any other note contributes nothing to the tally.
#
# @param e [Object] note element responding to []
def handle_note(e)
  case e['place']
  when 'inline', 'interlinear' then traverse(e)
  end
end
handle_t(e)
click to toggle source
# File lib/cbeta/char_freq.rb, line 49
# Descends into a +t+ element unless its place attribute mentions 'foot'.
#
# @param e [Object] element responding to has_attribute? and []
def handle_t(e)
  in_foot = e.has_attribute?('place') && e['place'].include?('foot')
  traverse(e) unless in_foot
end
handle_text(e)
click to toggle source
# File lib/cbeta/char_freq.rb, line 56
# Counts every character of a text node except those listed in CBETA::PUNCS.
#
# Text that is empty after removing one trailing line break, and text whose
# parent element is named 'app', is skipped.
#
# @param e [Object] text node responding to content and parent
# @return [String, nil] the text with line breaks removed, or nil when skipped
def handle_text(e)
  text = e.content.chomp
  return if text.empty? || e.parent.name == 'app'

  # CBETA XML contains superfluous line breaks between characters; drop them.
  text = text.gsub(/[\n\r]/, '')
  text.each_char { |ch| count(ch) unless CBETA::PUNCS.include?(ch) }
end
stat_all()
click to toggle source
# File lib/cbeta/char_freq.rb, line 70
# Tallies every canon found directly under the XML root, in sorted order.
#
# Hidden entries and the 'schema' folder are not canons and are skipped.
def stat_all
  Dir.entries(@xml_root).sort.each do |canon|
    next if canon.start_with?('.')
    next if canon == 'schema'
    # A plain file at the root (a README, say) is not a canon either; handing
    # it to stat_canon would make Dir.entries raise Errno::ENOTDIR.
    next unless File.directory?(File.join(@xml_root, canon))

    stat_canon(canon)
  end
end
stat_canon(canon)
click to toggle source
# File lib/cbeta/char_freq.rb, line 78
# Tallies every volume folder of one canon, in sorted order.
#
# With +:group_by+ set to 'canon', a fresh tally is started under the canon's
# name and becomes the one that subsequent counts are written to.
#
# @param canon [String, nil] name of the canon folder under the XML root; nil is a no-op
def stat_canon(canon)
  return if canon.nil?

  puts 'stat canon: ' + canon
  @current = @result[canon] = {} if @config[:group_by] == 'canon'

  canon_dir = File.join(@xml_root, canon)
  Dir.entries(canon_dir).sort.each do |entry|
    stat_vol(File.join(canon_dir, entry)) unless entry.start_with?('.')
  end
end
stat_file(fn)
click to toggle source
# File lib/cbeta/char_freq.rb, line 93
# Tallies the characters in the body of one XML file.
#
# With +:group_by+ set to 'work', counts go into a per-work tally. The work ID
# is the file's base name with the volume number removed
# ("T01n0001" -> "T0001"), and every ID starting with "T0220" is folded into
# the single key "T0220".
#
# @param fn [String] path of the XML file
def stat_file(fn)
  if @config[:group_by] == 'work'
    # [A-Z]+ rather than a single letter so that canon prefixes longer than
    # one letter lose their volume number as well.
    work = File.basename(fn, '.xml').sub(/\A([A-Z]+)\d{2,3}n(.*)\z/, '\1\2')
    work = 'T0220' if work.start_with?('T0220')
    puts "stat work: #{work}"
    # Several files can map to the same work (the T0220 folding above makes
    # that explicit), so keep adding to an existing tally instead of replacing
    # it with an empty one and losing the counts from earlier files.
    @current = (@result[work] ||= {})
  else
    puts "stat file: #{fn}"
  end
  doc = CBETA.open_xml(fn)
  traverse(doc.at_xpath('/TEI/text/body'))
end
stat_vol(vol_folder)
click to toggle source
# File lib/cbeta/char_freq.rb, line 109
# Tallies every file of one volume folder, in sorted order, skipping entries
# whose name starts with a dot.
#
# @param vol_folder [String] path of the volume folder
def stat_vol(vol_folder)
  Dir.entries(vol_folder).sort.each do |name|
    stat_file(File.join(vol_folder, name)) unless name.start_with?('.')
  end
end
traverse(e)
click to toggle source
# File lib/cbeta/char_freq.rb, line 117
# Feeds each direct child of +e+ to handle_node, in document order.
#
# @param e [Object] node responding to children
def traverse(e)
  e.children.each do |child|
    handle_node(child)
  end
end