class CBETA::CharFrequency

Public Class Methods

new(xml_root, opts={}) click to toggle source

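@option opts [Integer] :top number of most frequent characters to return (default: 10)

@option opts [String] :group_by optional; 'canon' or 'work' to keep a separate count per canon or per work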

# File lib/cbeta/char_freq.rb, line 3
# Sets up a character-frequency counter.
#
# @param xml_root [String] folder whose entries are scanned as canon folders
# @param opts [Hash] overrides for the default configuration
# @option opts [Integer] :top how many of the most frequent entries
#   char_freq returns (default: 10)
# @option opts [String] :group_by read by stat_canon ('canon') and
#   stat_file ('work') to switch to a separate tally per group
def initialize(xml_root, opts={})
  @xml_root = xml_root
  @config = { top: 10 }.merge(opts)
  # Until a grouping option redirects it, counting goes straight into @result.
  @current = @result = {}
end

Public Instance Methods

char_freq(canon=nil) click to toggle source
# File lib/cbeta/char_freq.rb, line 13
# Tallies characters and returns the most frequent ones.
#
# @param canon [String, nil] canon folder name to scan; when nil every
#   canon found by stat_all is scanned
# @return [Array<Array>] at most @config[:top] [key, count] pairs taken from
#   @result, ordered from most to least frequent
def char_freq(canon=nil)
  if canon.nil?
    stat_all
  else
    stat_canon(canon)
  end

  ranked = @result.sort_by { |_key, freq| freq }
  # Array#last copes with fewer than :top entries (even none); the previous
  # negative-range slice returned nil in that case and crashed on #reverse.
  ranked.last(@config[:top]).reverse
end

Private Instance Methods

count(c) click to toggle source
# File lib/cbeta/char_freq.rb, line 22
# Adds one occurrence of +c+ to the tally currently being filled (@current).
#
# @param c [Object] the key to count (a character, or a <g> ref value)
# @return [Integer] the updated count for +c+
def count(c)
  @current[c] = @current.fetch(c, 0) + 1
end
handle_node(e) click to toggle source
# File lib/cbeta/char_freq.rb, line 30
# Routes a single XML node to the handler that matches its kind.
#
# Comments are ignored, text nodes go to handle_text, and elements are
# dispatched on their tag name.
#
# @param e [Object] node responding to comment?, text?, name and []
def handle_node(e)
  return if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'foreign', 'mulu', 'rdg', 'reg', 'sic'
    nil # content of these elements is never counted
  when 'g'
    # a <g> element is counted once, keyed by its ref attribute
    count(e['ref'])
  when 'note'
    handle_note(e)
  when 't'
    handle_t(e)
  else
    traverse(e)
  end
end
handle_note(e) click to toggle source
# File lib/cbeta/char_freq.rb, line 43
# Descends into a <note> only when its place attribute is 'inline' or
# 'interlinear'; any other note is left uncounted.
#
# @param e [Object] note element responding to []
def handle_note(e)
  traverse(e) if %w(inline interlinear).include?(e['place'])
end
handle_t(e) click to toggle source
# File lib/cbeta/char_freq.rb, line 49
# Descends into a <t> element unless its place attribute contains 'foot'.
#
# @param e [Object] element responding to has_attribute? and []
def handle_t(e)
  in_foot = e.has_attribute?('place') && e['place'].include?('foot')
  traverse(e) unless in_foot
end
handle_text(e) click to toggle source
# File lib/cbeta/char_freq.rb, line 56
# Counts every character of a text node except line breaks and the
# characters listed in CBETA::PUNCS.
#
# Nothing is counted for empty text or for text whose parent element is
# named 'app'.
#
# @param e [Object] text node responding to content and parent
def handle_text(e)
  text = e.content.chomp
  return if text.empty?
  return if e.parent.name == 'app'

  # CBETA XML carries superfluous line breaks between characters
  text.delete!("\n\r")

  text.each_char { |ch| count(ch) unless CBETA::PUNCS.include?(ch) }
end
stat_all() click to toggle source
# File lib/cbeta/char_freq.rb, line 70
# Runs stat_canon for every entry of @xml_root, in sorted order, skipping
# dot-prefixed entries and the 'schema' entry.
def stat_all
  Dir.entries(@xml_root).sort.each do |entry|
    skipped = entry.start_with?('.') || entry == 'schema'
    stat_canon(entry) unless skipped
  end
end
stat_canon(canon) click to toggle source
# File lib/cbeta/char_freq.rb, line 78
# Tallies every volume folder of one canon.
#
# When @config[:group_by] is 'canon', a fresh tally for this canon is
# stored in @result and becomes the counting target.
#
# @param canon [String, nil] canon folder name under @xml_root; nil is a no-op
def stat_canon(canon)
  return if canon.nil?

  puts 'stat canon: ' + canon
  @current = @result[canon] = {} if @config[:group_by] == 'canon'

  canon_dir = File.join(@xml_root, canon)
  Dir.entries(canon_dir).sort.each do |vol|
    stat_vol(File.join(canon_dir, vol)) unless vol.start_with?('.')
  end
end
stat_file(fn) click to toggle source
# File lib/cbeta/char_freq.rb, line 93
# Tallies the characters of one XML file.
#
# When @config[:group_by] is 'work', counting is redirected to a per-work
# tally keyed by a work id derived from the file name: the volume part
# (two or three digits followed by 'n') is removed, and every id starting
# with 'T0220' is folded into the single key 'T0220'.
#
# @param fn [String] path of the XML file
def stat_file(fn)
  if @config[:group_by] == 'work'
    work = File.basename(fn, '.xml').sub(/^([A-Z])\d{2,3}n(.*)$/, '\1\2')
    work = 'T0220' if work.start_with?('T0220')
    puts "stat work: #{work}"
    # Several files can map to the same work id (all T0220* files do), so
    # reuse an existing tally rather than replacing it; resetting it here
    # discarded the counts of every earlier file of that work.
    @result[work] ||= {}
    @current = @result[work]
  else
    puts "stat file: #{fn}"
  end

  doc = CBETA.open_xml(fn)
  traverse(doc.at_xpath('/TEI/text/body'))
end
stat_vol(vol_folder) click to toggle source
# File lib/cbeta/char_freq.rb, line 109
# Runs stat_file for every entry of a volume folder, in sorted order,
# skipping dot-prefixed entries.
#
# @param vol_folder [String] path of the volume folder
def stat_vol(vol_folder)
  Dir.entries(vol_folder).sort.each do |entry|
    stat_file(File.join(vol_folder, entry)) unless entry.start_with?('.')
  end
end
traverse(e) click to toggle source
# File lib/cbeta/char_freq.rb, line 117
# Passes each child of +e+ to handle_node, in document order.
#
# @param e [Object] node responding to children
def traverse(e)
  e.children.each do |child|
    handle_node(child)
  end
end