class CBETA::P5aToText

Convert CBETA XML P5a to Text

CBETA XML P5a 可由此取得: github.com/cbeta-git/xml-p5a

@example for convert 大正藏第一冊 in app format:

c = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', format: 'app')
c.convert('T01')

Constants

PASS

內容不輸出的元素

Public Class Methods

new(xml_root, output_root, opts={}) click to toggle source

@param xml_root [String] 來源 CBETA XML P5a 路徑
@param output_root [String] 輸出 Text 路徑
@option opts [String] :format 輸出格式,例:'app',預設是 normal
@option opts [String] :encoding 輸出編碼,預設 'UTF-8'
@option opts [String] :gaiji 缺字處理方式,預設 'default'
@option opts [Boolean] :inline_note 是否呈現夾註,預設為 true

* 'PUA': 缺字一律使用 Unicode PUA
* 'default': 優先使用通用字
# File lib/cbeta/p5a_to_text.rb, line 31
# Set up a converter from CBETA XML P5a to plain text.
#
# @param xml_root [String] root folder of the CBETA XML P5a source
# @param output_root [String] root folder the text output is written under
# @param opts [Hash] overrides merged over the defaults
#   (:format, :encoding, :gaiji, :inline_note)
def initialize(xml_root, output_root, opts={})
  @xml_root, @output_root = xml_root, output_root

  defaults = { format: nil, encoding: 'UTF-8', gaiji: 'default', inline_note: true }
  @settings = defaults.merge!(opts)

  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new
end

Public Instance Methods

convert(target=nil) click to toggle source

CBETA XML P5a 轉為 Text

@example for convert all:

x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert

@example for convert 大正藏第一冊:

x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert('T01')

@example for convert 大正藏全部:

x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert('T')

@example for convert 大正藏第五冊至第七冊:

x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert('T05..T07')

T 是大正藏的 ID, CBETA 的藏經 ID 系統請參考: www.cbeta.org/format/id.php

# File lib/cbeta/p5a_to_text.rb, line 70
# Convert CBETA XML P5a to text.
#
# @param target [String, nil] what to convert (case-insensitive):
#   nil converts everything; a string of at most two characters is treated as
#   a canon ID; a string containing '..' is treated as a volume range
#   ("T05..T07"); anything else is a single volume.
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_canon(arg) if arg.size <= 2
  return handle_vol(arg) unless arg.include?('..')

  # A range that does not look like "FROM..TO" is ignored.
  range = arg.match(/^([^\.]+?)\.\.([^\.]+)$/)
  handle_vols(range[1], range[2]) if range
end

Private Instance Methods

appify(text) click to toggle source

跨行字詞移到下一行

# File lib/cbeta/p5a_to_text.rb, line 90
# Re-flow "app"-format text so that a word split across a line break is moved
# whole onto the following line.
#
# Only lines of the form "<prefix>║<content>" are emitted; other lines are
# dropped. Each emitted line becomes "<prefix>(NN)║<carry><content>", where
# <carry> is the text held back from the previous line and NN is its length.
# Scanning a line's content from the end:
# * a tab stops the scan (tabs are removed from the output);
# * a space or closing punctuation stops the scan and stays on this line;
# * opening punctuation stops the scan and moves to the next line;
# * any other character is held back for the next line.
#
# @param text [String] text containing "║"-delimited lines
# @return [String] the re-flowed text
def appify(text)
  r = ''
  i = 0     # length of the carry-over prepended to the line being emitted
  app = ''  # trailing characters held back for the next line
  text.each_line do |line|
    line.chomp!
    next unless line.match(/^(.*)║(.*)$/)

    r += $1
    t = $2
    r += "(%02d)" % i
    r += "║#{app}"
    app = ''
    chars = t.chars
    until chars.empty?
      c = chars.pop
      if c == "\t"
        break
      elsif '  :》」』、;,!?。'.include? c
        chars << c
        break
      elsif '《「『'.include? c  # these punctuation marks move to the next line
        app = c + app
        break
      else
        app = c + app
      end
    end
    r += chars.join.gsub(/\t/, '') + "\n"
    i = app.size
  end

  # There is no following line to receive the final carry-over, so put it
  # back on the last emitted line instead of silently dropping the text.
  r = r[0...-1] + app + "\n" unless app.empty?
  r
end
convert_all() click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 125
# Convert every canon found under the XML root: each directory entry whose
# name matches CBETA::CANON is passed to handle_canon, in sorted order.
def convert_all
  canon_pattern = /^#{CBETA::CANON}$/
  Dir.entries(@xml_root).sort.each do |entry|
    handle_canon(entry) if entry.match(canon_pattern)
  end
end
e_anchor(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 142
# Render an <anchor> element: '◎' when its type attribute is 'circle',
# otherwise an empty string.
def e_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
e_app(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 152
# Render an <app> element by concatenating the output of its children.
def e_app(e)
  traverse(e)
end
e_byline(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 156
# Render a <byline> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_byline(e)
  content = traverse(e)
  return content + "\t" if @settings[:format]=='app'

  content + "\n"
end
e_cell(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 162
# Render a <cell> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_cell(e)
  traverse(e) + (@settings[:format]=='app' ? "\t" : "\n")
end
e_corr(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 168
# Render a <corr> element: wrap its content in an <r> marker attributed to
# the 【CBETA】 edition.
def e_corr(e)
  "<r w='【CBETA】'>#{traverse(e)}</r>"
end
e_div(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 172
# Render a <div> element by concatenating the output of its children.
def e_div(e)
  traverse(e)
end
e_docNumber(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 176
# Render a <docNumber> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_docNumber(e)
  content = traverse(e)
  return content + "\t" if @settings[:format] == 'app'

  content + "\n"
end
e_figure(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 182
# Render a <figure> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_figure(e)
  traverse(e) + (@settings[:format] == 'app' ? "\t" : "\n")
end
e_foreign(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 188
# Render a <foreign> element. Elements whose place attribute contains 'foot'
# produce no output; all others are rendered from their children.
def e_foreign(e)
  footnote = e.key?('place') && e['place'].include?('foot')
  footnote ? '' : traverse(e)
end
e_g(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 193
# Render a <g> (gaiji, i.e. character missing from the standard set) element.
#
# With the :gaiji setting 'PUA', always answer a Unicode PUA character
# (Siddham and Ranjana ids use their own PUA helpers).
#
# Otherwise the gaiji record is looked up and the first applicable rule wins:
# * Siddham ('SD…') or Ranjana ('RJ…') id: Unicode PUA, except that
#   SD-E35A / SD-E35B render as '(' / ')'
# * record has 'unicode': use its 'unicode-char'
# * record has 'normal_unicode': use it
# * record has 'normal' (normalized form): use it
# * else: Unicode PUA computed from the numeric part of the id
#
# Aborts the process when the id has no gaiji record.
def e_g(e)
  gid = e['ref'][1..-1] # drop the first character of the ref attribute
  siddham = gid.start_with?('SD')
  ranjana = gid.start_with?('RJ')

  if @settings[:gaiji] == 'PUA'
    return CBETA.siddham_pua(gid) if siddham
    return CBETA.ranjana_pua(gid) if ranjana
    return CBETA.pua(gid)
  end

  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if siddham
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return CBETA.siddham_pua(gid)
  end

  return CBETA.ranjana_pua(gid) if ranjana

  # [key that must be present, key whose value is returned]
  lookups = [
    ['unicode', 'unicode-char'],
    ['normal_unicode', 'normal_unicode'],
    ['normal', 'normal']
  ]
  lookups.each do |present, value|
    return g[value] if g.has_key?(present)
  end

  # Unicode PUA
  [0xf0000 + gid[2..-1].to_i].pack('U')
end
e_graphic(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 238
# <graphic> elements produce no text output.
def e_graphic(e)
  ''
end
e_head(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 242
# Render a <head> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_head(e)
  content = traverse(e)
  return content + "\t" if @settings[:format] == 'app'

  content + "\n"
end
e_item(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 248
# Render an <item> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_item(e)
  traverse(e) + (@settings[:format] == 'app' ? "\t" : "\n")
end
e_juan(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 253
# Render a <juan> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_juan(e)
  content = traverse(e)
  return content + "\t" if @settings[:format] == 'app'

  content + "\n"
end
e_l(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 259
# Render an <l> (verse line) element. In 'app' format the content is followed
# by a tab; otherwise by a newline, unless @lg_type is 'abnormal', in which
# case nothing is appended.
# NOTE(review): @lg_type is not assigned anywhere in the visible code —
# confirm where it is set.
def e_l(e)
  line = traverse(e)
  return line + "\t" if @settings[:format] == 'app'
  return line if @lg_type == 'abnormal'

  line + "\n"
end
e_lb(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 269
# Render an <lb> (line break) element.
#
# Elements with type 'old' produce nothing. In 'app' format a newline, the
# element's n attribute and '║' start the new line. Any text waiting in
# @next_line_buf is then flushed, followed by a newline, and the buffer is
# cleared.
def e_lb(e)
  return '' if e['type']=='old'

  out = @settings[:format] == 'app' ? "\n#{e['n']}║" : ''
  return out if @next_line_buf.empty?

  pending = @next_line_buf
  @next_line_buf = ''
  out + pending + "\n"
end
e_lem(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 282
# Render a <lem> element: wrap its content in an <r> marker listing every
# edition that uses this reading. An edition with no <rdg> of its own among
# the following siblings reads the same as the lem.
def e_lem(e)
  remaining = Set.new @editions
  e.xpath('./following-sibling::rdg').each do |rdg|
    remaining.subtract(rdg['wit'].scan(/【.*?】/))
  end

  witnesses = remaining.to_a.join(' ')
  format("<r w='#{witnesses}'>%s</r>", traverse(e))
end
e_lg(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 295
# Render an <lg> element by concatenating the output of its children.
def e_lg(e)
  traverse(e)
end
e_list(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 299
# Render a <list> element: its traversed content, preceded by a newline
# except in 'app' format.
def e_list(e)
  lead = @settings[:format] == 'app' ? '' : "\n"
  lead + traverse(e)
end
e_milestone(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 305
# Render a <milestone> element. A milestone whose unit is 'juan' records its
# number in @juan and emits a "<juan N>" marker; any other milestone emits
# nothing.
def e_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
e_mulu(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 314
# <mulu> elements produce no text output.
def e_mulu(e)
  ''
end
e_note(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 318
# Render a <note> element. Only notes whose place attribute is 'inline' are
# output, wrapped in parentheses, and only when the :inline_note setting is
# truthy; everything else yields an empty string.
def e_note(e)
  return '' unless @settings[:inline_note]
  return '' unless e.has_attribute?('place') && e['place']=='inline'

  "(#{traverse(e)})"
end
e_p(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 328
# Render a <p> element: its traversed content followed by a tab in
# 'app' format, otherwise by a newline.
def e_p(e)
  traverse(e) + (@settings[:format] == 'app' ? "\t" : "\n")
end
e_rdg(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 334
# Render a <rdg> element: wrap its content in an <r> marker whose w attribute
# is the element's wit attribute.
def e_rdg(e)
  witness = e['wit']
  format("<r w='#{witness}'>%s</r>", traverse(e))
end
e_row(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 338
# Render a <row> element by concatenating the output of its children.
def e_row(e)
  traverse(e)
end
e_sg(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 342
# Render an <sg> element: its traversed content wrapped in parentheses.
def e_sg(e)
  inner = traverse(e)
  '(' + inner + ')'
end
e_sic(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 346
# Render a <sic> element: wrap its content in an <r> marker attributed to
# the base edition (@orig).
def e_sic(e)
  open_tag = "<r w='#{@orig}'>"
  open_tag + traverse(e) + "</r>"
end
e_t(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 350
# Render a <t> element (a child of <tt>).
#
# Elements whose place attribute contains 'foot' produce nothing. When the
# enclosing <tt> has type 'app' (not a two-line parallel layout), the content
# is returned as is. Otherwise, by position among the sibling <t> elements:
# the first is output followed by a space, the second is queued in
# @next_line_buf (to be flushed at the next line break) and outputs nothing,
# and any later one is output unchanged.
def e_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  r = traverse(e)

  # not a two-line parallel layout
  return r if @tt_type == 'app'

  # two-line parallel layout
  position = e.xpath('../t').index(e)
  if position == 0
    r + ' '
  elsif position == 1
    @next_line_buf += r + ' '
    ''
  else
    r
  end
end
e_table(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 372
# Render a <table> element by concatenating the output of its children.
def e_table(e)
  traverse(e)
end
e_tt(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 483
# Render a <tt> element. Its type attribute is remembered in @tt_type (read
# by e_t while rendering the child <t> elements) before the children are
# traversed.
def e_tt(e)
  @tt_type = e['type']
  traverse(e)
end
get_editions(doc) click to toggle source

取得所有對校版本

# File lib/cbeta/p5a_to_text.rb, line 133
# Collect every collated edition mentioned in the document.
#
# The result always contains the base edition (@orig) and 【CBETA】, plus every
# 【…】 siglum found in the wit attribute of any <lem> or <rdg>.
#
# @return [Set<String>]
def get_editions(doc)
  witnesses = doc.xpath('//lem|//rdg').flat_map { |e| e['wit'].scan(/【.*?】/) }
  Set.new([@orig, "【CBETA】"]).merge(witnesses)
end
handle_canon(c) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 376
# Convert every volume of one canon: each entry of <xml_root>/<canon> that
# does not start with '.' is passed to handle_vol, in sorted order.
#
# @param c [String] canon ID
def handle_canon(c)
  @canon = c
  puts 'handle_canon ' + c
  volumes = Dir.entries(File.join(@xml_root, @canon)).sort
  volumes.each { |vol| handle_vol(vol) unless vol.start_with?('.') }
end
handle_node(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 386
# Render one XML node to text.
#
# Comments produce nothing, text nodes go through handle_text, and elements
# named in PASS produce nothing. Other elements are dispatched by tag name to
# their e_* handler; a tag with no handler is rendered from its children.
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  tag = e.name
  return '' if PASS.include?(tag)

  case tag
  when 'back', 'reg', 'teiHeader' then ''
  when 'unclear'   then '▆'
  when 'anchor'    then e_anchor(e)
  when 'app'       then e_app(e)
  when 'byline'    then e_byline(e)
  when 'cell'      then e_cell(e)
  when 'corr'      then e_corr(e)
  when 'div'       then e_div(e)
  when 'docNumber' then e_docNumber(e)
  when 'figure'    then e_figure(e)
  when 'foreign'   then e_foreign(e)
  when 'g'         then e_g(e)
  when 'graphic'   then e_graphic(e)
  when 'head'      then e_head(e)
  when 'item'      then e_item(e)
  when 'juan'      then e_juan(e)
  when 'l'         then e_l(e)
  when 'lb'        then e_lb(e)
  when 'lem'       then e_lem(e)
  when 'lg'        then e_lg(e)
  when 'list'      then e_list(e)
  when 'mulu'      then e_mulu(e)
  when 'note'      then e_note(e)
  when 'milestone' then e_milestone(e)
  when 'p'         then e_p(e)
  when 'rdg'       then e_rdg(e)
  when 'row'       then e_row(e)
  when 'sic'       then e_sic(e)
  when 'sg'        then e_sg(e)
  when 'tt'        then e_tt(e)
  when 't'         then e_t(e)
  when 'table'     then e_table(e)
  else traverse(e)
  end
end
handle_sutra(xml_fn) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 430
# Convert one XML file and write its text out, one call to write_juan per
# juan.
#
# Resets the per-file state, renders the file with parse_xml, creates the
# output folder <out_vol>/<sutra_no>, then splits the rendered text on the
# "<juan N>" markers. Text that precedes the first marker is prepended to
# the first juan.
#
# @param xml_fn [String] path of the source XML file
def handle_sutra(xml_fn)
  @dila_note = 0
  @div_count = 0
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple files
  # with a trailing a, b, c... Drop that suffix for output.
  multi_vol = @sutra_no.match(/^(T05|T06|T07)n0220/)
  @sutra_no = "#{multi_vol[1]}n0220" if multi_vol

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  juan_no = nil
  buf = ''
  # one file per juan
  text.split(/(<juan \d+>)/).each do |piece|
    if piece =~ /<juan (\d+)>$/
      juan_no = $1.to_i
    elsif juan_no.nil?
      buf = piece
    else
      write_juan(juan_no, buf + piece)
      buf = ''
    end
  end
end
handle_text(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 471
# Render a text node.
#
# Returns '' for text that is empty after chomping and for text directly
# inside <app>. Otherwise line breaks are removed (CBETA XML has superfluous
# newlines inside text) and the result is HTML-escaped, e.g. & becomes &amp;.
def handle_text(e)
  s = e.content().chomp
  return '' if s.empty? || e.parent.name == 'app'

  CGI.escapeHTML(s.delete("\n\r"))
end
handle_vol(vol) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 488
# Convert one volume.
#
# Determines the canon and its base-edition symbol (aborting when the canon
# has none), recreates the output folder <output_root>/<canon>/<vol> from
# scratch, then passes every entry of the source folder that does not start
# with '.' to handle_sutra, in sorted order.
#
# @param vol [String] volume ID, e.g. 'T01'
def handle_vol(vol)
  print vol + ' '

  @canon = CBETA.get_canon_from_vol(vol)
  @orig = @cbeta.get_canon_symbol(@canon)
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @out_vol = File.join(@output_root, @canon, vol)
  # start from an empty output folder
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @canon, vol)
  Dir.entries(source).sort.each do |f|
    handle_sutra(File.join(source, f)) unless f.start_with?('.')
  end
end
handle_vols(v1, v2) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 508
# Convert an inclusive range of volumes of one canon.
#
# The canon is derived from the first volume; every entry of
# <xml_root>/<canon> whose name sorts between v1 and v2 (inclusive, string
# comparison) is passed to handle_vol, in sorted order.
#
# @param v1 [String] first volume ID, e.g. 'T05'
# @param v2 [String] last volume ID, e.g. 'T07'
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  # get_canon_from_vol lives on CBETA (as used by handle_vol); calling it
  # without a receiver raised NoMethodError.
  @canon = CBETA.get_canon_from_vol(v1)
  folder = File.join(@xml_root, @canon)
  Dir.entries(folder).sort.each do |vol|
    next if vol < v1 || vol > v2
    handle_vol(vol)
  end
end
open_xml(fn) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 519
# Read and parse an XML file with Nokogiri, strip its namespaces, and return
# the document.
#
# @param fn [String] path of the XML file
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces!() }
end
parse_xml(xml_fn) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 526
# Parse one XML file and render it to text.
#
# Stores the document's collated editions in @editions, then renders the
# first node matched by "text/body" under the root.
#
# @param xml_fn [String] path of the XML file
# @return [String] rendered text of the body
def parse_xml(xml_fn)
  doc = open_xml(xml_fn)
  @editions = get_editions(doc)
  traverse(doc.root().xpath("text/body")[0])
end
traverse(e) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 536
# Render every child of a node with handle_node and concatenate the results.
# A nil result is logged to stdout before the concatenation is attempted.
#
# @return [String] '' when the node has no children
def traverse(e)
  e.children.inject('') do |acc, child|
    piece = handle_node(child)
    puts "handle_node return nil, node: " + child.to_s if piece.nil?
    acc + piece
  end
end
write_juan(juan_no, txt) click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 546
# Write the text of one juan, one file per collated edition.
#
# For each edition in @editions the marked-up text is re-parsed, every <r>
# marker whose w attribute mentions the edition is replaced by its content,
# and every other <r> marker is removed. The remaining text (run through
# appify in 'app' format) is written to <out_sutra>/<NNN>/<name>.txt, where
# <name> is "<base>-orig" for the base edition, "CBETA" for 【CBETA】, and
# "<base>→<edition>" for any other edition.
#
# @param juan_no [Integer] juan number, used as a zero-padded folder name
# @param txt [String] rendered text containing <r w='…'> markers
def write_juan(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)
  @editions.each do |ed|
    # The fragment is modified in place, so it is re-parsed for each edition.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      if node['w'].include? ed
        node.add_previous_sibling node.inner_html
      end
      node.remove
    end
    text = frag.content
    text = appify(text) if @settings[:format] == 'app'

    ed2 = ed.sub(/^【(.*?)】$/, '\1')
    if ed == @orig
      fn = "#{ed2}-orig.txt"
    else
      unless ed2 == 'CBETA'
        ed2 = @orig.sub(/^【(.*?)】$/, '\1') + '→' + ed2
      end
      fn = "#{ed2}.txt"
    end
    output_path = File.join(folder, fn)
    # Block form closes the file even when the write raises.
    File.open(output_path, 'w', encoding: @settings[:encoding]) do |fo|
      fo.write(text)
    end
  end
end