class CBETA::P5aToSimpleHTML

Convert CBETA XML P5a to simple HTML

CBETA XML P5a 可由此取得: github.com/cbeta-git/xml-p5a

@example for convert 大正藏第一冊:

c = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
c.convert('T01')

Constants

PASS

內容不輸出的元素

Public Class Methods

new(xml_root, output_root, gaiji_base, opts={}) click to toggle source

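@param xml_root [String] 來源 CBETA XML P5a 路徑

@param output_root [String] 輸出 HTML 路徑

@param gaiji_base [String] 缺字資料路徑 (傳給 CBETA::Gaiji.new)

@param opts [Hash] 選項, 預設為 { multi_edition: false }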

# File lib/cbeta/p5a_to_simple_html.rb, line 28
# Set up a converter instance.
#
# @param xml_root [String] root folder of the CBETA XML P5a sources
# @param output_root [String] root folder that receives the generated files
# @param gaiji_base [String] handed unchanged to CBETA::Gaiji.new
#   (presumably the location of the gaiji data -- confirm against CBETA::Gaiji)
# @param opts [Hash] overrides merged over the defaults ({ multi_edition: false })
def initialize(xml_root, output_root, gaiji_base, opts={})
  @xml_root, @output_root = xml_root, output_root
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new(gaiji_base)
  # Caller-supplied options win over the defaults.
  @config = { multi_edition: false }.merge(opts)
end

Public Instance Methods

convert(target=nil) click to toggle source

CBETA XML P5a 轉為 Simple HTML

@example for convert 大正藏第一冊:

x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
x2h.convert('T01')

@example for convert 大正藏全部:

x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
x2h.convert('T')

@example for convert 大正藏第五冊至第七冊:

x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
x2h.convert('T05..T07')

T 是大正藏的 ID, CBETA 的藏經 ID 系統請參考: www.cbeta.org/format/id.php

# File lib/cbeta/p5a_to_simple_html.rb, line 55
# Entry point: convert everything, one collection, one volume, or a volume range.
#
# The target is upper-cased first, then dispatched:
# * nil                      -> convert_all
# * at most 2 characters     -> handle_collection (e.g. "T")
# * contains ".."            -> handle_vols (e.g. "T05..T07"); a malformed
#                               range that does not match the pattern is ignored
# * anything else            -> handle_vol (e.g. "T01")
#
# @param target [String, nil] collection ID, volume ID, or "V1..V2" range
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_collection(arg) if arg.size <= 2
  return handle_vol(arg) unless arg.include?('..')

  arg.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
end

Private Instance Methods

convert_all() click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 76
# Convert every collection found directly under @xml_root.
# Entries are visited in sorted order; names starting with "." or longer
# than two characters are not treated as collections.
def convert_all
  entries = Dir.entries(@xml_root).sort
  entries.each do |name|
    next if name.start_with?('.') || name.size > 2

    handle_collection(name)
  end
end
handle_anchor(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 84
# Render an <anchor> element.
#
# @param e [#has_attribute?, #[]] the anchor node
# @return [String] "◎" when the anchor has type="circle", otherwise ""
def handle_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
handle_collection(c) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 94
# Convert every volume folder of one collection.
# Records the collection in @series, then walks the entries of
# <xml_root>/<collection> in sorted order, skipping dot entries.
#
# @param c [String] collection ID (e.g. "T")
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c
  volumes = Dir.entries(File.join(@xml_root, @series)).sort
  volumes.each do |vol|
    handle_vol(vol) unless vol.start_with?('.')
  end
end
handle_corr(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 104
# Render a <corr> element.
# In multi-edition mode the content is wrapped in an <r> marker attributed
# to the CBETA edition; otherwise the content is returned as is.
#
# @param e the corr node
# @return [String]
def handle_corr(e)
  inner = traverse(e)
  return inner unless @config[:multi_edition]

  "<r w='【CBETA】'>#{inner}</r>"
end
handle_foreign(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 112
# Render a <foreign> element.
# Elements whose "place" attribute mentions "foot" produce no output;
# everything else is rendered through traverse.
#
# @param e [#key?, #[]] the foreign node
# @return [String]
def handle_foreign(e)
  in_footnote = e.key?('place') && e['place'].include?('foot')
  in_footnote ? '' : traverse(e)
end
handle_g(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 117
# Render a <g> (gaiji / non-standard character) reference.
#
# The gaiji ID is the "ref" attribute without its first character.
# Resolution order:
# * IDs starting with "SD" or "RJ" (Siddham / Ranjana script):
#   the entry's 'symbol', else its 'romanized', else its 'pua' value.
# * Otherwise the first non-empty of 'uni_char', 'norm_uni_char',
#   'norm_big5_char'.
# * Otherwise a Unicode private-use character computed from the numeric
#   part of the ID (0xF0000 + number).
#
# Aborts the process when the ID is not present in @gaijis.
#
# @param e [#[]] the g node
# @return [String]
def handle_g(e)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  # Siddham or Ranjana script
  if gid.start_with?('SD', 'RJ')
    %w[symbol romanized].each { |field| return g[field] if g.key?(field) }
    return g['pua']
  end

  %w[uni_char norm_uni_char norm_big5_char].each do |field|
    return g[field] unless g[field].empty?
  end

  # Fall back to the Unicode private use area.
  code_point = 0xf0000 + gid[2..-1].to_i
  [code_point].pack('U')
end
handle_item(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 147
# Render an <item> element: its content, prefixed by the "n" attribute
# when that attribute is present.
#
# @param e [#key?, #[]] the item node
# @return [String]
def handle_item(e)
  body = traverse(e)
  e.key?('n') ? e['n'] + body : body
end
handle_lb(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 155
# Render an <lb> (line break) element.
#
# Line breaks with type="old" produce nothing. Otherwise the line number is
# remembered in @lb and an anchor for it is emitted. Any text parked in
# @next_line_buf (see handle_t) is flushed right after the anchor, followed
# by a newline, and the buffer is cleared.
#
# @param e [#[]] the lb node
# @return [String]
def handle_lb(e)
  return '' if e['type'] == 'old'

  @lb = e['n']
  anchor = %(<a id="lb#{@lb}"></a>)
  return anchor if @next_line_buf.empty?

  pending = @next_line_buf
  @next_line_buf = ''
  anchor + pending + "\n"
end
handle_lem(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 166
# Render a <lem> element.
#
# In multi-edition mode the witnesses named in the "wit" attribute (each a
# 【...】 marker) are added to @editions and the content is wrapped in an <r>
# marker whose "w" attribute lists those witnesses separated by spaces.
# Otherwise the content is returned unchanged.
#
# @param e [#[]] the lem node
# @return [String]
def handle_lem(e)
  inner = traverse(e)
  return inner unless @config[:multi_edition]

  witnesses = e['wit'].scan(/【.*?】/)
  @editions.merge(witnesses)
  "<r w='#{witnesses.join(' ')}'>#{inner}</r>"
end
handle_milestone(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 177
# Render a <milestone> element.
#
# Only unit="juan" milestones produce output: the juan number is stored in
# @juan and a "<juan N>" marker is emitted (handle_sutra later splits the
# text on these markers). When a line number is already known (@lb), its
# anchor is repeated after the marker.
#
# @param e [#[]] the milestone node
# @return [String]
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  marker = "<juan #{@juan}>"
  @lb.nil? ? marker : marker + %(<a id="lb#{@lb}"></a>)
end
handle_node(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 187
# Dispatch a single XML node to the handler for its kind.
#
# Comments yield "", text nodes go to handle_text, and elements listed in
# PASS yield "". Remaining elements are dispatched by name; names without a
# dedicated handler are rendered by traversing their children.
#
# @param e the node to render
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?
  return '' if PASS.include?(e.name)

  case e.name
  # Elements that never contribute to the output.
  when 'back', 'graphic', 'mulu', 'reg', 'teiHeader' then ''
  when 'unclear'   then '▆'
  when 'anchor'    then handle_anchor(e)
  when 'corr'      then handle_corr(e)
  when 'foreign'   then handle_foreign(e)
  when 'g'         then handle_g(e)
  when 'item'      then handle_item(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'milestone' then handle_milestone(e)
  when 'note'      then handle_note(e)
  when 'rdg'       then handle_rdg(e)
  when 'sg'        then handle_sg(e)
  when 'sic'       then handle_sic(e)
  when 't'         then handle_t(e)
  when 'tt'        then handle_tt(e)
  else traverse(e)
  end
end
handle_note(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 217
# Render a <note> element.
# Only notes with place="inline" are kept; their content is wrapped in
# parentheses. All other notes produce no output.
#
# @param e [#has_attribute?, #[]] the note node
# @return [String]
def handle_note(e)
  inline = e.has_attribute?('place') && e['place'] == 'inline'
  return '' unless inline

  "(#{traverse(e)})"
end
handle_rdg(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 225
# Render a <rdg> (variant reading) element.
#
# Variant readings only appear in multi-edition mode. The witnesses named in
# "wit" (each a 【...】 marker) are added to @editions and the content is
# wrapped in an <r> marker carrying the raw "wit" value.
#
# @param e [#[]] the rdg node
# @return [String] "" when multi-edition output is disabled
def handle_rdg(e)
  return '' unless @config[:multi_edition]

  inner = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{inner}</r>"
end
handle_sg(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 234
# Render an <sg> element: its content wrapped in parentheses.
#
# @param e the sg node
# @return [String]
def handle_sg(e)
  "(#{traverse(e)})"
end
handle_sic(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 238
# Render a <sic> element.
# Only emitted in multi-edition mode, where the content is wrapped in an <r>
# marker attributed to the base edition held in @orig.
#
# @param e the sic node
# @return [String] "" when multi-edition output is disabled
def handle_sic(e)
  return '' unless @config[:multi_edition]

  "<r w='#{@orig}'>#{traverse(e)}</r>"
end
handle_sutra(xml_fn) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 244
# Convert one XML file and write one output per juan.
#
# Resets the per-file state, renders the document body via parse_xml, creates
# the output folder <@out_vol>/<sutra number>, then splits the rendered text
# on the "<juan N>" markers and hands every juan to write_juan. Text that
# precedes the first marker is prepended to the first juan.
#
# @param xml_fn [String] path of the XML file to convert
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"

  # Per-file state, reset before the document is walked.
  @dila_note = 0
  @div_count = 0
  @editions = Set.new ["【CBETA】"]
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")
  @lb = nil

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple files
  # with a trailing a, b, c... on the file name. Drop that suffix on output.
  vol_prefix = @sutra_no[/^(T05|T06|T07)n0220/, 1]
  @sutra_no = "#{vol_prefix}n0220" if vol_prefix

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  # One output per juan. The capture group keeps the markers in the split
  # result, so markers and text chunks alternate.
  juan_no = nil
  pending = ''
  text.split(/(<juan \d+>)/).each do |chunk|
    marker = chunk.match(/<juan (\d+)>$/)
    if marker
      juan_no = marker[1].to_i
    elsif juan_no.nil?
      # Text before the first juan marker: hold it for the first juan.
      pending = chunk
    else
      write_juan(juan_no, pending + chunk)
      pending = ''
    end
  end
end
handle_t(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 287
# Render a <t> element (one member of a <tt> group).
#
# * A "place" attribute containing "foot" suppresses the element.
# * When the enclosing <tt> has type "app" (see handle_tt) the content is
#   returned unchanged -- this is not a two-line parallel layout.
# * Otherwise (two-line parallel layout) the position among the sibling <t>
#   elements decides: the first is emitted followed by a space, the second is
#   parked in @next_line_buf (flushed by handle_lb) and emits nothing, any
#   other is emitted unchanged.
#
# @param e the t node
# @return [String]
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  r = traverse(e)

  # Not a two-line parallel layout.
  return r if @tt_type == 'app'

  # Two-line parallel layout.
  case e.xpath('../t').index(e)
  when 0 then r + ' '
  when 1
    @next_line_buf += r + ' '
    ''
  else r
  end
end
handle_text(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 309
# Render a text node.
#
# Returns "" for empty text and for text sitting directly inside an <app>
# element. Otherwise line breaks are stripped (the CBETA XML contains
# superfluous newlines between characters) and the result is HTML-escaped.
#
# @param e [#content, #parent] the text node
# @return [String]
def handle_text(e)
  s = e.content.chomp
  return '' if s.empty? || e.parent.name == 'app'

  # Escape &, <, > and quotes for HTML output.
  CGI.escapeHTML(s.delete("\n\r"))
end
handle_tt(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 321
# Render a <tt> element.
# Remembers the element's "type" in @tt_type (consulted by handle_t) and
# then renders the children.
#
# @param e [#[]] the tt node
# @return [String]
def handle_tt(e)
  kind = e['type']
  @tt_type = kind
  traverse(e)
end
handle_vol(vol) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 326
# Convert every file of one volume.
#
# Derives the collection from the volume ID, looks up the symbol of the base
# edition (aborting when there is none), recreates the output folder
# <output_root>/<collection>/<vol> from scratch, and converts every entry of
# <xml_root>/<collection>/<vol> with handle_sutra.
#
# @param vol [String] volume ID (e.g. "T01")
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  @vol = vol
  @series = CBETA.get_canon_from_vol(vol)

  @orig = @cbeta.get_canon_symbol(@series)
  abort "未處理底本" if @orig.nil?
  # Base-edition symbol without the surrounding 【 】.
  @orig_short = @orig.sub(/^【(.*)】$/, '\1')

  # Start from an empty output folder; any previous output is discarded.
  @out_vol = File.join(@output_root, @series, vol)
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @series, vol)
  Dir[source+"/*"].each do |xml_fn|
    handle_sutra(xml_fn)
  end
end
handle_vols(v1, v2) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 347
# Convert an inclusive range of volumes within one collection.
#
# The collection is derived from the first volume ID. Volume folders are
# visited in sorted order, and dot entries (".", "..", hidden files) are
# skipped explicitly, matching handle_collection. Membership in the range is
# decided by plain string comparison against v1 and v2.
#
# @param v1 [String] first volume ID of the range (e.g. "T05")
# @param v2 [String] last volume ID of the range (e.g. "T07")
# @return [nil]
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = CBETA.get_canon_from_vol(v1)
  folder = File.join(@xml_root, @series)
  # Sorting makes the processing order independent of the filesystem.
  Dir.entries(folder).sort.each do |vol|
    next if vol.start_with?('.')
    next if vol < v1 || vol > v2

    handle_vol(vol)
  end
  nil
end
open_xml(fn) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 358
# Read an XML file and parse it with Nokogiri, dropping all namespaces so
# that later XPath queries can use bare element names.
#
# @param fn [String] path of the XML file
# @return the parsed document
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces! }
end
parse_xml(xml_fn) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 365
# Parse an XML file and render the first text/body element below the
# document root.
#
# @param xml_fn [String] path of the XML file
# @return [String] the rendered body
def parse_xml(xml_fn)
  body = open_xml(xml_fn).root.xpath("text/body").first
  traverse(body)
end
traverse(e) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 373
# Render all children of a node, in order, and concatenate the results.
#
# @param e [#children] the parent node
# @return [String] "" when the node has no children
def traverse(e)
  e.children.inject('') { |out, child| out + handle_node(child) }
end
write_juan(juan_no, txt) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 382
# Write one juan.
# In multi-edition mode this delegates to write_juan_for_editions; otherwise
# a single file <@out_sutra>/NNN.html (juan number zero-padded to three
# digits) is written.
#
# @param juan_no [Integer] juan number
# @param txt [String] rendered content of the juan
def write_juan(juan_no, txt)
  return write_juan_for_editions(juan_no, txt) if @config[:multi_edition]

  target = File.join(@out_sutra, format('%03d.html', juan_no))
  write_juan_to_file(target, txt)
end
write_juan_for_editions(juan_no, txt) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 391
# Write one file per known edition for a single juan.
#
# For every edition in @editions the rendered text is re-parsed as an XML
# fragment and each <r w="..."> marker is resolved: when its "w" attribute
# names the edition, the marker is replaced by its text; otherwise it is
# dropped. The result goes to <@out_sutra>/NNN/<name>.html, where <name> is
# the edition without 【 】, prefixed with "<base edition>→" unless it is
# CBETA or the base edition itself.
#
# @param juan_no [Integer] juan number
# @param txt [String] rendered juan containing <r> markers
def write_juan_for_editions(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)
  @editions.each do |ed|
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      # "w" may list several witnesses (e.g. "【宋】 【元】" from handle_lem or
      # "【宋】【元】" from handle_rdg), so test for membership rather than
      # equality; an equality test would drop the reading from every edition.
      if node['w'].to_s.include?(ed)
        node.add_previous_sibling(node.text)
      end
      node.remove
    end
    html = to_html(frag)

    fn = ed.sub(/^【(.*?)】$/, '\1')
    if fn != 'CBETA' and fn != @orig_short
      fn = @orig_short + '→' + fn
    end
    fn = "#{fn}.html"
    output_path = File.join(folder, fn)
    write_juan_to_file(output_path, html)
  end
end
write_juan_to_file(fn, html) click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 414
  # Wrap rendered content in a minimal HTML page (doctype, UTF-8 meta tag,
  # body) and write it to the given file.
  #
  # @param fn [String] path of the file to write
  # @param html [String] content placed inside <body>
  def write_juan_to_file(fn, html)
    head = [
      '<!DOCTYPE html>',
      '<html>',
      '<head>',
      '  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">',
      '</head>'
    ].map { |line| line + "\n" }.join
    File.write(fn, head + "<body>#{html}</body></html>")
  end