class CBETA::P5aToSimpleHTML
Convert CBETA XML P5a to simple HTML
-
HTML 中除了純文字之外,只有行號標記
-
每一卷、每個校勘版本都產生一個檔案
CBETA
XML P5a 可由此取得: github.com/cbeta-git/xml-p5a
@example for convert 大正藏第一冊:
c = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
c.convert('T01')
Constants
- PASS
內容不輸出的元素
Public Class Methods
new(xml_root, output_root, gaiji_base, opts={})
click to toggle source
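@param xml_root [String] 來源 CBETA XML P5a 路徑
@param output_root [String] 輸出 HTML 路徑
@param gaiji_base 缺字資料來源,會原樣傳給 CBETA::Gaiji.new
@param opts [Hash] 選項;預設為 { multi_edition: false }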
# File lib/cbeta/p5a_to_simple_html.rb, line 28
# Set up a converter.
#
# @param xml_root [String] root folder of the source XML; joined with series/volume names later
# @param output_root [String] root folder that output volumes are created under
# @param gaiji_base handed unchanged to CBETA::Gaiji.new
# @param opts [Hash] overrides merged on top of the defaults ({ multi_edition: false })
def initialize(xml_root, output_root, gaiji_base, opts={})
  @xml_root    = xml_root
  @output_root = output_root
  @cbeta       = CBETA.new
  @gaijis      = CBETA::Gaiji.new(gaiji_base)

  defaults = { multi_edition: false }
  @config  = defaults.merge!(opts)
end
Public Instance Methods
convert(target=nil)
click to toggle source
將 CBETA
XML P5a 轉為 Simple HTML
@example for convert 大正藏第一冊:
x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
x2h.convert('T01')
@example for convert 大正藏全部:
x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
x2h.convert('T')
@example for convert 大正藏第五冊至第七冊:
x2h = CBETA::P5aToSimpleHTML.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', '/PATH/TO/GAIJI')
x2h.convert('T05..T07')
T 是大正藏的 ID, CBETA
的藏經 ID 系統請參考: www.cbeta.org/format/id.php
# File lib/cbeta/p5a_to_simple_html.rb, line 55
# Entry point: decide what to convert from the (case-insensitive) target.
#
# * nil                      -> everything (convert_all)
# * one or two characters    -> a whole collection
# * "A..B"                   -> the volume range A through B
# * anything else            -> a single volume
#
# A target that contains ".." but does not fit the "A..B" pattern is ignored.
#
# @param target [String, nil]
def convert(target=nil)
  return convert_all if target.nil?

  id = target.upcase
  return handle_collection(id) if id.size <= 2
  return handle_vol(id) unless id.include?('..')

  range = id.match(/^([^\.]+?)\.\.([^\.]+)$/)
  handle_vols(range[1], range[2]) if range
end
Private Instance Methods
convert_all()
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 76
# Convert every collection found directly under @xml_root, in sorted order.
# Dot entries and names longer than two characters are skipped.
def convert_all
  entries = Dir.entries(@xml_root).sort
  entries.each do |name|
    skip = name.start_with?('.') || name.size > 2
    handle_collection(name) unless skip
  end
end
handle_anchor(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 84
# An anchor whose type attribute is "circle" becomes '◎'; every other anchor
# produces no output.
def handle_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
handle_collection(c)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 94
# Convert every volume folder of collection +c+ (a sub-folder of @xml_root),
# in sorted order, skipping dot entries. Records +c+ in @series.
def handle_collection(c)
  @series = c
  puts 'handle_collection ' + c

  volumes = Dir.entries(File.join(@xml_root, @series)).sort
  volumes.each do |vol|
    handle_vol(vol) unless vol.start_with?('.')
  end
end
handle_corr(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 104
# Render a <corr> element. In multi-edition mode the content is wrapped in an
# <r> marker attributed to 【CBETA】; otherwise it is returned as-is.
def handle_corr(e)
  inner = traverse(e)
  return inner unless @config[:multi_edition]

  "<r w='【CBETA】'>#{inner}</r>"
end
handle_foreign(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 112
# Render a <foreign> element, dropping it when its place attribute mentions
# "foot".
def handle_foreign(e)
  footnote = e.key?('place') && e['place'].include?('foot')
  footnote ? '' : traverse(e)
end
handle_g(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 117
# Resolve a <g> (gaiji / missing character) reference to output text.
#
# The id is the element's ref attribute without its first character. Lookup
# order:
#
# * ids starting with SD or RJ (Siddham / Ranjana): the record's 'symbol',
#   else 'romanized', else 'pua';
# * otherwise the first non-empty value among 'uni_char', 'norm_uni_char'
#   and 'norm_big5_char';
# * otherwise a Unicode PUA character at 0xF0000 + the number that follows
#   the two-letter id prefix.
#
# A record that lacks one of the three *_char keys is treated the same as one
# holding an empty string, so lookup falls through instead of raising
# NoMethodError on nil.
#
# Aborts the process when @gaijis has no record for the id.
def handle_g(e)
  gid = e['ref'][1..-1]
  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  # Siddham or Ranjana
  if gid.start_with?('SD', 'RJ')
    return g['symbol'] if g.key?('symbol')
    return g['romanized'] if g.key?('romanized')
    return g['pua']
  end

  %w[uni_char norm_uni_char norm_big5_char].each do |key|
    value = g[key]
    return value unless value.nil? || value.empty?
  end

  # Unicode PUA
  [0xf0000 + gid[2..-1].to_i].pack 'U'
end
handle_item(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 147
# Render an <item>; when it carries an n attribute, that value is placed in
# front of the item's content.
def handle_item(e)
  body = traverse(e)
  e.key?('n') ? e['n'] + body : body
end
handle_lb(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 155
# Render a line break as an <a id="lb..."> anchor and remember its number in
# @lb. Elements with type="old" are ignored. Any text parked in
# @next_line_buf is flushed right after the anchor, followed by a newline.
def handle_lb(e)
  return '' if e['type'] == 'old'

  @lb = e['n']
  anchor = %(<a id="lb#{@lb}"></a>)
  return anchor if @next_line_buf.empty?

  pending = @next_line_buf
  @next_line_buf = ''
  anchor + pending + "\n"
end
handle_lem(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 166
# Render a <lem> reading. In multi-edition mode every 【...】 siglum found in
# the wit attribute is added to @editions and the content is wrapped in an <r>
# marker whose w attribute lists those sigla separated by spaces.
def handle_lem(e)
  inner = traverse(e)
  return inner unless @config[:multi_edition]

  witnesses = e['wit'].scan(/【.*?】/)
  @editions.merge(witnesses)
  "<r w='#{witnesses.join(' ')}'>#{inner}</r>"
end
handle_milestone(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 177
# A milestone with unit="juan" stores its number in @juan and emits a
# "<juan N>" marker (handle_sutra later splits the text on these markers). When
# a line number is already known, the current line anchor is repeated after
# the marker. Other milestones produce nothing.
def handle_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  marker = "<juan #{@juan}>"
  @lb.nil? ? marker : marker + %(<a id="lb#{@lb}"></a>)
end
handle_node(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 187
# Convert one node to its output string.
#
# Comments and elements listed in PASS produce nothing, text nodes go through
# handle_text, and elements are dispatched by tag name. Tags without a
# dedicated handler are rendered by traversing their children.
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?
  return '' if PASS.include?(e.name)

  case e.name
  when 'back', 'graphic', 'mulu', 'reg', 'teiHeader' then ''
  when 'anchor'    then handle_anchor(e)
  when 'corr'      then handle_corr(e)
  when 'foreign'   then handle_foreign(e)
  when 'g'         then handle_g(e)
  when 'item'      then handle_item(e)
  when 'lb'        then handle_lb(e)
  when 'lem'       then handle_lem(e)
  when 'note'      then handle_note(e)
  when 'milestone' then handle_milestone(e)
  when 'rdg'       then handle_rdg(e)
  when 'sic'       then handle_sic(e)
  when 'sg'        then handle_sg(e)
  when 'tt'        then handle_tt(e)
  when 't'         then handle_t(e)
  when 'unclear'   then '▆'
  else traverse(e)
  end
end
handle_note(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 217
# Only notes with place="inline" are output, wrapped in parentheses; every
# other note is dropped.
def handle_note(e)
  inline = e.has_attribute?('place') && e['place'] == 'inline'
  inline ? "(#{traverse(e)})" : ''
end
handle_rdg(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 225
# Render a variant reading (<rdg>). Outside multi-edition mode readings are
# dropped. In multi-edition mode each 【...】 siglum in the wit attribute is
# added to @editions and the content is wrapped in an <r> marker that carries
# the wit attribute verbatim.
def handle_rdg(e)
  return '' unless @config[:multi_edition]

  inner = traverse(e)
  @editions.merge(e['wit'].scan(/【.*?】/))
  "<r w='#{e['wit']}'>#{inner}</r>"
end
handle_sg(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 234
# Render an <sg> element as its content enclosed in parentheses.
def handle_sg(e)
  "(#{traverse(e)})"
end
handle_sic(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 238
# Render a <sic> element. It is only output in multi-edition mode, wrapped in
# an <r> marker attributed to @orig (set by handle_vol).
def handle_sic(e)
  if @config[:multi_edition]
    "<r w='#{@orig}'>#{traverse(e)}</r>"
  else
    ''
  end
end
handle_sutra(xml_fn)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 244
# Convert one XML file: reset per-file state, turn the document into a single
# string, then cut that string at the "<juan N>" markers and write one output
# per juan into a folder named after the sutra, under @out_vol.
#
# Text that precedes the first marker is prepended to the first juan. If the
# text contains no marker at all, nothing is written.
def handle_sutra(xml_fn)
  puts "convert sutra #{xml_fn}"

  # per-file state
  @dila_note     = 0
  @div_count     = 0
  @editions      = Set.new ["【CBETA】"]
  @in_l          = false
  @juan          = 0
  @lg_row_open   = false
  @mod_notes     = Set.new
  @next_line_buf = ''
  @open_divs     = []
  @sutra_no      = File.basename(xml_fn, ".xml")
  @lb            = nil

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple files
  # with a trailing a, b, c... on the file name. Drop that suffix on output.
  split_sutra = @sutra_no.match(/^(T05|T06|T07)n0220/)
  @sutra_no = "#{split_sutra[1]}n0220" if split_sutra

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  juan_no = nil
  leading = ''

  # one file per juan
  chunks = text.split(/(<juan \d+>)/)
  chunks.each do |chunk|
    marker = chunk.match(/<juan (\d+)>$/)
    if marker
      juan_no = marker[1].to_i
    elsif juan_no.nil?
      leading = chunk
    else
      write_juan(juan_no, leading + chunk)
      leading = ''
    end
  end
end
handle_t(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 287
# Render a <t> element (a child of <tt>).
#
# Elements whose place attribute mentions "foot" are dropped. When the
# enclosing <tt> has type "app" the content is returned unchanged. Otherwise
# the position among the sibling <t> elements decides:
# the first is output followed by a full-width space, the second is parked in
# @next_line_buf (handle_lb flushes it after the next line anchor), and any
# further ones are output unchanged.
def handle_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  content = traverse(e)

  # not a two-line parallel text
  return content if @tt_type == 'app'

  # two-line parallel text
  position = e.xpath('../t').index(e)
  if position == 0
    content + ' '
  elsif position == 1
    @next_line_buf += content + ' '
    ''
  else
    content
  end
end
handle_text(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 309
# Render a text node: drop text that is empty after chomp or that sits
# directly inside <app>, strip the stray line breaks found inside CBETA XML
# text, and HTML-escape the result (e.g. & becomes &amp;).
def handle_text(e)
  raw = e.content().chomp
  return '' if raw.empty? || e.parent.name == 'app'

  CGI.escapeHTML(raw.delete("\n\r"))
end
handle_tt(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 321
# Render a <tt> element. Its type attribute is remembered in @tt_type so that
# handle_t can tell how to lay out the child <t> elements.
def handle_tt(e)
  @tt_type = e['type']

  traverse(e)
end
handle_vol(vol)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 326
# Convert one volume.
#
# Looks up the volume's canon (@series) and that canon's siglum (@orig),
# aborting when no siglum is known. The output folder
# <output_root>/<series>/<vol> is deleted and recreated, then every entry of
# the matching source folder is passed to handle_sutra.
def handle_vol(vol)
  puts "convert volumn: #{vol}"

  @vol    = vol
  @series = CBETA.get_canon_from_vol(vol)
  @orig   = @cbeta.get_canon_symbol(@series)
  abort "未處理底本" if @orig.nil?

  # siglum without the surrounding 【】
  @orig_short = @orig.sub(/^【(.*)】$/, '\1')

  @out_vol = File.join(@output_root, @series, vol)
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @series, vol)
  Dir[source+"/*"].each do |xml_fn|
    handle_sutra(xml_fn)
  end
end
handle_vols(v1, v2)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 347
# Convert every volume whose folder name lies between +v1+ and +v2+
# (inclusive, plain string comparison) within the canon that +v1+ belongs to.
#
# Volumes are processed in sorted order and dot entries are skipped
# explicitly, the same way handle_collection walks a collection folder.
#
# @param v1 [String] first volume, e.g. "T05"
# @param v2 [String] last volume, e.g. "T07"
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @series = CBETA.get_canon_from_vol(v1)
  folder = File.join(@xml_root, @series)
  Dir.entries(folder).sort.each do |vol|
    next if vol.start_with?('.')
    next if vol < v1 || vol > v2

    handle_vol(vol)
  end
end
open_xml(fn)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 358
# Read file +fn+ and parse it with Nokogiri, removing namespaces so later
# XPath queries can use bare element names.
#
# @return the parsed document
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces!() }
end
parse_xml(xml_fn)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 365
# Open +xml_fn+ and render the first text/body element under the document
# root to a string.
def parse_xml(xml_fn)
  root = open_xml(xml_fn).root()
  traverse(root.xpath("text/body").first)
end
traverse(e)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 373
# Render every child of +e+ with handle_node and concatenate the results in
# document order.
def traverse(e)
  e.children.inject('') { |out, child| out + handle_node(child) }
end
write_juan(juan_no, txt)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 382
# Write one juan. In multi-edition mode this delegates to
# write_juan_for_editions; otherwise the text goes to
# <@out_sutra>/NNN.html, NNN being the zero-padded juan number.
def write_juan(juan_no, txt)
  return write_juan_for_editions(juan_no, txt) if @config[:multi_edition]

  target = File.join(@out_sutra, format("%03d.html", juan_no))
  write_juan_to_file(target, txt)
end
write_juan_for_editions(juan_no, txt)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 391
# Write one file per edition for a juan, inside <@out_sutra>/NNN/.
#
# +txt+ contains <r w='...'> markers produced by handle_corr, handle_lem,
# handle_rdg and handle_sic. For each edition in @editions the markers that
# name that edition are replaced by their text and all other markers are
# removed.
#
# A marker may list several witnesses (handle_lem joins them with spaces,
# handle_rdg copies the wit attribute verbatim), so the edition is matched by
# containment. Exact equality would drop such readings from every edition.
#
# The file is named after the edition without its 【】; editions other than
# CBETA and the base text are prefixed with "<base>→".
#
# NOTE(review): to_html is not defined in the part of the file shown here;
# presumably it serialises the fragment - confirm.
def write_juan_for_editions(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)

  @editions.each do |ed|
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      if node['w'].to_s.include?(ed)
        node.add_previous_sibling(node.text)
      end
      node.remove
    end
    html = to_html(frag)

    fn = ed.sub(/^【(.*?)】$/, '\1')
    if fn != 'CBETA' && fn != @orig_short
      fn = @orig_short + '→' + fn
    end

    output_path = File.join(folder, "#{fn}.html")
    write_juan_to_file(output_path, html)
  end
end
write_juan_to_file(fn, html)
click to toggle source
# File lib/cbeta/p5a_to_simple_html.rb, line 414
# Wrap +html+ in a minimal HTML page (doctype, a head declaring UTF-8, and a
# body) and write it to +fn+.
def write_juan_to_file(fn, html)
  head_lines = [
    '<!DOCTYPE html>',
    '<html>',
    '<head>',
    ' <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">',
    '</head>'
  ]
  page = head_lines.map { |line| line + "\n" }.join
  page += "<body>#{html}</body></html>"
  File.write(fn, page)
end