class CBETA::P5aToText
Convert CBETA XML P5a to Text
CBETA XML P5a 可由此取得: github.com/cbeta-git/xml-p5a
@example for convert 大正藏第一冊 in app format:
c = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER', format: 'app')
c.convert('T01')
Constants
- PASS
內容不輸出的元素
Public Class Methods
new(xml_root, output_root, opts={})
click to toggle source
@param xml_root [String] 來源 CBETA XML P5a 路徑
@param output_root [String] 輸出 Text 路徑
@option opts [String] :format 輸出格式,例:‘app’,預設是 normal
@option opts [String] :encoding 輸出編碼,預設 ‘UTF-8’
@option opts [String] :gaiji 缺字處理方式,預設 ‘default’
  * 'PUA': 缺字一律使用 Unicode PUA
  * 'default': 優先使用通用字
@option opts [Boolean] :inline_note 是否呈現夾註,預設為 true
# File lib/cbeta/p5a_to_text.rb, line 31
# Set up a converter from CBETA XML P5a to plain text.
#
# @param xml_root [String] root folder of the CBETA XML P5a source
# @param output_root [String] root folder that receives the text output
# @param opts [Hash] settings merged over the defaults
#   (format: nil, encoding: 'UTF-8', gaiji: 'default', inline_note: true)
def initialize(xml_root, output_root, opts={})
  defaults = {
    format: nil,
    encoding: 'UTF-8',
    gaiji: 'default',
    inline_note: true
  }
  @xml_root = xml_root
  @output_root = output_root
  # Caller-supplied options win over the defaults.
  @settings = defaults.merge(opts)
  @cbeta = CBETA.new
  @gaijis = CBETA::Gaiji.new
end
Public Instance Methods
convert(target=nil)
click to toggle source
將 CBETA XML P5a 轉為 Text
@example for convert all:
x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert
@example for convert 大正藏第一冊:
x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert('T01')
@example for convert 大正藏全部:
x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert('T')
@example for convert 大正藏第五冊至第七冊:
x2h = CBETA::P5aToText.new('/PATH/TO/CBETA/XML/P5a', '/OUTPUT/FOLDER')
x2h.convert('T05..T07')
T 是大正藏的 ID,CBETA 的藏經 ID 系統請參考: www.cbeta.org/format/id.php
# File lib/cbeta/p5a_to_text.rb, line 70
# Convert everything, one canon, one volume, or a range of volumes.
#
# @param target [String, nil] nil converts everything; a value of at most
#   two characters is treated as a canon ID; a value containing '..' is
#   treated as a volume range; anything else is a single volume.
#   The value is upcased before dispatch.
def convert(target=nil)
  return convert_all if target.nil?

  arg = target.upcase
  return handle_canon(arg) if arg.size <= 2
  return handle_vol(arg) unless arg.include?('..')

  # A range that does not look like "FROM..TO" is ignored (match yields nil).
  arg.match(/^([^\.]+?)\.\.([^\.]+)$/) { |m| handle_vols(m[1], m[2]) }
end
Private Instance Methods
appify(text)
click to toggle source
跨行字詞移到下一行
# File lib/cbeta/p5a_to_text.rb, line 90
# Move a word that straddles a line break onto the following line.
#
# Only lines containing '║' are kept. Each output line is
# "<prefix>(NN)║<carried><kept>", where NN is the number of characters
# carried over from the previous line. Scanning backwards from the end of a
# line, characters are carried to the next line until a tab, a closing
# punctuation mark (which stays on the current line) or an opening
# punctuation mark (which is carried as well) is reached. Tabs are removed
# from the kept text.
#
# Anything still being carried after the final line has no following line
# to land on, so it is appended back to the final line instead of being
# dropped.
#
# @param text [String] text whose lines look like "<line id>║<content>"
# @return [String] the re-flowed text
def appify(text)
  closers = ' :》」』、;,!?。'
  openers = '《「『'
  out = ''.dup
  carried = ''

  text.each_line do |raw|
    m = raw.chomp.match(/^(.*)║(.*)$/)
    next unless m

    out << m[1] << ("(%02d)" % carried.size) << '║' << carried

    chars = m[2].chars
    carried = ''
    until chars.empty?
      c = chars.pop
      break if c == "\t"
      if closers.include?(c)
        # Closing punctuation ends the scan and stays on this line.
        chars << c
        break
      end
      carried = c + carried
      # Opening punctuation moves to the next line and ends the scan.
      break if openers.include?(c)
    end
    out << chars.join.delete("\t") << "\n"
  end

  # No next line exists for the last fragment: keep it on the final line.
  unless carried.empty?
    out.chomp!
    out << carried << "\n"
  end
  out
end
convert_all()
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 125
# Convert every entry directly under @xml_root whose name fully matches
# CBETA::CANON, in sorted order.
def convert_all
  canon_pattern = /^#{CBETA::CANON}$/
  Dir.entries(@xml_root).sort.each do |entry|
    handle_canon(entry) if entry.match(canon_pattern)
  end
end
e_anchor(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 142
# Render an <anchor>: a '◎' when its type attribute is 'circle',
# otherwise nothing.
#
# @param e [#has_attribute?, #[]] the anchor node
# @return [String]
def e_anchor(e)
  circle = e.has_attribute?('type') && e['type'] == 'circle'
  circle ? '◎' : ''
end
e_app(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 152
# Render an <app> element: the concatenated output of its children.
#
# @return [String]
def e_app(e)
  traverse(e)
end
e_byline(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 156
# Render a <byline>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_byline(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_cell(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 162
# Render a table <cell>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_cell(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_corr(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 168
# Render a <corr>: wrap its children in an <r> marker tagged with the
# 【CBETA】 witness, so write_juan can keep or drop it per edition.
#
# @return [String]
def e_corr(e)
  corrected = traverse(e)
  "<r w='【CBETA】'>#{corrected}</r>"
end
e_div(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 172
# Render a <div>: the concatenated output of its children.
#
# @return [String]
def e_div(e)
  traverse(e)
end
e_docNumber(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 176
# Render a <docNumber>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_docNumber(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_figure(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 182
# Render a <figure>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_figure(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_foreign(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 188
# Render a <foreign>: nothing when its place attribute contains 'foot',
# otherwise the output of its children.
#
# @return [String]
def e_foreign(e)
  in_footer = e.key?('place') && e['place'].include?('foot')
  in_footer ? '' : traverse(e)
end
e_g(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 193
# Render a <g> (gaiji, a character missing from the base character set).
#
# With gaiji: 'PUA' every such character becomes a Unicode PUA code point.
# Otherwise the order of preference is:
#   1. Siddham / Ranjana glyphs -> PUA (SD-E35A and SD-E35B become
#      plain parentheses)
#   2. a direct unicode mapping
#   3. normal_unicode
#   4. the normalized form
#   5. Unicode PUA
#
# Aborts the process when the gaiji ID has no entry in @gaijis
# (non-PUA mode only).
#
# @return [String]
def e_g(e)
  gid = e['ref'][1..-1] # drop the first character of the ref attribute
  siddham = gid.start_with?('SD')
  ranjana = gid.start_with?('RJ')

  if @settings[:gaiji] == 'PUA'
    return CBETA.siddham_pua(gid) if siddham
    return CBETA.ranjana_pua(gid) if ranjana
    return CBETA.pua(gid)
  end

  g = @gaijis[gid]
  abort "Line:#{__LINE__} 無缺字資料:#{gid}" if g.nil?

  if siddham
    return '(' if gid == 'SD-E35A'
    return ')' if gid == 'SD-E35B'
    return CBETA.siddham_pua(gid)
  end
  return CBETA.ranjana_pua(gid) if ranjana

  # NOTE(review): presence is tested on 'unicode' but the value is read
  # from 'unicode-char' -- confirm both keys always occur together.
  if g.has_key?('unicode')
    g['unicode-char']
  elsif g.has_key?('normal_unicode')
    g['normal_unicode']
  elsif g.has_key?('normal')
    g['normal']
  else
    # Unicode PUA fallback
    [0xf0000 + gid[2..-1].to_i].pack('U')
  end
end
e_graphic(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 238
# Render a <graphic>: graphics produce no text output.
#
# @return [String] always the empty string
def e_graphic(e)
  ''
end
e_head(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 242
# Render a <head>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_head(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_item(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 248
# Render a list <item>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_item(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_juan(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 253
# Render a <juan>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_juan(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_l(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 259
# Render a verse line <l>: its children followed by a tab in 'app' format.
# In other formats a newline is appended unless @lg_type is 'abnormal'.
#
# NOTE(review): @lg_type is not assigned in any method shown alongside this
# one -- confirm where (or whether) it is set.
#
# @return [String]
def e_l(e)
  verse = traverse(e)
  return verse + "\t" if @settings[:format] == 'app'
  return verse if @lg_type == 'abnormal'

  verse + "\n"
end
e_lb(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 269
# Render a line break <lb>.
#
# Breaks with type 'old' are ignored. In 'app' format a new line is started
# as "\n<n>║". Any text waiting in @next_line_buf is flushed after that,
# followed by a newline, and the buffer is emptied.
#
# @param e [#[]] the lb node (reads 'type' and 'n')
# @return [String]
def e_lb(e)
  return '' if e['type'] == 'old'

  marker = @settings[:format] == 'app' ? "\n#{e['n']}║" : ''
  return marker if @next_line_buf.empty?

  pending = @next_line_buf
  @next_line_buf = ''
  marker + pending + "\n"
end
e_lem(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 282
# Render a <lem> (lemma) as an <r> marker.
#
# Editions that have no <rdg> of their own read the same as the lemma, so
# the marker is tagged with every edition in @editions except those named
# in the wit attribute of the following sibling <rdg> elements.
#
# @return [String]
def e_lem(e)
  dissenting = e.xpath('./following-sibling::rdg').flat_map do |rdg|
    rdg['wit'].scan(/【.*?】/)
  end
  agreeing = Set.new(@editions).subtract(dissenting)
  witnesses = agreeing.to_a.join(' ')
  "<r w='#{witnesses}'>%s</r>" % traverse(e)
end
e_lg(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 295
# Render a verse group <lg>: the concatenated output of its children.
#
# @return [String]
def e_lg(e)
  traverse(e)
end
e_list(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 299
# Render a <list>: its children, preceded by a newline unless the output
# format is 'app'.
#
# @return [String]
def e_list(e)
  lead = @settings[:format] == 'app' ? '' : "\n"
  lead + traverse(e)
end
e_milestone(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 305
# Render a <milestone>.
#
# A milestone with unit 'juan' records its number in @juan and emits a
# "<juan N>" marker, which handle_sutra later splits the text on.
# Any other milestone produces nothing.
#
# @param e [#[]] the milestone node (reads 'unit' and 'n')
# @return [String]
def e_milestone(e)
  return '' unless e['unit'] == 'juan'

  @juan = e['n'].to_i
  "<juan #{@juan}>"
end
e_mulu(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 314
# Render a <mulu> element: it produces no text output.
#
# @return [String] always the empty string
def e_mulu(e)
  ''
end
e_note(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 318
# Render a <note>.
#
# Only notes whose place attribute is exactly 'inline' are shown, wrapped
# in parentheses, and only when the :inline_note setting is truthy.
# Every other note produces nothing.
#
# @return [String]
def e_note(e)
  shown = @settings[:inline_note] &&
          e.has_attribute?('place') &&
          e['place'] == 'inline'
  return '' unless shown

  "(#{traverse(e)})"
end
e_p(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 328
# Render a paragraph <p>: its children followed by a tab in 'app' format,
# or by a newline in any other format.
#
# @return [String]
def e_p(e)
  body = traverse(e)
  terminator = @settings[:format] == 'app' ? "\t" : "\n"
  body + terminator
end
e_rdg(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 334
# Render a variant reading <rdg> as an <r> marker tagged with the
# element's own wit attribute.
#
# @return [String]
def e_rdg(e)
  opening = "<r w='#{e['wit']}'>"
  format("#{opening}%s</r>", traverse(e))
end
e_row(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 338
# Render a table <row>: the concatenated output of its children.
#
# @return [String]
def e_row(e)
  traverse(e)
end
e_sg(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 342
# Render an <sg> element: its children wrapped in parentheses.
#
# @return [String]
def e_sg(e)
  inner = traverse(e)
  '(' + inner + ')'
end
e_sic(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 346
# Render a <sic> as an <r> marker tagged with @orig, the symbol of the
# base edition set by handle_vol.
#
# @return [String]
def e_sic(e)
  opening = "<r w='#{@orig}'>"
  opening + traverse(e) + "</r>"
end
e_t(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 350
# Render a <t> inside a <tt>.
#
# A <t> whose place attribute contains 'foot' produces nothing. When the
# enclosing <tt> has type 'app' the text is returned as is (not a two-line
# parallel layout). Otherwise the layout is a two-line parallel one: the
# first <t> is emitted in place with a trailing pad, the second is queued
# in @next_line_buf so that e_lb can flush it onto the next line, and any
# further <t> is emitted unchanged.
#
# @return [String]
def e_t(e)
  return '' if e.has_attribute?('place') && e['place'].include?('foot')

  text = traverse(e)
  return text if @tt_type == 'app'

  # Position of this <t> among its <t> siblings.
  case e.xpath('../t').index(e)
  when 0
    text + ' '
  when 1
    @next_line_buf += text + ' '
    ''
  else
    text
  end
end
e_table(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 372
# Render a <table>: the concatenated output of its children.
#
# @return [String]
def e_table(e)
  traverse(e)
end
e_tt(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 483
# Render a <tt>: remember its type attribute in @tt_type (consulted by
# e_t) and then render its children.
#
# @return [String]
def e_tt(e)
  @tt_type = e['type']
  traverse(e)
end
get_editions(doc)
click to toggle source
取得所有對校版本
# File lib/cbeta/p5a_to_text.rb, line 133
# Collect every collated edition mentioned in the document.
#
# The result always contains at least the base edition (@orig) and
# 【CBETA】; every 【...】 witness found in the wit attribute of any <lem>
# or <rdg> is added to it.
#
# @param doc the parsed XML document (must respond to #xpath)
# @return [Set<String>]
def get_editions(doc)
  seed = Set.new([@orig, "【CBETA】"])
  doc.xpath('//lem|//rdg').each_with_object(seed) do |node, found|
    found.merge(node['wit'].scan(/【.*?】/))
  end
end
handle_canon(c)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 376
# Convert every volume folder of one canon, in sorted order.
# Entries whose name starts with '.' are skipped.
#
# @param c [String] canon ID, used as the folder name under @xml_root
def handle_canon(c)
  @canon = c
  puts 'handle_canon ' + c
  canon_dir = File.join(@xml_root, @canon)
  Dir.entries(canon_dir).sort.each do |entry|
    handle_vol(entry) unless entry.start_with?('.')
  end
end
handle_node(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 386
# Convert one XML node to text by dispatching on its kind and name.
#
# Comments and elements listed in PASS produce nothing; text nodes go
# through handle_text; known elements go to their e_* handler; any other
# element is rendered as the concatenation of its children.
#
# @return [String]
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?
  return '' if PASS.include?(e.name)

  case e.name
  # Elements whose content is dropped entirely.
  when 'back', 'reg', 'teiHeader' then ''
  # Unreadable text is shown as a block character.
  when 'unclear'   then '▆'
  when 'anchor'    then e_anchor(e)
  when 'app'       then e_app(e)
  when 'byline'    then e_byline(e)
  when 'cell'      then e_cell(e)
  when 'corr'      then e_corr(e)
  when 'div'       then e_div(e)
  when 'docNumber' then e_docNumber(e)
  when 'figure'    then e_figure(e)
  when 'foreign'   then e_foreign(e)
  when 'g'         then e_g(e)
  when 'graphic'   then e_graphic(e)
  when 'head'      then e_head(e)
  when 'item'      then e_item(e)
  when 'juan'      then e_juan(e)
  when 'l'         then e_l(e)
  when 'lb'        then e_lb(e)
  when 'lem'       then e_lem(e)
  when 'lg'        then e_lg(e)
  when 'list'      then e_list(e)
  when 'milestone' then e_milestone(e)
  when 'mulu'      then e_mulu(e)
  when 'note'      then e_note(e)
  when 'p'         then e_p(e)
  when 'rdg'       then e_rdg(e)
  when 'row'       then e_row(e)
  when 'sg'        then e_sg(e)
  when 'sic'       then e_sic(e)
  when 't'         then e_t(e)
  when 'table'     then e_table(e)
  when 'tt'        then e_tt(e)
  else traverse(e)
  end
end
handle_sutra(xml_fn)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 430
# Convert one sutra XML file and write one output folder per juan.
#
# Resets the per-sutra state, converts the file to marked-up text with
# parse_xml, then splits that text on the "<juan N>" markers and hands each
# juan to write_juan. Text that precedes the first marker is prepended to
# the first juan.
#
# @param xml_fn [String] path of the sutra XML file
def handle_sutra(xml_fn)
  @dila_note = 0
  @div_count = 0
  @in_l = false
  @juan = 0
  @lg_row_open = false
  @mod_notes = Set.new
  @next_line_buf = ''
  @open_divs = []
  @sutra_no = File.basename(xml_fn, ".xml")

  text = parse_xml(xml_fn)

  # Taisho No. 220 spans several volumes; CBETA splits it into multiple
  # files with a trailing a, b, c, ... Drop that suffix for the output.
  split_file = @sutra_no.match(/^(T05|T06|T07)n0220/)
  @sutra_no = "#{split_file[1]}n0220" if split_file

  @out_sutra = File.join(@out_vol, @sutra_no)
  FileUtils.makedirs @out_sutra

  juan_no = nil
  preamble = ''
  # One output per juan. The capture group keeps the markers in the list.
  text.split(/(<juan \d+>)/).each do |piece|
    marker = piece.match(/<juan (\d+)>$/)
    if marker
      juan_no = marker[1].to_i
    elsif juan_no.nil?
      preamble = piece
    else
      write_juan(juan_no, preamble + piece)
      preamble = ''
    end
  end
end
handle_text(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 471
# Convert a text node.
#
# Text that is empty after chomping, or that sits directly inside an <app>
# element, produces nothing. Otherwise line breaks are removed (the CBETA
# XML has superfluous newlines between characters) and the result is
# HTML-escaped, e.g. '&' becomes '&amp;', because the text is later parsed
# as an XML fragment by write_juan.
#
# @param e [#content, #parent] the text node
# @return [String]
def handle_text(e)
  raw = e.content.chomp
  return '' if raw.empty? || e.parent.name == 'app'

  CGI.escapeHTML(raw.delete("\n\r"))
end
handle_vol(vol)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 488
# Convert every sutra file of one volume.
#
# Derives the canon from the volume ID, looks up the base-edition symbol
# (aborting the process when none is found), recreates the volume's output
# folder from scratch, and converts each entry of the source folder in
# sorted order, skipping names that start with '.'.
#
# @param vol [String] volume ID, e.g. 'T01'
def handle_vol(vol)
  print vol + ' '

  @canon = CBETA.get_canon_from_vol(vol)
  @orig = @cbeta.get_canon_symbol(@canon)
  abort "未處理底本" if @orig.nil?

  @vol = vol
  @out_vol = File.join(@output_root, @canon, vol)
  # Start from an empty output folder (force = true ignores a missing one).
  FileUtils.remove_dir(@out_vol, true)
  FileUtils.makedirs @out_vol

  source = File.join(@xml_root, @canon, vol)
  Dir.entries(source).sort.each do |entry|
    handle_sutra(File.join(source, entry)) unless entry.start_with?('.')
  end
end
handle_vols(v1, v2)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 508
# Convert an inclusive range of volumes within one canon.
#
# The canon is derived from the first volume ID. Every entry of the canon
# folder whose name sorts between v1 and v2 (string comparison, inclusive)
# is converted with handle_vol, in sorted order.
#
# The canon lookup goes through CBETA.get_canon_from_vol, the same call
# handle_vol uses; this class has no instance method of that name.
#
# @param v1 [String] first volume ID, e.g. 'T05'
# @param v2 [String] last volume ID, e.g. 'T07'
def handle_vols(v1, v2)
  puts "convert volumns: #{v1}..#{v2}"
  @canon = CBETA.get_canon_from_vol(v1)
  folder = File.join(@xml_root, @canon)
  Dir.entries(folder).sort.each do |vol|
    next if vol < v1 || vol > v2

    handle_vol(vol)
  end
end
open_xml(fn)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 519
# Read and parse an XML file, stripping namespaces so that later XPath
# queries can use bare element names.
#
# @param fn [String] path of the XML file
# @return the parsed Nokogiri document
def open_xml(fn)
  Nokogiri::XML(File.read(fn)).tap { |doc| doc.remove_namespaces! }
end
parse_xml(xml_fn)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 526
# Parse one sutra file and convert its body to marked-up text.
#
# Also stores the set of collated editions found in the document in
# @editions, which e_lem and write_juan read.
#
# @param xml_fn [String] path of the sutra XML file
# @return [String] the converted text of text/body
def parse_xml(xml_fn)
  doc = open_xml(xml_fn)
  @editions = get_editions(doc)
  body = doc.root.xpath("text/body").first
  traverse(body)
end
traverse(e)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 536
# Convert all children of a node and concatenate the results in order.
#
# When a child handler returns nil, a diagnostic naming the node is
# printed before the concatenation is attempted.
#
# @param e [#children] the parent node
# @return [String]
def traverse(e)
  e.children.reduce('') do |acc, child|
    piece = handle_node(child)
    puts "handle_node return nil, node: " + child.to_s if piece.nil?
    acc + piece
  end
end
write_juan(juan_no, txt)
click to toggle source
# File lib/cbeta/p5a_to_text.rb, line 546
# Write one juan to disk, one text file per collated edition.
#
# The marked-up text contains <r w='...'> elements, each listing the
# editions that carry that reading. For every edition in @editions the
# text is parsed afresh; each <r> is replaced by its content when its w
# attribute includes the edition and simply removed otherwise. In 'app'
# format the result is additionally passed through appify.
#
# File names inside "<@out_sutra>/<NNN>/":
#   * base edition:   "<name>-orig.txt"
#   * CBETA edition:  "CBETA.txt"
#   * other editions: "<base name>→<name>.txt"
#
# The file is written through the block form of File.open so the handle is
# closed even when the write raises.
#
# @param juan_no [Integer] juan number, zero-padded to three digits
# @param txt [String] marked-up text of the juan
def write_juan(juan_no, txt)
  folder = File.join(@out_sutra, "%03d" % juan_no)
  FileUtils.makedirs(folder)

  @editions.each do |ed|
    # Parse per edition: the fragment is mutated below.
    frag = Nokogiri::XML.fragment(txt)
    frag.search("r").each do |node|
      node.add_previous_sibling(node.inner_html) if node['w'].include?(ed)
      node.remove
    end

    text = frag.content
    text = appify(text) if @settings[:format] == 'app'

    # Strip the surrounding 【 】 to get the bare edition name.
    ed2 = ed.sub(/^【(.*?)】$/, '\1')
    if ed == @orig
      fn = "#{ed2}-orig.txt"
    else
      unless ed2 == 'CBETA'
        ed2 = @orig.sub(/^【(.*?)】$/, '\1') + '→' + ed2
      end
      fn = "#{ed2}.txt"
    end

    output_path = File.join(folder, fn)
    File.open(output_path, 'w', encoding: @settings[:encoding]) do |fo|
      fo.write(text)
    end
  end
end