class CBETA::BMToText
將 CBETA Basic Markup 格式檔轉為純文字(含行首資訊)。
CBETA Basic Markup 格式可由此取得: github.com/mahawu/BM_u8
Example:
bm2t = CBETA::BMToText.new('/temp/cbeta-bm', '/temp/cbeta-text1')
bm2t.convert('T01') # 執行大正藏第一冊
Public Class Methods
new(bm_root, out_root)
click to toggle source
@param bm_root [String] 來源 CBETA Basic Markup 檔案路徑
@param out_root [String] 輸出路徑
# File lib/cbeta/bm_to_text.rb, line 15
# Remembers where the Basic Markup sources live and where output goes.
#
# @param bm_root [String] root folder of the CBETA Basic Markup source files
# @param out_root [String] root folder that receives the generated text files
def initialize(bm_root, out_root)
  @bm_root, @out_root = bm_root, out_root
end
Public Instance Methods
convert(vol)
click to toggle source
@param vol [String] 要執行的冊號,例如:T01
# File lib/cbeta/bm_to_text.rb, line 21
# Converts a single volume by delegating to handle_vol.
#
# @param vol [String] id of the volume to convert, e.g. "T01"
# @return whatever handle_vol returns
def convert(vol)
  # The corpus id is the leading non-digit run of the volume id, so a
  # multi-letter prefix stays whole ("GA001" -> "GA") instead of being cut to
  # its first character; handle_vol's own line pattern (\D+\d+n) already
  # accepts such prefixes. Ids with no such prefix keep the old vol[0] result.
  # NOTE(review): assumes the BM tree names its corpus folder after the full
  # prefix (e.g. GA/GA001/new.txt) - confirm against the BM_u8 layout.
  @corpus = vol[/\A\D+/] || vol[0]
  handle_vol(vol)
end
Private Instance Methods
handle_vol(vol)
click to toggle source
# File lib/cbeta/bm_to_text.rb, line 36
# Reads <bm_root>/<corpus>/<vol>/new.txt line by line and writes one text file
# per sutra into the folder returned by prepare_folder(vol).
#
# Every input line that matches the BM line pattern yields one output line of
# the form "<sutra>_<line-id>║<cleaned text>". Output lines are separated by
# newlines, with no trailing newline. Lines that do not match are skipped.
# A progress message is printed for each output file, and @sutra is left set
# to the last sutra id seen.
#
# @param vol [String] volume id, e.g. "T01"
# @return [nil]
def handle_vol(vol)
  bm_path = File.join(@bm_root, @corpus, vol, 'new.txt')
  fo = nil
  last_sutra = ''
  dirty = false

  # One unit of BM text: a bracketed group such as [xy], or any single
  # character that is not a bracket. Built once instead of once per line.
  char = '(?:\[[^\]]+\]|[^\[\]])'
  revision_re = Regexp.new("\\[#{char}*>(#{char}*)\\]")

  # File.foreach closes the input when done. The encoding is explicit because
  # the patterns below contain UTF-8 literals and must not depend on the locale.
  File.foreach(bm_path, encoding: 'UTF-8') do |line|
    # Captures: sutra prefix (non-digits, digits, "n", 5 chars), an 8-char
    # line id, then 3 skipped chars, then the text.
    m = line.match(/^(\D+\d+n.{5})(.{8})...(.*)$/)
    next unless m

    @sutra = m[1].chomp('_')
    line_head = "#{@sutra}_#{m[2]}"

    if last_sutra != @sutra
      # Close the previous sutra's file before switching to the next one.
      fo.close if fo && !fo.closed?
      out_path = File.join(prepare_folder(vol), "#{@sutra}.txt")
      puts "bm2t #{out_path}"
      fo = File.open(out_path, 'w:UTF-8')
      dirty = false
      last_sutra = @sutra
    end

    text = strip_bm_markup(m[3], revision_re)

    # Newline goes before every line except the first of each file.
    fo.puts if dirty
    dirty = true
    fo.write("#{line_head}║#{text}")
  end
  nil
ensure
  # Also covers the no-matching-line case (fo is still nil) and errors mid-run.
  fo.close if fo && !fo.closed?
end

# Removes BM markup from the text part of one line and returns the result.
#
# @param text [String] raw text following the line head
# @param revision_re [Regexp] pattern for revisions "[A>B]", capturing B
# @return [String] cleaned text
def strip_bm_markup(text, revision_re)
  text = text.gsub(/<[^>]+>/, '')
  text = text.gsub(/\[\d+[A-Z]?\]/, '') # collation-note markers, e.g. [01], [02A]
  # The literal "[*]" marker plus single-letter codes and spaces. The asterisk
  # must be escaped: unescaped, the pattern deletes every "]" and none of the
  # bracket substitutions below can match.
  # NOTE(review): the listed letters are presumably BM format codes - confirm.
  text = text.gsub(/\[\*\]|A|B|D|I|M|P|Q|R|S|T|W|Z|j|s| /, '')
  text = text.sub('[𪄱鴹>[𪄲鴹;商羊]]', '𪄲鴹') # special case at T39n1799_p0939c06
  # Variant forms "[x;y]": keep the first form.
  text = text.gsub(/\[([^; ]*);[^\] ]*\]/, '\1')
  # Revisions "[A>B]": keep B.
  text = text.gsub(revision_re, '\1')
  # Siddham placeholders: collapse any run into a single marker.
  text = text.gsub(/((【◇】)|\(【◇】\)|【◇】|(◇)|◇)+/, '【◇】')
  text = text.gsub('((?))', '(?)')
  # Drop punctuation that is not compared; bracketed groups pass through.
  text.gsub(/(\[[^\[\]]+\]|[,、—!。:「])/) { |s| s.size > 1 ? s : '' }
end
prepare_folder(vol)
click to toggle source
# File lib/cbeta/bm_to_text.rb, line 28
# Builds the output folder path <out_root>/<corpus>/<vol>, creating the folder
# (and any missing parents) when it does not exist yet.
#
# @param vol [String] volume id, e.g. "T01"
# @return [String] path of the output folder
def prepare_folder(vol)
  File.join(@out_root, @corpus, vol).tap do |dir|
    FileUtils.mkdir_p(dir) unless Dir.exist?(dir)
  end
end