class CBETA::HTMLToText

CBETA HTML 轉為 純文字(含行首資訊)

Example:

h2t = CBETA::HTMLToText.new('/temp/cbeta-html', '/temp/cbeta-text')
h2t.convert("T01")  # 轉換大正藏第一冊

Public Class Methods

new(html_root, out_root) click to toggle source

@param html_root [String] 來源 HTML 路徑
@param out_root [String] 輸出路徑

# File lib/cbeta/html_to_text.rb, line 13
# Remembers the two directory roots used by later conversions.
#
# @param html_root [String] path of the source HTML tree
# @param out_root [String] path under which text output is written
def initialize(html_root, out_root)
  @html_root, @out_root = html_root, out_root
end

Public Instance Methods

convert(arg) click to toggle source

@param arg [String] 要執行轉換的冊數

@example

convert("T01")
# File lib/cbeta/html_to_text.rb, line 21
# Converts one volume.
#
# The argument is upper-cased to form the volume id, and the first
# character of that id is used as the corpus id. Both are stored in instance
# state before the volume is processed by handle_vol.
#
# @param arg [String] volume to convert, e.g. "T01"
# @return whatever handle_vol returns
def convert(arg)
  @dirty = false
  volume = arg.upcase
  @vol, @corpus = volume, volume[0]
  handle_vol
end

Private Instance Methods

handle_file(path) click to toggle source
# File lib/cbeta/html_to_text.rb, line 111
# Converts one HTML file and appends the resulting text to the output file
# of the sutra it belongs to.
#
# The sutra id is the file's base name (extension removed) with everything
# from the last "_" onward stripped. When the id differs from the one seen
# on the previous call, a new "<sutra>.txt" is opened in @folder_out and the
# previously open output file is closed first, so that at most one output
# handle is open at a time and its buffered data is flushed.
#
# @param path [String] path of the HTML file to convert
# @return the result of writing the text to the output file
def handle_file(path)
  sutra = File.basename(path, '.*')
  sutra.sub!(/^(.*)_.*$/, '\1')
  # The BM version of T0220 is not split into a, b, c... parts.
  sutra.sub!(/(T\d\dn0220).*$/, '\1')

  if sutra != @last_sutra
    # Release the handle of the previous sutra before replacing it.
    @fo.close if @fo && !@fo.closed?

    txt_path = File.join(@folder_out, "#{sutra}.txt")
    puts "h2t #{txt_path}"
    @fo = File.open(txt_path, 'w')
    @last_sutra = sutra
    @dirty = false
  end

  # Block form closes the input even when parsing raises.
  doc = File.open(path) { |f| Nokogiri::HTML(f) }

  # Map each gaijiInfo span's id to its "zzs" attribute for handle_node.
  @gaiji = {}
  doc.css('span.gaijiInfo').each { |e| @gaiji[e['id']] = e['zzs'] }

  text = traverse(doc.root)

  # Siddham characters: collapse runs of placeholders into a single one.
  text.gsub!(/(\((【◇】)+\)|(【◇】)|【◇】)+/, '【◇】')

  @fo.write(text)
end
handle_node(e) click to toggle source
# File lib/cbeta/html_to_text.rb, line 73
# Converts a single DOM node to text, dispatching on the node kind and,
# for elements, on the tag name.
#
# @param e a parsed HTML node responding to comment?, text?, name and []
# @return [String] the text for this node; comments, <head> and the div
#   whose id is "back" yield an empty string
def handle_node(e)
  return '' if e.comment?
  return handle_text(e) if e.text?

  case e.name
  when 'a'
    return traverse(e) unless e['class'] == 'gaijiAnchor'

    # The href is "#<id>"; drop the leading character to get the key.
    # NOTE(review): the lookup yields nil when the id is not in @gaiji,
    # and that nil is returned to the caller as-is.
    @gaiji[e['href'][1..-1]]
  when 'div'
    e['id'] == 'back' ? '' : traverse(e)
  when 'head'
    ''
  when 'p'
    e['class'] == 'figure' ? '【圖】' : traverse(e)
  when 'span'
    handle_span(e)
  else
    traverse(e)
  end
end
handle_span(e) click to toggle source
# File lib/cbeta/html_to_text.rb, line 44
# Converts a <span> element to text according to its class attribute.
#
# @param e a parsed HTML node responding to []
# @return [String] the text for this span; "lineInfo" and "star" spans
#   yield an empty string
def handle_span(e)
  case e['class']
  when 'doube-line-note'
    note = traverse(e)
    # Wrap the note in parentheses unless it already opens with one.
    note.start_with?('(') ? note : "(#{note})"
  when 'lb'
    # Every line marker except the first one seen since @dirty was reset
    # is preceded by a line break.
    line_break = @dirty ? "\n" : ''
    @dirty ||= true
    # Line-head info: rewrite T05n0220a as T05n0220.
    line_break + e['id'].sub(/^(T0\dn0220)[a-z](.*)$/, '\1\2') + '║'
  when 'lineInfo', 'star'
    ''
  when 'ranja', 'siddam'
    '【◇】'
  else
    traverse(e)
  end
end
handle_text(e) click to toggle source
# File lib/cbeta/html_to_text.rb, line 38
# Returns the content of a text node with one trailing line break removed
# and every character listed in the pattern below (line breaks and a set of
# punctuation marks) stripped out.
#
# @param e a text node responding to content
# @return [String] the cleaned text, possibly empty
def handle_text(e)
  content = e.content.chomp
  content.empty? ? '' : content.gsub(/[\n,、—!。:「]/, '')
end
handle_vol() click to toggle source
# File lib/cbeta/html_to_text.rb, line 142
# Converts every file of the current volume.
#
# Input files are read from <html_root>/<corpus>/<vol> and processed in
# sorted path order. handle_file starts a new output file whenever the
# sutra id changes, so files belonging to one sutra must be adjacent;
# sorting guarantees that on Ruby versions whose glob results are not
# ordered. The last output file is closed when the volume is finished,
# including when a file fails to convert.
#
# @return [Array<String>] the sorted list of processed paths
def handle_vol
  folder_in = File.join(@html_root, @corpus, @vol)
  @folder_out = prepare_folder
  @last_sutra = ''
  Dir["#{folder_in}/*"].sort.each { |f| handle_file(f) }
ensure
  # handle_file leaves the most recent output file open; flush and close it.
  @fo.close if @fo && !@fo.closed?
end
prepare_folder() click to toggle source
# File lib/cbeta/html_to_text.rb, line 104
# Creates a fresh, empty output folder for the current volume at
# <out_root>/<corpus>/<vol>. Any existing folder at that path is removed
# first (with the force flag set), then the folder is created again.
#
# @return [String] path of the output folder
def prepare_folder
  File.join(@out_root, @corpus, @vol).tap do |target|
    FileUtils.remove_dir(target, true)
    FileUtils.mkdir_p(target)
  end
end
traverse(e) click to toggle source
# File lib/cbeta/html_to_text.rb, line 30
# Converts all children of a node, in order, and joins the results.
# Every space character is removed from the joined text.
#
# @param e a parsed HTML node responding to children
# @return [String] concatenated text of the children without spaces
def traverse(e)
  joined = e.children.inject('') { |acc, child| acc + handle_node(child) }
  joined.gsub(' ', '')
end