class ReverseAdoc::Cleaner

Public Instance Methods

clean_headings(string) click to toggle source

The following method was added by me.

# File lib/reverse_adoc/cleaner.rb, line 82
# Cleans up heading markup in +string+, modifying the string in place.
#
# Empty headings (<hN ...></hN>) are replaced by a single space, and headings
# whose attributes contain style="vertical-align: super; are rewritten as
# <sup> elements that keep the heading's content.
#
# Returns the same (mutated) string object.
def clean_headings(string)
  # Libre Office inserts empty headings for no apparent reason; they need to go.
  empty_heading = %r{<h([1-9])[^>]*></h\1>}
  # Libre Office also renders superscripts as headings.
  superscript_heading = %r{<h([1-9])[^>]* style="vertical-align: super;[^>]*>(.+?)</h\1>}

  string.gsub!(empty_heading, " ")
  string.gsub!(superscript_heading, "<sup>\\2</sup>")
  string
end
clean_punctuation_characters(string) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 62
# Removes the single whitespace character that separates a "**", "~~" or "__"
# marker from a directly following punctuation character (one of . ! ? ' ").
#
# Returns a new string; +string+ itself is not modified.
def clean_punctuation_characters(string)
  string.gsub(/(\*\*|~~|__)\s([.!?'"])/) do
    marker = Regexp.last_match(1)
    punctuation = Regexp.last_match(2)
    "#{marker}#{punctuation}"
  end
end
clean_tag_borders(string) click to toggle source

Finds non-asterisk content that is enclosed by two or more asterisks and ensures that only one whitespace character occurs in the border area. The same is done for underscores, tildes and brackets.

# File lib/reverse_adoc/cleaner.rb, line 36
# Tidies the whitespace around emphasis-like spans and bracketed spans.
#
# Spans delimited by two or more "*", "_" or "~" characters are processed in
# that order: the matched text is stripped, one space directly inside the
# opening marker and one directly inside the closing marker are removed, and
# the surrounding whitespace is restored by preserve_border_whitespaces, using
# ReverseAdoc.config.tag_border as the default border. Bracketed spans
# ("[...]") are handled last, without a default border.
#
# Returns a new string.
def clean_tag_borders(string)
  # Insertion order is significant: asterisks, then underscores, then tildes.
  emphasis_patterns = {
    "**" => /\s?\*{2,}.*?\*{2,}\s?/,
    "__" => /\s?_{2,}.*?_{2,}\s?/,
    "~~" => /\s?~{2,}.*?~{2,}\s?/,
  }

  tidied = emphasis_patterns.reduce(string) do |text, (marker, pattern)|
    text.gsub(pattern) do |match|
      preserve_border_whitespaces(match, default_border: ReverseAdoc.config.tag_border) do
        match.strip.sub("#{marker} ", marker).sub(" #{marker}", marker)
      end
    end
  end

  tidied.gsub(/\s?\[.*?\]\s?/) do |match|
    preserve_border_whitespaces(match) do
      match.strip.sub("[ ", "[").sub(" ]", "]")
    end
  end
end
preprocess_word_html(string) click to toggle source

Preprocesses the HTML, rather than postprocessing it.

# File lib/reverse_adoc/cleaner.rb, line 67
# Prepares HTML for conversion: works on a duplicate of +string+, scrubs its
# whitespace with scrub_whitespace and then cleans its headings with
# clean_headings. The caller's string is left untouched because only the
# duplicate is handed to those helpers.
def preprocess_word_html(string)
  working_copy = string.dup
  scrubbed = scrub_whitespace(working_copy)
  clean_headings(scrubbed)
end
remove_inner_whitespaces(string) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 19
# Normalizes whitespace inside each line of +string+.
#
# First, "stem:[...]" macros are tidied:
# * a single space between a newline and "stem:[" is removed;
# * a newline right after a stem macro, when followed by a non-whitespace
#   character, becomes a single space;
# * whitespace between a stem macro and a following "^" or "-" is removed.
#
# Then every line is stripped and runs of two or more spaces/tabs inside it
# are collapsed to one space, while the line's border whitespace is restored
# by preserve_border_whitespaces.
#
# Returns a new string. The argument is never modified, so frozen strings are
# accepted. A nil argument yields an empty string: the original guarded the
# stem substitutions against nil but then raised on nil.each_line.
def remove_inner_whitespaces(string)
  return "" if string.nil?

  text = string.gsub(/\n stem:\[/, "\nstem:[")
  text = text.gsub(/(stem:\[([^\]]|\\\])*\])\n(?=\S)/, "\\1 ")
  text = text.gsub(/(stem:\[([^\]]|\\\])*\])\s+(?=[\^-])/, "\\1")

  # Build the result with a single join instead of repeated String#+.
  cleaned_lines = text.each_line.map do |line|
    preserve_border_whitespaces(line) do
      line.strip.gsub(/[ \t]{2,}/, " ")
    end
  end
  cleaned_lines.join
end
remove_leading_newlines(string) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 15
# Drops every newline character at the very beginning of +string+.
#
# Returns a new string; +string+ itself is not modified.
def remove_leading_newlines(string)
  # The pattern is anchored with \A, so a single substitution is sufficient.
  string.sub(/\A\n+/, "")
end
remove_newlines(string) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 11
# Collapses every run of three or more consecutive newlines in +string+ into
# exactly two newlines.
#
# Returns a new string; +string+ itself is not modified.
def remove_newlines(string)
  paragraph_break = "\n\n"
  string.gsub(/\n{3,}/) { paragraph_break }
end
scrub_whitespace(string) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 71
# Normalizes whitespace in +string+, modifying the string in place.
#
# The substitutions run in the listed order. Returns the same (mutated)
# string object.
def scrub_whitespace(string)
  # Each entry is [mutating String method, pattern, replacement].
  steps = [
    # HTML encoded spaces: every spelling becomes the &#xA0; entity
    [:gsub!, /&nbsp;|&#xA0;|\u00a0/i, "&#xA0;"],
    # document leading whitespace
    [:sub!, /^\A[[:space:]]+/m, ""],
    # document trailing whitespace
    [:sub!, /[[:space:]]+\z$/m, ""],
    # line trailing whitespace: a run of spaces at a line end becomes one space
    [:gsub!, /( +)$/, " "],
    # quadruple line breaks become double line breaks
    [:gsub!, /\n\n\n\n/, "\n\n"],
  ]

  steps.each do |substitution, pattern, replacement|
    string.public_send(substitution, pattern, replacement)
  end
  string
end
tidy(string) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 3
# Runs the complete clean-up pipeline over +string+ and returns the result.
#
# Each step receives the output of the previous one, in this order:
# remove_inner_whitespaces, remove_newlines, remove_leading_newlines,
# clean_tag_borders, clean_punctuation_characters.
def tidy(string)
  pipeline = %i[
    remove_inner_whitespaces
    remove_newlines
    remove_leading_newlines
    clean_tag_borders
    clean_punctuation_characters
  ]

  pipeline.reduce(string) { |text, step| send(step, text) }
end

Private Instance Methods

present_or_default(string, default) click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 106
# Returns +default+ when +string+ is nil or empty; otherwise returns +string+
# itself. A whitespace-only string is not empty and is returned unchanged.
def present_or_default(string, default)
  return default if string.nil? || string.empty?

  string
end
preserve_border_whitespaces(string, options = {}) { || ... } click to toggle source
# File lib/reverse_adoc/cleaner.rb, line 93
# Wraps the block's result in the leading and trailing whitespace of +string+.
#
# A string that is empty or consists only of whitespace is returned as is,
# without calling the block. Otherwise the leading and trailing whitespace of
# +string+ are captured; where either one is empty, the :default_border option
# (default "") is used in its place. The border default is forced to "" when
# +string+ contains any of the characters [ ] ( ), since it may be part of a
# link and must not gain extra spaces.
#
# Returns leading border + block result + trailing border.
def preserve_border_whitespaces(string, options = {})
  return string if /\A\s*\Z/.match?(string)

  fallback_border = options.fetch(:default_border, "")
  # Possible link fragment: never add extra spaces around it.
  fallback_border = "" if /[\[(\])]/.match?(string)

  leading  = present_or_default(string[/\A\s*/], fallback_border)
  trailing = present_or_default(string[/\s*\Z/], fallback_border)

  leading + yield + trailing
end