module BookClean::Publisher

Public Class Methods

clean(str, lang=:pt) click to toggle source
# File lib/bookclean.rb, line 6
# Normalizes a publisher name: trims and collapses whitespace, lowercases,
# strips company-type suffixes ("ltda", "s.a.", ...) and a leading/trailing
# "editora", drops trailing " ed" / " -" markers, then title-cases the result
# and lowercases the prepositions "da", "de" and "do".
#
# @param str [String, nil] raw publisher name; nil is returned unchanged
# @param lang [Symbol] accepted for interface compatibility; not read by this method
# @return [String, nil] the cleaned name (a new string; the argument is not mutated)
def self.clean(str, lang=:pt)
  return str if str.nil?

  # strip/downcase both return new strings, so the gsub! calls below never
  # touch the caller's object.
  str = UnicodeUtils.downcase(str.strip)
  str.gsub!(/\s+/, ' ')

  # Trailing company-type suffixes.
  str.gsub!(/\s*ltda.?$/, '')
  str.gsub!(/\slv$/, '')
  str.gsub!(/\ss\.a\.$/, '')
  str.gsub!(/\ss\.a$/, '')
  str.gsub!(/\ssa\.$/, '')
  str.gsub!(/\ss\.\sa\.$/, '')
  str.gsub!(/\ss\.\sa$/, '')

  # Names such as "editora da mente" should keep their leading "editora".
  str.gsub!(/^editora/, '') unless str.match(/^editora\s+..\s+/)
  str.gsub!(/editora$/, '')
  str.gsub!('&', ' & ')
  str.gsub!(/\s+/, ' ')

  # Removing a trailing "editora" (or spacing out a trailing "&") leaves a
  # trailing space; trim it so the end-anchored rules below can still match
  # (e.g. "atlas - editora" -> "atlas - " -> "atlas -" -> "atlas").
  str.rstrip!

  # Repair a mis-encoded "á". The first replacement presumably never matches
  # because the string was lowercased above; kept as in the original.
  str.gsub!('Ã?', 'á')
  str.gsub!('ã?', 'á')

  str.gsub!(' ed.', '')
  str.gsub!(/\sed$/, '')
  str.gsub!(/\s-$/, '')

  # TODO: split words into tokens, match each token against a dictionary of
  # accented words, and join the words by a single space.
  str = UnicodeUtils.titlecase(str)

  # Downcase prepositions (conjunctions are not handled yet).
  str.gsub!(" Da ", " da ")
  str.gsub!(" De ", " de ")
  str.gsub!(" Do ", " do ")

  # Restore the cedilla endings "ção" / "ções" when the tilde is missing.
  str.gsub!("çao", "ção")
  str.gsub!("çoes", "ções")

  str.strip
end