module TableData::Detection

Constants

DiacritsMac
DiacritsWin
UmlautsMac
UmlautsWin
UnlikelyCharsIso8859_1
UnlikelyCharsMacRoman
UnlikelyCharsWin1252

Public Instance Methods

file_type_from_path(path) click to toggle source
# File lib/tabledata/detection.rb, line 67
# Derives the table file type from the extension at the end of +path+.
#
# Returns :csv, :xls or :xlsx.
# Raises InvalidFileType if the path ends in none of those extensions.
#
# The patterns are anchored with \z (absolute end of string) rather than $,
# because $ also matches right before a newline: a path such as
# "data.csv\nnotes.txt" would otherwise be accepted as a CSV file.
def file_type_from_path(path)
  case path
    when /\.csv\z/ then :csv
    when /\.xls\z/ then :xls
    when /\.xlsx\z/ then :xlsx
    else raise InvalidFileType, "Unknown file format for path #{path.inspect}"
  end
end
force_guessed_encoding!(string) click to toggle source
# File lib/tabledata/detection.rb, line 25
# Guesses the encoding of +string+ and tags the string with it in place.
# Only the encoding label is changed (force_encoding); the bytes are never
# transcoded. Returns the very same string object.
#
# Order of the heuristics:
# 1. If the bytes are valid UTF-8, the string is tagged UTF-8.
# 2. Otherwise candidates are eliminated by bytes they cannot contain.
# 3. If that is not decisive, umlaut and then diacritic frequencies in the
#    first 10_000 characters decide between MacRoman and Windows-1252.
# 4. On a tie, Windows-1252 wins.
def force_guessed_encoding!(string)
  string.force_encoding(Encoding::UTF_8)
  return string if string.valid_encoding?

  # Not valid UTF-8: inspect the raw bytes from here on.
  string.force_encoding(Encoding::BINARY)

  # Collect the candidates that are not ruled out by non-mapped codepoints.
  # Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D exclude Windows-1252; any byte in
  # 0x80..0x9F excludes ISO-8859-15. MacRoman is always kept.
  candidates = []
  candidates << Encoding::Windows_1252 unless string =~ /[\x81\x8D\x8F\x90\x9D]/n
  candidates << Encoding::ISO8859_15 unless string =~ /[\x80-\x9f]/n
  candidates << Encoding::MacRoman
  return string.force_encoding(candidates.first) if candidates.size == 1

  # NOTE: a weighted check on unlikely characters (the UnlikelyChars*
  # constants, e.g. a "§" is quite unlikely) is currently disabled.

  sample = string[0, 10_000]

  # äöü: whichever encoding's umlaut bytes occur more often wins.
  umlaut_balance = sample.count(UmlautsMac) <=> sample.count(UmlautsWin)
  return string.force_encoding(Encoding::Windows_1252) if umlaut_balance == -1
  return string.force_encoding(Encoding::MacRoman) if umlaut_balance == 1

  # âàéèô: same comparison for the diacritic bytes.
  diacritic_balance = sample.count(DiacritsMac) <=> sample.count(DiacritsWin)
  return string.force_encoding(Encoding::Windows_1252) if diacritic_balance == -1
  return string.force_encoding(Encoding::MacRoman) if diacritic_balance == 1

  # Still undecided: bias for Windows_1252.
  string.force_encoding(Encoding::Windows_1252)
end
guess_csv_delimiter(csv, out_of=[',',';']) click to toggle source
# File lib/tabledata/detection.rb, line 61
# Picks the most likely delimiter of +csv+ from the candidates in +out_of+.
#
# Each candidate is first converted to the encoding of +csv+; the candidate
# occurring most often in the first 10_000 characters is returned (in the
# encoding of +csv+). On a tie the candidate listed first wins; an empty
# candidate list yields nil.
def guess_csv_delimiter(csv, out_of=[',',';'])
  target_encoding = csv.encoding
  sample          = csv[0, 10_000]

  out_of
    .map { |candidate| candidate.encode(target_encoding) }
    .max_by { |candidate| sample.count(candidate) }
end
guess_encoding(string) click to toggle source
# File lib/tabledata/detection.rb, line 57
# Returns the Encoding that force_guessed_encoding! selects for +string+.
# The guess is made on a duplicate, so +string+ itself is left untouched.
def guess_encoding(string)
  scratch = string.dup
  force_guessed_encoding!(scratch)
  scratch.encoding
end