module TableData::Detection
Constants
- DiacritsMac
- DiacritsWin
- UmlautsMac
- UmlautsWin
- UnlikelyCharsIso8859_1
- UnlikelyCharsMacRoman
- UnlikelyCharsWin1252
Public Instance Methods
file_type_from_path(path)
click to toggle source
# File lib/tabledata/detection.rb, line 67
#
# Determines the table file type from the extension at the very end of +path+.
#
# path - the file path (or name) to inspect.
#
# Returns :csv, :xls or :xlsx.
# Raises InvalidFileType when the path ends in none of the known extensions.
def file_type_from_path(path)
  # \z anchors at the true end of the string. `$` would also match before any
  # embedded newline, so "report.csv\nwhatever" would wrongly count as CSV.
  case path
  when /\.csv\z/  then :csv
  when /\.xls\z/  then :xls
  when /\.xlsx\z/ then :xlsx
  else raise InvalidFileType, "Unknown file format for path #{path.inspect}"
  end
end
force_guessed_encoding!(string)
click to toggle source
# File lib/tabledata/detection.rb, line 25
#
# Guesses the encoding of +string+ and re-tags the string with it IN PLACE
# (the bytes are never transcoded, only the encoding label changes).
#
# Strategy, in order:
# 1. If the bytes are valid UTF-8, keep UTF-8.
# 2. Otherwise rule out candidates by bytes they cannot contain; if exactly
#    one candidate remains, use it.
# 3. Otherwise compare how often the Mac vs. Windows umlaut bytes occur in
#    the first 10_000 bytes, then the same for other diacritics.
# 4. Fall back to Windows-1252.
#
# string - a mutable String; its encoding is modified.
#
# Returns the same String object that was passed in.
def force_guessed_encoding!(string)
  return string if string.force_encoding(Encoding::UTF_8).valid_encoding?

  # From here on treat the data as raw bytes so the byte regexes and the
  # byte-wise sample below are well defined.
  string.force_encoding(Encoding::BINARY)

  # check for non-mapped codepoints
  possible_encodings = [Encoding::Windows_1252, Encoding::ISO8859_15, Encoding::MacRoman]
  possible_encodings.delete(Encoding::ISO8859_15) if string =~ /[\x80-\x9f]/n
  possible_encodings.delete(Encoding::Windows_1252) if string =~ /[\x81\x8D\x8F\x90\x9D]/n
  return string.force_encoding(possible_encodings.first) if possible_encodings.size == 1

  # # check for occurrences of characters with weighted expectancy
  # # e.g. a "§" is quite unlikely
  # win = string[0,10_000].count(UnlikelyCharsWin1252)
  # iso = string[0,10_000].count(UnlikelyCharsIso8859_1)
  # mac = string[0,10_000].count(UnlikelyCharsMacRoman)

  # Only the head of the data is inspected; slice it once and reuse it for
  # every heuristic instead of re-slicing for each count.
  sample = string[0, 10_000]

  # Check occurrences of äöü
  # NOTE(review): UmlautsMac/UmlautsWin are defined elsewhere in this module;
  # presumably the byte sets for these letters in each encoding - confirm.
  case sample.count(UmlautsMac) <=> sample.count(UmlautsWin)
  when -1 then return string.force_encoding(Encoding::Windows_1252)
  when 1 then return string.force_encoding(Encoding::MacRoman)
  end

  # Check occurrences of âàéèô
  case sample.count(DiacritsMac) <=> sample.count(DiacritsWin)
  when -1 then return string.force_encoding(Encoding::Windows_1252)
  when 1 then return string.force_encoding(Encoding::MacRoman)
  end

  # Bias for Windows_1252
  string.force_encoding(Encoding::Windows_1252)
end
guess_csv_delimiter(csv, out_of=[',',';'])
click to toggle source
# File lib/tabledata/detection.rb, line 61
#
# Picks the most frequent delimiter candidate within the first 10_000
# characters of +csv+.
#
# csv    - the CSV data as a String.
# out_of - candidate delimiter Strings (default: comma and semicolon).
#
# Returns the winning candidate, transcoded to csv's encoding. On a tie the
# earlier candidate in +out_of+ wins; returns nil when +out_of+ is empty.
def guess_csv_delimiter(csv, out_of=[',',';'])
  sample     = csv[0, 10_000]
  candidates = out_of.map { |delimiter| delimiter.encode(csv.encoding) }

  candidates.max_by do |delimiter|
    # Count literal occurrences. String#count would treat the delimiter as a
    # character *set*, miscounting multi-character delimiters such as "::"
    # and giving "^" / "-" special meaning. An empty candidate scores 0.
    delimiter.empty? ? 0 : sample.scan(delimiter).size
  end
end
guess_encoding(string)
click to toggle source
# File lib/tabledata/detection.rb, line 57
#
# Non-destructive variant of force_guessed_encoding!: reports the guessed
# encoding without modifying +string+.
#
# string - the String whose encoding should be guessed.
#
# Returns the Encoding that force_guessed_encoding! assigns to a copy of
# +string+.
def guess_encoding(string)
  # Work on a duplicate so the caller's string keeps its encoding label.
  guessed = force_guessed_encoding!(string.dup)
  guessed.encoding
end