module WtfCSV

Constants

VERSION

Public Class Methods

scan(file, options = {}) { |percent_done| ... }
# File lib/wtf_csv/wtf_csv.rb, line 2
def WtfCSV.scan(file, options = {}, &block)
  default_options = {
    :col_sep => ',',
    :row_sep => $/,
    :quote_char => '"',
    :escape_char => '\\',
    :check_col_count => true,
    :col_threshold => 80,
    :num_cols => 0,
    :ignore_string => nil,
    :allow_row_sep_in_quoted_fields => false,
    :max_chars_in_field => nil,
    :file_encoding => 'utf-8',
  }
  options = default_options.merge(options)
  
  f = File.open(file, "r:#{options[:file_encoding]}")
  trgt_line_count = `wc -l "#{file}"`.strip.split(' ')[0].to_i if block_given?
  
  # credit to tilo, author of smarter_csv, on how to loop over lines without reading whole file into memory
  old_row_sep = $/
  $/ = options[:row_sep]
  
  quote_errors = Array.new
  encoding_errors = Array.new
  column_errors = Array.new
  column_counts = Array.new if options[:check_col_count]
  length_errors = Array.new
  
  line_number = 0
  col_number = 0
  percent_done = 0
  previous_line = ""
  last_line_ended_quoted = false if options[:allow_row_sep_in_quoted_fields]
  field_length = 0 if ! options[:max_chars_in_field].nil?
  
  begin
    while ! f.eof?
      line = f.readline
      begin
        if block_given? and ((line_number.to_f / trgt_line_count)*100).to_i > percent_done
          percent_done = ((line_number.to_f / trgt_line_count)*100).to_i
          yield percent_done
        end
      
        line.chomp!
        
        next if ! options[:ignore_string].nil? and line == options[:ignore_string]
        
        if options[:allow_row_sep_in_quoted_fields] and last_line_ended_quoted
          line_number -= 1
          last_line_ended_quoted = false
          field_length += options[:row_sep].length if ! options[:max_chars_in_field].nil?
        else
          is_quoted = false
          new_col = true
          quote_has_ended = false
          quote_error = false
          escape_char = false
          col_number = 0
        end
        pos_start = 0
        
        line.each_char.with_index do |char, position|
          begin
            char.ord  # this is here to check encoding. if the encoding is bad this will throw an exception
            
            field_length += 1 if ! options[:max_chars_in_field].nil?
            
            if escape_char and options[:escape_char] == options[:quote_char] and char != options[:quote_char]
              escape_char = false
              is_quoted = ! is_quoted
              if ! is_quoted
                quote_has_ended = true
              elsif ! new_col
                quote_error = true
                is_quoted = false
              end
            end
            
            if char != options[:quote_char] and char != options[:col_sep] and char != options[:escape_char] ## escape_char part
              new_col = false
              if quote_has_ended
                quote_error = true
              end
            elsif char == options[:quote_char] and escape_char
              escape_char = false
            elsif char == options[:escape_char]
              escape_char = true
            elsif char == options[:quote_char] and is_quoted
              quote_has_ended = true
              is_quoted = false
            elsif char == options[:quote_char]
              if new_col
                is_quoted = true
                new_col = false
              else
                quote_error = true
              end
            elsif char == options[:col_sep] and ! is_quoted
              if quote_error
                quote_errors.push([line_number + 1,col_number + 1,"#{previous_line}#{line[pos_start..(position - 1)]}"])
                quote_error = false
              end
              if ! options[:max_chars_in_field].nil?
                length_errors.push([line_number + 1,col_number + 1,field_length - 1]) if (field_length - 1) > options[:max_chars_in_field]
                field_length = 0
              end
              new_col = true
              quote_has_ended = false
              previous_line = ""
              pos_start = position + 1
              col_number += 1
            end
          rescue Exception => e
            if e.message == 'invalid byte sequence in UTF-8'
              encoding_errors.push([line_number + 1,col_number + 1])
            end
          end
        end
        
        if escape_char and options[:escape_char] == options[:quote_char]
          if ! new_col and ! is_quoted
            quote_error = true
          else
            is_quoted = ! is_quoted
          end
        end
        
        if is_quoted
          if options[:allow_row_sep_in_quoted_fields]
            last_line_ended_quoted = true
            previous_line = "#{previous_line}#{line[pos_start...line.length]}#{options[:row_sep]}"
            next
          else
            quote_error = true
          end
        end
        
        quote_errors.push([line_number + 1,col_number + 1,line[pos_start..line.length]]) if quote_error
        
        if ! options[:max_chars_in_field].nil?
          length_errors.push([line_number + 1,col_number + 1,field_length]) if field_length > options[:max_chars_in_field]
          field_length = 0
        end
        
        if options[:check_col_count]
          fnd = false
          column_counts.each do |val|
            if val[0] == col_number + 1
              val[1].push(line_number)
              fnd = true
              break
            end
          end
          
          if ! fnd
            column_counts.push([col_number + 1, [line_number + 1]])
          end
        end
        
      rescue Exception => e
        # don't do anything
      ensure
        line_number += 1
      end
    end
  ensure
    $/ = old_row_sep
  end
  
  if options[:check_col_count]
    column_counts.sort_by! { |val| val[1].length }
    column_counts.reverse!
    
    # if we're looking for an absolute number...
    if options[:num_cols] != 0
      column_counts.each do |val|
        if val[0] != options[:num_cols]
          val[1].each { |row| column_errors.push([row,val[0],options[:num_cols]]) }
        end
      end
    
    # else we'll try to figure out the target number of columns with :col_threshold
    elsif column_counts.length > 1
      if column_counts[0][1].length >= line_number * (options[:col_threshold].to_f / 100)
        column_counts.drop(1).each { |val| val[1].each { |row| column_errors.push([row,val[0],column_counts[0][0]]) } }
      else
        column_counts.each { |val| column_errors.push([val[0],val[1].length]) }
      end
    end
  end

  return {:quote_errors => quote_errors,
          :encoding_errors => encoding_errors,
          :column_errors => column_errors,
          :length_errors => length_errors}
  
end