class CsvLazy

A simple library for parsing CSV files through IO objects. It automatically copes with common format corruption, such as files that contain several spaces after a column, and more.

Public Class Methods

new(args = {}, &blk) click to toggle source

Examples

File.open("csvfile.csv", "r") do |fp|
  CsvLazy.new(io: fp, quote_char: '"', col_sep: ";", row_sep: "\n", encode: "utf-8") do |row_array|
    puts "Row: #{row_array}"
  end
end
# File lib/csv_lazy.rb, line 16
def initialize(args = {}, &blk)
  @args = {
    quote_char: '"',
    row_sep: "\n",
    col_sep: ";",
    headers: false,
    buffer_length: 4096
  }.merge(args)

  @io = @args[:io]
  @eof = false
  @buffer = ""
  @debug = @args[:debug]
  @encode = @args[:encode]
  @mutex = Mutex.new
  @buffer_length = @args[:buffer_length]
  @escape_char = "\\"
  @escaped_quote = "#{@escape_char}#{@args[:quote_char]}"
  @escaped_quote_double = "#{@escape_char}#{@escape_char}#{@args[:quote_char]}"

  #@debug = true

  accepted = [:encode, :quote_char, :row_sep, :col_sep, :io, :debug, :headers, :buffer_length]
  @args.each do |key, val|
    raise "Unknown argument: '#{key}'." unless accepted.include?(key)
  end

  raise "No ':quote_char' was given." if @args[:quote_char].to_s.strip.empty?
  raise "No ':col_sep' was given." if @args[:col_sep].to_s.empty?
  raise "No ':row_sep' was given." if @args[:row_sep].to_s.empty?
  raise "No ':io' was given." unless @args[:io]

  @regex_begin_quote_char = /\A\s*#{Regexp.escape(@args[:quote_char])}/

  @regex_row_end = /\A\s*?#{Regexp.escape(@args[:row_sep])}/
  @regex_colsep_next = /\A#{Regexp.escape(@args[:col_sep])}/

  @regex_read_until_quote_char = /\A(.*?)#{Regexp.escape(@args[:quote_char])}/
  @regex_read_until_col_sep = /\A(.*?)#{Regexp.escape(@args[:col_sep])}/
  @regex_read_until_row_sep = /\A(.+?)#{Regexp.escape(@args[:row_sep])}/
  @regex_read_until_end = /\A(.+?)\Z/

  if @args[:headers]
    headers = []
    read_row.each do |key|
      headers << key.to_sym
    end

    @headers = headers
  end

  self.each(&blk) if blk
end

Public Instance Methods

each() { |row| ... } click to toggle source

Yields each row as an array, or as a hash keyed by the header symbols when the `:headers` option is set.

# File lib/csv_lazy.rb, line 71
# Iterates over the remaining rows, holding the instance mutex while reading.
#
# With a block, yields every value returned by read_row until read_row
# returns a falsy value. Without a block, returns an Enumerator that produces
# the same rows lazily.
def each
  # Enumerable is a module and cannot be instantiated; Enumerator is the
  # class that wraps a block-driven iteration.
  return Enumerator.new { |yielder| each { |row| yielder << row } } unless block_given?

  @mutex.synchronize do
    while (row = read_row)
      yield(row)
    end
  end
end
read_row() click to toggle source

Returns the next row.

# File lib/csv_lazy.rb, line 90
# Reads columns until the current row ends and returns the row.
#
# Returns false once the input is exhausted; otherwise an Array of column
# values, or a Hash mapping header symbols to column values when headers were
# read during construction.
def read_row
  @row = []
  while !@eof || !@buffer.empty?
    break unless read_next_col
  end

  row = @row
  @row = nil

  puts "csv_lazy: Row: #{row}\n\n" if @debug

  # @eof only says the IO is drained; rows may still sit in @buffer. An empty
  # row (blank line) therefore ends the data only when the buffer is empty too.
  return false if row.empty? && @eof && @buffer.empty?
  return row unless @headers

  row.each_with_index.each_with_object({}) do |(value, index), hash|
    hash[@headers[index]] = value
  end
end

Private Instance Methods

add_col(str) click to toggle source

Adds a new column to the current row.

# File lib/csv_lazy.rb, line 254
# Appends +str+ as the next column of the row currently being built and
# returns that row array.
def add_col(str)
  @row.push(str)
end
read_buffer() click to toggle source

Reads more content into the buffer.

# File lib/csv_lazy.rb, line 120
# Tops up @buffer from @io, one `gets` call at a time, until it holds at
# least @buffer_length characters or the IO is exhausted (which sets @eof).
# Each chunk is converted with String#encode when @encode is set.
def read_buffer
  until @buffer.length >= @buffer_length || @eof
    chunk = @io.gets

    if chunk.nil?
      @eof = true
      next
    end

    chunk = chunk.encode(@encode) if @encode
    @buffer << chunk
  end
end
read_next_col() click to toggle source

Adds the next column to the row. Returns true if more columns should be read or false if this was the end of the row.

# File lib/csv_lazy.rb, line 165
# Parses the next column from the front of the buffer and appends it to the
# current row.
#
# Returns a truthy value when further columns of the same row should be read,
# and false when the row (or the whole input) has ended. Raises RuntimeError
# when the buffer cannot be interpreted.
def read_next_col
  read_buffer if @buffer.length < @buffer_length
  return false if @buffer.empty? && @eof

  if @buffer.empty? || read_remove_regex(@regex_row_end)
    return false
  elsif read_remove_regex(@regex_begin_quote_char)
    # Quoted column: accumulate content until the closing (unescaped) quote.
    col_content = ""

    loop do
      if read_until_quote_and_end
        break
      elsif (match_read = read_remove_regex(@regex_read_until_quote_char))
        all = match_read[0]
        escaped_quote_char = all[-@escaped_quote.length, @escaped_quote.length]
        double_escaped_quote_char = all[-@escaped_quote_double.length, @escaped_quote_double.length]

        if escaped_quote_char == @escaped_quote && double_escaped_quote_char != @escaped_quote_double
          # Continue reading - the quote char is escaped.
          col_content << all
        else
          col_content << match_read[1]
          add_col(unescape(col_content))
          break
        end
      else
        if @eof
          # No closing quote before end of input: keep what is left as a column.
          add_col(@buffer) unless @buffer.empty?
          @buffer = ""
          break
        else
          read_buffer
        end
      end
    end

    # Refill against the configured buffer size (not a fixed 4096) so the
    # :buffer_length option is honoured here as well.
    read_buffer if @buffer.length < @buffer_length

    if read_remove_regex(@regex_colsep_next)
      return true
    elsif @eof && @buffer.empty?
      puts "csv_lazy: End-of-file and empty buffer." if @debug
      return false
    elsif read_remove_regex(@regex_row_end)
      puts "csv_lazy: Row-end found." if @debug
      return false
    else
      raise "Dont know what to do (#{@buffer.length}): #{@buffer}"
    end
  elsif (match = read_remove_regex(@regex_read_until_col_sep))
    # add_col returns the row array, which is truthy: keep reading columns.
    add_col(match[1])
  elsif (match = read_remove_regex(@regex_read_until_row_sep))
    puts "csv_lazy: Row seperator reached." if @debug
    add_col(match[1])
    return false
  elsif (match = read_remove_regex(@regex_read_until_end))
    # If the very end of the file has been reached, then add this data and stop parsing.
    if @eof
      add_col(match[1])
      return false
    end

    # The end-of-file hasn't been reached. Put the data back, add more data to
    # the buffer and try again.
    @buffer << match[0]
    read_buffer
    raise Errno::EAGAIN
  else
    raise "Dont know what to do with buffer: '#{@buffer}'."
  end
rescue Errno::EAGAIN
  # Errno::EAGAIN is raised above purely to restart this method via retry.
  puts "csv_lazy: Retry! Probably we ran out of buffer..." if @debug
  retry
end
read_remove_regex(regex) click to toggle source

Runs a regex against the buffer. If matched it also removes it from the buffer.

# File lib/csv_lazy.rb, line 134
# Matches +regex+ against the buffer and, on success, removes the matched
# span from the buffer.
#
# Returns the MatchData on success and false when the regex does not match.
# Raises RuntimeError when the match was empty (the buffer did not shrink),
# since callers rely on every successful match consuming input.
def read_remove_regex(regex)
  match = @buffer.match(regex)
  return false unless match

  oldbuffer = @buffer
  # Drop exactly the matched span; re-running the pattern with gsub would
  # rescan the buffer and strip any further occurrences as well.
  @buffer = match.pre_match + match.post_match

  if @debug
    print "csv_lazy: Regex: #{regex.to_s}\n"
    print "csv_lazy: Match: #{match.to_a}\n"
    print "csv_lazy: Buffer before: #{oldbuffer}\n"
    print "csv_lazy: Buffer after: #{@buffer}\n"
    print "\n"
  end

  raise "Buffer was the same before regex?" if oldbuffer == @buffer
  match
end
read_until_quote_and_end() click to toggle source
# File lib/csv_lazy.rb, line 241
# Fast path for a quoted column: looks for the shortest prefix of the buffer
# that is followed by the quote character and then a column or row separator.
#
# On success the prefix and the quote are removed from the buffer (the
# separator stays), escaped quotes in the prefix are unescaped, the result is
# added as a column and true is returned. Returns false when no such prefix
# exists.
#
# NOTE(review): the pattern does not look at the escape character, so an
# escaped quote directly followed by a separator is taken as the closing
# quote - confirm whether that is intended.
def read_until_quote_and_end
  quote = @args[:quote_char]
  pattern = /\A(.*?)#{Regexp.escape(quote)}(#{Regexp.escape(@args[:col_sep])}|#{Regexp.escape(@args[:row_sep])})/
  found = @buffer.match(pattern)
  return false unless found

  raw = found[1]
  # The match is anchored at the buffer start, so the consumed text is simply
  # the first raw.length + quote.length characters.
  @buffer = @buffer[(raw.length + quote.length)..-1]
  add_col(raw.gsub(/\\#{Regexp.escape(quote)}/, quote)) # Remove escapes from escaped quotes
  true
end
unescape(str) click to toggle source
# File lib/csv_lazy.rb, line 154
# Returns a copy of +str+ with backslash escape sequences replaced by the
# characters they stand for: \\ , \t , \n , \r , \" and the escaped form of
# the configured :quote_char.
#
# The replacement is done in a single left-to-right pass, so the output of one
# replacement is never re-interpreted as the start of another sequence.
def unescape(str)
  replacements = {
    "\\\\" => "\\",
    "\\t" => "\t",
    "\\n" => "\n",
    "\\r" => "\r",
    "\\\"" => "\""
  }

  # Honour a non-default quote character as well, matching how escaped quotes
  # are detected elsewhere in the parser.
  quote_char = (@args && @args[:quote_char]).to_s
  replacements["\\#{quote_char}"] = quote_char unless quote_char.empty?

  # Core String#gsub with a Hash looks up each matched sequence directly,
  # so no String extension is required.
  str.gsub(Regexp.union(replacements.keys), replacements)
end