class CsvReader::ParserStrict

Constants

BACKSLASH

char constants

CR
LF

Attributes

config[R]

Public Class Methods

build_logger() click to toggle source

Adds a simple logger with a debug flag/switch.

Use Parser.debug = true to turn on debug output.

todo/fix: use logutils instead of the standard library Logger - why? why not?

# File lib/csvreader/parser_strict.rb, line 22
## Builds a fresh standard library Logger that writes to STDOUT.
##   note: the level gets raised to :info right away
##         (a new Logger starts at 0, that is, debug).
def self.build_logger()
  Logger.new( STDOUT ).tap do |log|
    log.level = :info
  end
end
logger() click to toggle source
# File lib/csvreader/parser_strict.rb, line 27
## Returns the logger kept in the @@logger class variable;
##   built via build_logger on first use and reused afterwards.
def self.logger()
  @@logger ||= build_logger
end
new( sep: ',', quote: '"', doublequote: true, escape: false, null: nil, comment: false ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 34
## Stores all dialect options in the @config hash.
##
##   sep         - field separator (default: ',')
##   quote       - quote char (default: '"'); set to false/nil for no quote
##   doublequote - true/false (default: true)
##   escape      - true/false (default: false)
##   null        - set to nil for no null values / not available (na)
##   comment     - comment char e.g. # or false/nil (default: false)
def initialize( sep: ',', quote: '"', doublequote: true, escape: false, null: nil, comment: false )
  ## todo/fix: change config to proper dialect class/struct - why? why not?
  @config = {
    sep:         sep,
    quote:       quote,
    doublequote: doublequote,
    escape:      escape,
    null:        null,
    comment:     comment,
  }
end

Public Instance Methods

comment=( value ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 60
## Config convenience writer - stores value in @config[:comment].
def comment=( value )
  @config[:comment] = value
end
doublequote=( value ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 57
## Config convenience writer - stores value in @config[:doublequote].
def doublequote=( value )
  @config[:doublequote] = value
end
escape=( value ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 58
## Config convenience writer - stores value in @config[:escape].
def escape=( value )
  @config[:escape] = value
end
logger() click to toggle source
# File lib/csvreader/parser_strict.rb, line 28
## Instance-level shortcut - hands back the logger of the object's class.
def logger()
  self.class.logger
end
null=( value ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 59
## Config convenience writer - stores value in @config[:null].
def null=( value )
  @config[:null] = value
end
parse( data, sep: config[:sep], &block ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 64
## Parses data into records.
##
##   data - a Buffer gets (re)used as is (allows a buffer managed from "outside");
##          anything else (note: a String or IO object) gets wrapped into a new Buffer
##   sep  - field separator; defaults to config[:sep]
##
## With a block every record gets passed on to the block (via parse_lines);
##  without a block all records get collected and returned as an array.
def parse( data, sep: config[:sep], &block )
  ## make sure data (string or io) is wrapped into a Buffer
  input = data.is_a?( Buffer ) ? data : Buffer.new( data )

  return parse_lines( input, sep: sep, &block )  if block_given?

  collected = []
  parse_lines( input, sep: sep ) { |record| collected << record }
  collected
end
quote=( value ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 56
## Config convenience writer - stores value in @config[:quote].
def quote=( value )
  @config[:quote] = value
end
sep=( value ) click to toggle source

Config convenience helpers.

For example, use  Csv.mysql.sep = ','   instead of
                  Csv.mysql.config[:sep] = ','
# File lib/csvreader/parser_strict.rb, line 55
## Config convenience writer - stores value in @config[:sep].
def sep=( value )
  @config[:sep] = value
end

Private Instance Methods

is_null?( value ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 273
## Checks value against the null setting in @config[:null].
##
##   nil    - nothing set; always false (not null)
##   Proc   - result of calling the proc with value
##   Array  - true if the array includes value
##   String - true if value equals the string
##   other  - unknown config style / setting; always false
##            todo: issue a warning or error - why? why not?
def is_null?( value )
  setting = @config[:null]

  case setting
  when nil    then false
  when Proc   then setting.call( value )
  when Array  then setting.include?( value )
  when String then value == setting
  else             false
  end
end
parse_escape( input, sep: ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 93
## Reads one backslash escape sequence from input and returns it as a string.
##
## The input must be positioned on a backslash; otherwise a ParseError gets raised.
## If the char following the backslash is a backslash, lf, cr, the separator
##  or the configured quote char, it gets consumed and returned (without the backslash);
##  for any other char only the backslash gets returned and
##  the char stays in the input (no special handling/escaping).
def parse_escape( input, sep: )
  unless input.peek == BACKSLASH
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
  end

  input.getc   ## eat-up backslash

  quote     = config[:quote]
  following = input.peek
  escapable = following == BACKSLASH ||
              following == LF        ||
              following == CR        ||
              following == sep       ||
              (quote && following == quote)

  escaped = ""
  escaped << (escapable ? input.getc : BACKSLASH)
  escaped
end
parse_field( input, sep: ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 150
## Parses a single field (value) and returns it as a string - or as nil
##  if is_null? says so for an UNQUOTED value.
##
## Three cases:
##   1) empty unquoted field - input sits on the separator, lf, cr or at the end
##   2) quoted field         - input sits on the configured quote char; see parse_quote
##   3) regular field        - chars get consumed up to the separator, lf, cr,
##                             the end of input or a stray quote
##
## The terminating char (separator, newline, stray quote) is NOT consumed.
def parse_field( input, sep: )
  quote  = config[:quote]
  escape = config[:escape]

  logger.debug "parse field - sep: >#{sep}< (#{sep.ord})"  if logger.debug?

  field = ""
  head  = input.peek

  if head == sep || head == LF || head == CR || input.eof?
    ## empty unquoted field
    ##   note: allows null = '' that is turn unquoted empty strings into null/nil
    field = nil   if is_null?( field )
  elsif quote && input.peek == quote
    logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
    field << parse_quote( input, sep: sep )
    logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
  else
    logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?

    ## consume simple value
    ##   until we hit the separator or "\n" or "\r" or a stray quote
    until (ch = input.peek; ch == sep || ch == LF || ch == CR || input.eof? || (quote && ch == quote))
      if escape && input.peek == BACKSLASH
        field << parse_escape( input, sep: sep )
      else
        logger.debug "  add char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
        field << input.getc
      end
    end

    ## note: null check only for UNQUOTED (not quoted/escaped) values
    field = nil  if is_null?( field )

    logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
  end

  field
end
parse_lines( input, sep:, &block ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 243
## Walks the input record by record and passes every parsed record on to the block.
##
##   input - buffer-like object (must respond to eof? and peek here)
##   sep   - field separator; passed on to parse_record
##
## Returns an Enumerator over the records if no block is given
##   (before, a missing block ended in a NoMethodError on nil -
##    and only AFTER the first record got consumed from the input).
##
## Follows the strict rules:
##   - no leading and trailing whitespaces trimmed/stripped
##   - no blanks skipped
##   - lines starting with the comment char get skipped -
##      but only if a comment char is configured in config[:comment]
##      (the default set in initialize is false, that is, no comments)
##
##  note: this csv format is NOT recommended;
##    please, use a format with comments, leading and trailing whitespaces, etc.
##    only added for checking compatibility
def parse_lines( input, sep:, &block )
  return enum_for( :parse_lines, input, sep: sep )  unless block

  comment = config[:comment]

  loop do
    break if input.eof?

    logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?

    if comment && input.peek == comment        ## comment line
      logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    else
      record = parse_record( input, sep: sep )
      block.call( record )
    end
  end  # loop
end
parse_quote( input, sep: ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 114
## Parses a quoted value and returns its content (without the enclosing quotes).
##
## The input must be positioned on the configured quote char (config[:quote]);
##  otherwise a ParseError gets raised.
##
##   - a doubled up quote gets turned into a single quote char
##       if config[:doublequote] is on
##   - backslash escapes get handled by parse_escape
##       if (and only if) config[:escape] is on
##   - if the input ends before the closing quote shows up,
##       the value read so far gets returned (no error raised)
##
## Fixes:
##   - the error message now names parse_quote and the configured quote char
##       (was: parse_double_quote and a hard-coded ")
##   - a backslash only starts an escape sequence if escape is on
##       (before, a backslash used as quote char got mistaken for an escape)
def parse_quote( input, sep: )
  quote       = config[:quote]         # char (e.g.",') | nil
  doublequote = config[:doublequote]   # true|false
  escape      = config[:escape]        # true|false

  if input.peek != quote
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
  end

  value = ""
  input.getc  ## eat-up opening quote

  loop do
    ## eat-up everything until hitting a quote, the end of input or a backslash (escape)
    until (c=input.peek; c==quote || input.eof? || (escape && c==BACKSLASH))
      value << input.getc
    end

    if input.eof?
      break
    elsif escape && input.peek == BACKSLASH
      value << parse_escape( input, sep: sep )
    else   ## input.peek == quote
      input.getc ## eat-up quote
      if doublequote && input.peek == quote  ## doubled up quote?
        value << input.getc   ## add quote and continue
      else
        break
      end
    end
  end

  value
end
parse_record( input, sep: ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 192
## Parses one record (line) and returns its values as an array.
##
## Fields get read with parse_field one after another; after every field
##   - the end of input ends the record
##   - a newline (lf or cr) gets skipped via skip_newline and ends the record
##       note: singular / single newline only (NOT plural)
##   - the separator gets consumed and the next field follows
##   - anything else raises a ParseError
def parse_record( input, sep: )
  fields = []

  loop do
    field = parse_field( input, sep: sep )
    logger.debug "value: »#{field}«"  if logger.debug?
    fields << field

    break  if input.eof?

    upcoming = input.peek
    if upcoming == LF || upcoming == CR
      skip_newline( input )
      break
    end

    unless input.peek == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
    end

    input.getc   ## eat-up FS (,)
  end

  fields
end
skip_newline( input ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 217
## Skips a single newline (if any) - note: singular (strict) version.
##   only cr lf or lf or cr gets eaten up;
##   does nothing at the end of input or if the next char is no newline.
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc                          ## eat-up cr
    input.getc  if input.peek == LF     ## and the lf of a cr lf pair
  when LF
    input.getc                          ## eat-up lf
  end
end
skip_until_eol( input ) click to toggle source
# File lib/csvreader/parser_strict.rb, line 233
## Eats up all chars up to (but NOT including) the next lf or cr
##   or up to the end of input.
def skip_until_eol( input )
  return if input.eof?

  until (ch = input.peek; ch == LF || ch == CR || input.eof?)
    input.getc
  end
end