class CsvReader::ParserStd

Constants

BACKSLASH
COMMENT_HASH
COMMENT_PERCENT
CR
DIRECTIVE
DOUBLE_QUOTE

char constants

LF
SEPARATORS
SINGLE_QUOTE
SPACE
TAB

Attributes

config[R]
meta[R]

Public Class Methods

build_logger() click to toggle source

add simple logger with debug flag/switch

use Parser.debug = true # to turn on

todo/fix: use logutils instead of std logger - why? why not?

# File lib/csvreader/parser_std.rb, line 33
## Build the logger used by the parser - a std Logger writing to STDOUT.
##   note: the level gets set to :info on start (Logger itself uses 0 / debug by default),
##         that is, debug messages stay off until the level gets lowered.
def self.build_logger()
  Logger.new( STDOUT ).tap do |log|
    log.level = :info
  end
end
logger() click to toggle source
# File lib/csvreader/parser_std.rb, line 38
## Return the shared logger; gets built (via build_logger) on first use
##   and is kept in the @@logger class variable afterwards.
def self.logger()
  @@logger ||= build_logger
end
new( sep: ',', null: ['\N', 'NA'], numeric: false, nan: nil, space: nil, hashtag: false ) click to toggle source

todo/check:

null values - include NA - why? why not?
    make null values case sensitive or add an option for case sensitive
    or better allow a proc as option for checking too!!!
# File lib/csvreader/parser_std.rb, line 52
## Set up the parser configuration (stored as a hash in config).
##
##  sep     - field separator; gets validated by check_sep (raises ArgumentError if unsupported)
##  null    - null value(s); set to nil for no null values / not available (na);
##              note: null values must get handled by the parser -
##              only get checked for unquoted strings (and NOT for quoted strings);
##              "higher-level" code only knows about strings and has no longer any info
##              if the string was quoted or unquoted
##  numeric - (auto-)convert all non-quoted values to float
##  nan     - note: only if numeric - mappings for Float::NAN (not a number) values
##  space   - treat/convert char(s) to space e.g. _-+• etc   e.g.  Man_Utd => Man Utd
##              or use it for leading and trailing spaces without quotes
##              todo/check: only use for unquoted values? why? why not?
##  hashtag - hxl - humanitarian eXchange language uses a hashtag row for "meta data"
##              e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
##              do NOT treat # as a comment (always use % for now)
def initialize( sep:      ',',
                null:     ['\N', 'NA'],
                numeric:  false,
                nan:      nil,
                space:    nil,
                hashtag:  false
              )
  @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?

  check_sep( sep )

  ## note: keeps the key order sep, null, numeric, nan, space, hashtag
  @config.update( sep:     sep,
                  null:    null,
                  numeric: numeric,
                  nan:     nan,
                  space:   space,
                  hashtag: hashtag )

  @meta = nil     ## no meta data block   (use empty hash {} - why? why not?)
end

Public Instance Methods

check_sep( sep ) click to toggle source
# File lib/csvreader/parser_std.rb, line 88
## Check if the separator is supported (that is, included in SEPARATORS).
##   note: parse does NOT support space or tab as separator!!
##     leading and trailing space or tab (whitespace) gets by default trimmed
##       unless quoted (or alternative space char used e.g. _-+ if configured)
##
## Returns nil if ok; raises ArgumentError otherwise.
def check_sep( sep )
  return  if SEPARATORS.include?( sep )   ## everything ok

  raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
end
hashtag=( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 111
## Config convenience helper - sets config[:hashtag].
def hashtag=( value )
  @config[:hashtag] = value
end
logger() click to toggle source
# File lib/csvreader/parser_std.rb, line 39
## Instance-level shortcut for the class-level logger.
def logger()
  self.class.logger
end
nan=( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 109
## Config convenience helper - sets config[:nan].
def nan=( value )
  @config[:nan] = value
end
null=( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 107
## Config convenience helper - sets config[:null].
def null=( value )
  @config[:null] = value
end
numeric=( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 108
## Config convenience helper - sets config[:numeric].
def numeric=( value )
  @config[:numeric] = value
end
parse( str_or_readable, sep: config[:sep], &block ) click to toggle source
# File lib/csvreader/parser_std.rb, line 116
## Parse the passed-in data into records.
##
##  str_or_readable - data to parse; gets wrapped into a Buffer
##                      (unless it is a Buffer already, that is, managed from "outside")
##  sep             - field separator (defaults to config[:sep]); gets validated by check_sep
##
## With a block every record gets passed on to the block (returns whatever parse_lines returns);
##   without a block all records get collected and returned as an array.
def parse( str_or_readable, sep: config[:sep], &block )
  check_sep( sep )

  ##   make sure data (string or io) is wrapped into a Buffer!!!!!!
  input = str_or_readable.is_a?( Buffer ) ? str_or_readable : Buffer.new( str_or_readable )

  return parse_lines( input, sep: sep, &block )   if block_given?

  records = []
  parse_lines( input, sep: sep ) { |record| records << record }
  records
end
sep=( value ) click to toggle source

config convenience helpers

e.g. use like  Csv.default.null = '\N'   etc.   instead of
               Csv.default.config[:null] = '\N'
# File lib/csvreader/parser_std.rb, line 105
## Config convenience helper - sets config[:sep];
##   the value gets validated by check_sep first (raises on unsupported separators).
def sep=( value )
  check_sep( value )
  @config[:sep] = value
end
space=( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 110
## Config convenience helper - sets config[:space].
def space=( value )
  @config[:space] = value
end

Private Instance Methods

convert_to_float( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 546
## Convert the value to a float using Float();
##   fails silently, that is, returns the value unchanged if it cannot get converted.
def convert_to_float( value )
  Float( value )
rescue
  value
end
is_nan?( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 548
## Check if the value is a not a number (NaN) value as configured in config[:nan].
##   nil     - nothing set; always false
##   Proc    - returns whatever the proc returns for the value
##   Array   - true if the array includes the value
##   String  - true if the value equals the string
##   else    - unknown config style / setting; always false
##               todo: issue a warning or error - why? why not?
def is_nan?( value )
  case (nan = @config[:nan])
  when nil     then false
  when Proc    then nan.call( value )
  when Array   then nan.include?( value )
  when String  then value == nan
  else              false
  end
end
is_null?( value ) click to toggle source
# File lib/csvreader/parser_std.rb, line 565
## Check if the value is a null value as configured in config[:null].
##   nil     - nothing set; always false
##   Proc    - returns whatever the proc returns for the value
##   Array   - true if the array includes the value
##   String  - true if the value equals the string
##   else    - unknown config style / setting; always false
##               todo: issue a warning or error - why? why not?
def is_null?( value )
  case (null = @config[:null])
  when nil     then false
  when Proc    then null.call( value )
  when Array   then null.include?( value )
  when String  then value == null
  else              false
  end
end
parse_escape( input, sep: ) click to toggle source
# File lib/csvreader/parser_std.rb, line 148
## Parse a backslash escape sequence at the current input position.
##
##  input - buffer positioned on the backslash
##  sep   - field separator (can get escaped too)
##
## Returns the escaped char if the backslash is followed by a backslash, newline (lf or cr),
##   the separator or a quote (double or single) - the char gets consumed;
##   otherwise (unknown escape sequence) returns the backslash itself
##   and leaves the following char in the input (no special handling/escaping).
## Raises ParseError if the input is not positioned on a backslash.
def parse_escape( input, sep: )
  unless input.peek == BACKSLASH
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
  end

  input.getc ## eat-up backslash

  value = ""
  c = input.peek
  if c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE
    logger.debug "  add escaped char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
    value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
  else
    ## unknown escape sequence; no special handling/escaping
    ##   note: log message fixed - it is a backslash that gets added (not a "backspace")
    logger.debug "  add backslash (unknown escape seq) >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
    value << BACKSLASH
  end
  value
end
parse_field( input, sep: ) click to toggle source
# File lib/csvreader/parser_std.rb, line 222
## Parse one field (value) starting at the current input position.
##
##   - leading spaces and tabs get skipped
##   - empty field (separator, newline or end of input follows):
##       returns "" - or nil / Float::NAN if "" is configured as null / (in numeric mode) nan value
##   - quoted field ("..", '..', «..», »..«, ‹..›, ›..‹): gets read by parse_quote;
##       spaces and tabs after the closing quote get skipped;
##       for double quotes only - trailing data after the closing quote gets "auto-fixed", that is,
##       the value gets reconstructed as a non-quoted value
##   - unquoted field: gets read until separator, newline or end of input;
##       backslash escapes get handled by parse_escape;
##       trailing spaces and tabs get stripped;
##       null, nan and numeric (float) conversion apply to unquoted values only
##
## Returns a string, nil (null value), Float::NAN or whatever the numeric conversion returns.
def parse_field( input, sep: )
  value = ""

  numeric = config[:numeric]
  hashtag = config[:hashtag]

  logger.debug "parse field"  if logger.debug?

  skip_spaces( input )   ## strip leading spaces

  if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
    ## note: allows null = '' that is turn unquoted empty strings into null/nil
    ##   or if using numeric into NotANumber (NaN)
    if is_null?( value )
      value = nil
    elsif numeric && is_nan?( value )  ## todo: check - how to handle numeric? return nil, NaN, or "" ???
      value = Float::NAN
    end
    ## else: do nothing - keep value as is :-) e.g. "".
  elsif input.peek == DOUBLE_QUOTE
    logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
    value << parse_quote( input, sep: sep,
                                 opening_quote: DOUBLE_QUOTE,
                                 closing_quote: DOUBLE_QUOTE )

    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
    spaces_count = skip_spaces( input )

    ##  check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
    ##   todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
    if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
      ## everything ok (that is, regular quoted value)!!!
    else
      ## try auto-fix
      ##   todo: report warning/issue error (if configured)!!!
      extra_value = parse_field_until_sep( input, sep: sep )
      ## "reconstruct" non-quoted value
      spaces = ' ' * spaces_count   ## todo: preserve tab (\t) - why? why not?
      ## note: minor (theoretical) issue (doubled quotes got "collapsed/escaped" to one from two in quoted value)
      ##    e.g. "hello """ extra,  (becomes)=>  "hello "" extra (one quote less/"eaten up")
      value = %Q{"#{value}"#{spaces}#{extra_value}}
    end

    logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
  elsif input.peek == SINGLE_QUOTE    ## allow single quote too (by default)
    logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
    value << parse_quote( input, sep: sep,
                                 opening_quote: SINGLE_QUOTE,
                                 closing_quote: SINGLE_QUOTE )

    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
    skip_spaces( input )
    logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
  elsif input.peek == "«"
    value << parse_quote( input, sep: sep,
                                 opening_quote: "«",
                                 closing_quote: "»" )
    skip_spaces( input )
  elsif input.peek == "»"
    value << parse_quote( input, sep: sep,
                                 opening_quote: "»",
                                 closing_quote: "«" )
    skip_spaces( input )
  elsif input.peek == "‹"
    value << parse_quote( input, sep: sep,
                                 opening_quote: "‹",
                                 closing_quote: "›" )
    skip_spaces( input )
  elsif input.peek == "›"
    value << parse_quote( input, sep: sep,
                                 opening_quote: "›",
                                 closing_quote: "‹" )
    skip_spaces( input )
  else
    logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
    ## consume simple value
    ##   until we hit "," or "\n" or "\r"
    ##    note: will eat-up quotes too!!!
    while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
      if input.peek == BACKSLASH
        value << parse_escape( input, sep: sep )
      ###   check for end-of-line comments (e.g. # ...)
      ##    note: quick hack for now
      ##      will NOT work in hashtag (hxl) mode and for % comments
      ##      for now ALWAYS assumes # for comments
      ##      and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quoted values) for now
      ##    todo/fix: note: require leading space for comment hash (#) for now - why? why not?
      ##                    require trailing space after comment hash (#) - why? why not?
      elsif !hashtag && input.peek == COMMENT_HASH &&
            (value.empty? || value[-1] == ' ')
        ## eat-up everything until end-of-line (eol)
        skip_until_eol( input )
      else
        logger.debug "  add char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
        value << input.getc   ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
      end
    end
    ##  note: only strip **trailing** spaces (space and tab only)
    ##    do NOT strip newlines etc. might have been added via escape! e.g. \\\n
    ##  note: MUST anchor with \z (end of string) and NOT with $ (end of line);
    ##    $ matches before an embedded (escaped) newline too - spaces in front of that newline
    ##    would get stripped and the real trailing spaces would be kept
    value = value.sub( /[ \t]+\z/, '' )

    if is_null?( value )   ## note: null check only for UNQUOTED (not quoted/escaped) values
      value = nil
    elsif numeric
      if is_nan?( value )
        value = Float::NAN
      elsif numeric.is_a?( Proc )
        ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
        value = numeric.call( value )   ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
      else
        value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
      end
    end
    ## else: do nothing - keep value as is :-).

    logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
  end

  value
end
parse_field_until_sep( input, sep: ) click to toggle source
# File lib/csvreader/parser_std.rb, line 200
## Read an unquoted value until the separator, a newline (lf or cr) or the end of input.
##   note: will eat-up quotes too!!!
##   backslash escapes get handled by parse_escape; trailing spaces and tabs get stripped.
##
## Returns the value (string).
def parse_field_until_sep( input, sep: )
  value = ""
  logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
  ## consume simple value
  ##   until we hit "," or "\n" or "\r"
  while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
    if input.peek == BACKSLASH
      value << parse_escape( input, sep: sep )
    else
      logger.debug "  add char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      value << input.getc   ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
    end
  end
  ##  note: only strip **trailing** spaces (space and tab only)
  ##    do NOT strip newlines etc. might have been added via escape! e.g. \\\n
  ##  note: MUST anchor with \z (end of string) and NOT with $ (end of line);
  ##    $ matches before an embedded (escaped) newline too - spaces in front of that newline
  ##    would get stripped and the real trailing spaces would be kept
  value.sub( /[ \t]+\z/, '' )
end
parse_lines( input, sep:, &block ) click to toggle source
# File lib/csvreader/parser_std.rb, line 474
## Parse the input line-by-line and pass every record on to the block.
##
##   - comment lines get skipped; in hashtag mode only % starts a comment,
##       otherwise either # or % (but NOT both; the first one used "wins")
##   - blank lines get skipped
##   - before the first record (and NOT in hashtag mode):
##       directive lines (starting with DIRECTIVE) get skipped
##       or a meta block (--- at the very start of a line) gets parsed via parse_meta
##       and stored in the meta attribute;
##       can either use directives or a meta block; first one "wins"
##   - everything else gets parsed as a record via parse_record
##
##  note: the block is required (gets called for every record).
##  note: hashtag mode is off for false AND nil (same as in parse_field);
##        before a nil setting did disable the directive / meta block handling.
def parse_lines( input, sep:, &block )
  ## note: reset (optional) meta data block
  @meta  = nil     ## no meta data block   (use empty hash {} - why? why not?)

  ## note: track number of records
  ##   used for meta block (can only start before any records e.g. if record_num == 0)
  record_num = 0

  hashtag = config[:hashtag]

  ## hashtag mode: always use % for comments
  ##   todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
  ## otherwise: can either use '#' or '%' but NOT both; first one "wins" (nil = not yet known)
  comment = hashtag ? COMMENT_PERCENT : nil

  has_seen_directive   = false
  has_seen_frontmatter = false   ## - rename to  has_seen_dash (---) - why? why not???
  ## note: can either use directives (@) or frontmatter (---) block; first one "wins"

  loop do
    break if input.eof?

    skipped_spaces = skip_spaces( input )

    if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      comment = input.getc  ## first comment line (determines/fixes "allowed" comment-style)
      skip_until_eol( input )
      skip_newline( input )
    elsif comment && input.peek == comment        ## (another) comment line
      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif (c=input.peek; c==LF || c==CR || input.eof?)
      logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_frontmatter && input.peek==DIRECTIVE
      ## note: "skip" directives for now
      has_seen_directive = true
      logger.debug "skip directive"  if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_directive && !has_seen_frontmatter &&
          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
      ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
      has_seen_frontmatter = true
      logger.debug "start meta block"  if logger.debug?
      ## note: meta gets stored as object attribute (state/state/state!!)
      ##   use meta attribute to get meta data after reading first record
      @meta = parse_meta( input )   ## note: assumes a hash gets returned
      logger.debug "  meta: >#{meta.inspect}<"  if logger.debug?
    else
      logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?

      record = parse_record( input, sep: sep )
      record_num +=1

      ## note: requires block - enforce? how? why? why not?
      block.call( record )   ## yield( record )
    end
  end  # loop
end
parse_meta( input ) click to toggle source
# File lib/csvreader/parser_std.rb, line 380
## Parse a meta data block, that is, everything between an opening and a closing "---" line,
##   as yaml.
##
##  input - buffer positioned on the opening "---"
##            todo/check: check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
##
## Returns whatever the yaml loader returns; nil (empty block) gets turned into an empty hash {}.
##   todo: check edge cases - always should return a hash or nil
##     what to do with just integer, string or array etc. ???
## Raises ParseError if the input ends before the closing "---".
def parse_meta( input )
  3.times { input.getc }   ## eat-up (document header) - skip "---"

  ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
  ##    use match() or something to always match regexp
  skip_spaces( input )   # eat-up optional whitespaces in header line
  skip_newline( input )

  ## note: start buffer with yaml header line - why?
  ##   ::YAML.load("")        return false !!!
  ##   ::YAML.load("---\n")   returns nil -- yes!!  if we get nil return empty hash {}
  yaml = "---\n"

  at_line_start = true   ## the end marker (---) only counts at the start of a line

  ## eat-up until we hit "---" again
  loop do
    if input.eof?
      raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
    end

    c = input.peek
    if c==LF || c==CR
      while (c=input.peek; c==LF || c==CR )   ## add newlines
        yaml << input.getc
      end
      at_line_start = true
    elsif at_line_start && input.peekn(4) =~ /^---[\n\r \t]?$/   ## check if meta block end marker?
      ## todo/fix/check: allow (ignore) spaces after ---  why? why not?
      3.times { input.getc }   ## eat-up (document footer) - skip "---"
      skip_spaces( input )   # eat-up optional whitespaces in footer line
      skip_newline( input )
      break
    else
      yaml << input.getc
      at_line_start = false
    end
  end

  ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
  ## NOTE(review): YAML.load gets called on data read from the input;
  ##   consider a safe_load variant if the input is untrusted - TODO confirm
  data = ::YAML.load( yaml )

  data.nil? ? {} : data     ## note: if nil return empty hash e.g. {}
end
parse_quote( input, sep:, opening_quote:, closing_quote:) click to toggle source
# File lib/csvreader/parser_std.rb, line 168
## Parse a quoted value starting at the opening quote.
##
##  input         - buffer positioned on the opening quote
##  sep           - field separator (passed on to parse_escape)
##  opening_quote - e.g. " or ' or «
##  closing_quote - e.g. " or ' or »
##
##   - backslash escapes get handled by parse_escape
##   - a doubled up quote (e.g. "" or '') adds one quote to the value;
##       note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
##   - if the input ends before the closing quote, the value read so far gets returned
##
## Returns the value (string) without the enclosing quotes.
## Raises ParseError if the input is not positioned on the opening quote.
def parse_quote( input, sep:, opening_quote:, closing_quote:)
  unless input.peek == opening_quote
    ## note: message fixed - it is the OPENING quote that is expected here (not the closing quote)
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - OPENING QUOTE (#{opening_quote}) expected in parse_quote!!!!" )
  end

  input.getc  ## eat-up opening quote

  value = ""
  loop do
    while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
      value << input.getc   ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
    end

    if input.eof?
      break
    elsif input.peek == BACKSLASH
      value << parse_escape( input, sep: sep )
    else   ## assume input.peek == quote
      input.getc ## eat-up quote
      if opening_quote == closing_quote && input.peek == closing_quote
        ## doubled up quote?
        value << input.getc   ## add one quote and continue!!!!
      else
        break
      end
    end
  end
  value
end
parse_record( input, sep: ) click to toggle source
# File lib/csvreader/parser_std.rb, line 351
## Parse one record (line), that is, all fields up to (and including) the newline
##   or up to the end of input.
##
##   if config[:space] is set, the configured char(s) get converted to a space
##     in every string value (e.g. Man_Utd => Man Utd).
##
## Returns the values as an array.
## Raises ParseError if a field is followed by anything else than
##   the separator, a newline or the end of input.
def parse_record( input, sep: )
  space  = config[:space]
  values = []

  loop do
    field = parse_field( input, sep: sep )
    field = field.tr( space, ' ' )   if space && field.is_a?( String )

    logger.debug "value: »#{field}«"  if logger.debug?
    values << field

    break  if input.eof?

    c = input.peek
    if c==LF || c==CR
      skip_newline( input )
      break
    end

    unless input.peek == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
    end

    input.getc   ## eat-up FS(,)
  end

  values
end
skip_newline( input ) click to toggle source
# File lib/csvreader/parser_std.rb, line 432
## Skip one newline, that is, CR LF or LF or CR (if present at the current position).
##   note: singular (strict) version - skips one newline only.
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc ## eat-up
    input.getc  if input.peek == LF
  when LF
    input.getc ## eat-up
  end
end
skip_spaces( input ) click to toggle source
# File lib/csvreader/parser_std.rb, line 457
## Skip all spaces (" ") and tabs (\t) at the current position.
##
## Returns the number of chars skipped (e.g. 0,1,2,etc.).
def skip_spaces( input )
  return 0   if input.eof?

  skipped = 0
  until (c = input.peek) != SPACE && c != TAB
    input.getc   ## note: always eat-up all spaces (" ") and tabs (\t)
    skipped += 1
  end
  skipped
end
skip_until_eol( input ) click to toggle source
# File lib/csvreader/parser_std.rb, line 448
## Skip everything up to (but NOT including) the next newline (lf or cr)
##   or up to the end of input.
def skip_until_eol( input )
  return if input.eof?

  until (c = input.peek) == LF || c == CR || input.eof?
    input.getc    ## eat-up all until end of line
  end
end