class Rarff::Relation

Attributes

attributes[RW]
instances[R]
name[RW]

Public Class Methods

new(name='') click to toggle source
# File lib/rarff.rb, line 129
# Sets up an empty relation: no attributes and no instances yet.
#
# name:: the relation name; defaults to the empty string.
def initialize(name='')
  @attributes = []
  @name = name
  @instances = []
end

Public Instance Methods

create_attributes(attr_parse=false) click to toggle source
# File lib/rarff.rb, line 176
# Derives @attributes from the data held in @instances.
#
# Every column is typed from the first non-nil value found for it, scanning
# the rows top to bottom. Generated attributes are named "Attr<column index>".
#
# attr_parse:: when false (the default) the Ruby class of the value decides
#              the type (Numeric, String, true/false). When true the values
#              are treated as text: a String that looks like a number makes
#              the column NUMERIC; any other String (except the MISSING
#              marker) makes it STRING, but only if no attribute is defined
#              for that column yet. Non-String values are left to the
#              fallback below.
#
# Columns of the first row that still have no attribute afterwards default
# to NUMERIC.
#
# Raises Exception when there are no instances or the first row is empty,
# and (with attr_parse false) when a value maps to no ARFF type. These are
# plain Exception instances, so a bare `rescue` does not catch them.
def create_attributes(attr_parse=false)
  if @instances.nil? or @instances.empty? or @instances[0].empty?
    raise Exception, "Not enough data to create ARFF attributes"
  end

  # Remembers which columns have already been decided. A column is only
  # undecided after the first row if that row held nil for it.
  attributes_defined = {}
  @instances.each do |row|
    row.each_with_index do |col, j|
      next if attributes_defined[j] or col.nil?

      attributes_defined[j] = true # whatever happens, this column is now decided

      if attr_parse
        # Only Strings can be pattern-matched; sparse rows are padded with
        # Integer 0 and must not reach =~ (Object#=~ no longer exists in
        # current Ruby versions).
        if col.kind_of?(String)
          if col =~ /^\-?\d+\.?\d*$/
            @attributes[j] = Attribute.new("Attr#{j}", ATTRIBUTE_NUMERIC)
          elsif col != MISSING
            # Real text: keep an already declared attribute, otherwise type
            # the column as STRING instead of letting it fall through to the
            # NUMERIC default.
            @attributes[j] ||= Attribute.new("Attr#{j}", ATTRIBUTE_STRING)
          end
        end
        next # this column is finished
      end

      # No parsing - take the value as it is.
      type =
        case col
        when Numeric     then ATTRIBUTE_NUMERIC
        when String      then ATTRIBUTE_STRING
        when true, false then ATTRIBUTE_BOOLEAN
        else
          raise Exception, "Could not parse attribute to ARFF data type: #{col.inspect}"
        end
      @attributes[j] = Attribute.new("Attr#{j}", type)
    end
  end

  # Make sure every column of the first row has a definition, because
  # otherwise needless errors are thrown later on.
  @instances[0].each_index do |i|
    @attributes[i] ||= Attribute.new("Attr#{i}", ATTRIBUTE_NUMERIC)
  end
end
expand_sparse(str) click to toggle source
# File lib/rarff.rb, line 244
# Expands one sparse ARFF data row into a dense array.
#
# str:: the row as text, a comma separated list of "<index> <value>" pairs,
#       optionally still wrapped in curly braces.
#
# Returns an array with one slot per entry of @attributes. Slots that are
# not mentioned in str hold the Integer 0; mentioned slots hold the value
# text exactly as written (a String, or nil when a pair has no value).
def expand_sparse(str)
  arr = Array.new(@attributes.size, 0)
  inner = str.gsub(/^\s*\{(.*)\}\s*$/, "\\1")
  inner.split(/\s*\,\s*/).each do |pr|
    # Strip first so padding around a pair cannot shift the index, and split
    # only once so a value containing blanks stays in one piece.
    index, value = pr.strip.split(/\s+/, 2)
    next if index.nil? # blank pair, e.g. "{ }"

    arr[index.to_i] = value
  end
  arr
end
instances=(instances, parse=false) click to toggle source

Assigns instances to the internal array. parse: whether to parse strings into numerics.

# File lib/rarff.rb, line 169
# Replaces the stored rows and rebuilds the attribute definitions from them.
#
# instances:: the new rows; stored as given (no copy is made).
# parse::     passed straight through to create_attributes.
#
# Ruby's assignment syntax (rel.instances = rows) can supply only the first
# argument, so a non-default parse has to be passed with send/public_send.
def instances=(instances, parse=false)
  @instances = instances
  create_attributes(parse)
end
parse(str) click to toggle source
# File lib/rarff.rb, line 136
# Fills this relation from ARFF text, processing it one line at a time.
#
# str:: the ARFF content; it is split on "\n".
#
# Header lines set @name and append to @attributes; every data row is
# appended to @instances and followed by a create_attributes(true) call.
# Lines that appear before the data marker and match nothing are ignored.
#
# NOTE(review): my_scan is a String helper defined outside this listing. The
# `next if` chain presumably relies on it returning a truthy value when the
# pattern matched and yielding the captures to the block - confirm against
# its definition.
def parse(str)
  in_data_section = false

  # TODO: Doesn't handle commas in quoted attributes.
  str.split("\n").each { |line|
    # Blank lines and comment lines carry no information.
    next if line =~ /^\s*$/
    next if line =~ /^\s*#{COMMENT_MARKER}/
    # Relation header: the rest of the line becomes the relation name.
    next if line.my_scan(/^\s*#{RELATION_MARKER}\s*(.*)\s*$/i) { |name| @name = name }
    # Attribute header: first token is the name, the remainder is the type.
    next if line.my_scan(/^\s*#{ATTRIBUTE_MARKER}\s*([^\s]*)\s+(.*)\s*$/i) { |name, type|
      @attributes.push(Attribute.new(name, type))
    }
    # The data marker switches the parser into data mode.
    next if line.my_scan(/^\s*#{DATA_MARKER}/i) { in_data_section = true }
    next if in_data_section == false  ## Below is data section handling
    #      next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^\s*#{SPARSE_ARFF_BEGIN}(.*)#{SPARSE_ARFF_END}\s*$/) { |data|
    # Sparse row: the line, with surrounding whitespace removed, is wrapped
    # in the sparse begin/end delimiters.
    next if line.gsub(/^\s*(.*)\s*$/, "\\1").my_scan(/^#{ESC_SPARSE_ARFF_BEGIN}(.*)#{ESC_SPARSE_ARFF_END}$/) { |data|
      # Sparse ARFF
      # TODO: Factor duplication with non-sparse data below
      @instances << expand_sparse(data.first)
      create_attributes(true)
    }
    # Dense row: comma separated fields.
    next if line.my_scan(/^\s*(.*)\s*$/) { |data|
      @instances << data.first.split(/,\s*/).map { |field|
        # Remove outer single quotes on strings, if any ('foo bar' --> foo bar)
        field.gsub(/^\s*\'(.*)\'\s*$/, "\\1")
      }
      create_attributes(true)
    }
  }
end
set_string_attributes_to_nominal(column_indices = nil) click to toggle source

Converts all String-type attributes into nominal attributes. Nominal attributes are more useful in WEKA, because more techniques can handle them than can handle strings.

column_indices is an optional argument specifying the columns that are to be set to nominal (0-based indexes). If nil (the default), all columns are included.

# File lib/rarff.rb, line 225
# Turns STRING attributes into nominal ones whose label set is the distinct
# values found in that column of @instances, in order of first appearance.
#
# column_indices:: optional collection of 0-based column indexes to convert;
#                  nil (the default) converts every STRING column.
#
# nil cells are missing values and are not recorded as labels. A column that
# holds no non-nil value keeps its STRING type.
#
# Returns the Hash of collected labels, keyed by column index.
def set_string_attributes_to_nominal(column_indices = nil)
  nominals = {}
  # The data is stored row by row, so the columns have to be gathered while
  # walking the rows.
  @instances.each do |row|
    row.each_with_index do |string, col_index|
      next if string.nil? # missing value, not a label
      next unless @attributes[col_index].type == ATTRIBUTE_STRING
      next unless column_indices.nil? or column_indices.include?(col_index)

      (nominals[col_index] ||= {})[string] = true
    end
  end

  nominals.each do |index, strings|
    @attributes[index].type = "{#{strings.keys.join(',')}}"
  end
end
to_arff(sparse=false) click to toggle source
# File lib/rarff.rb, line 254
# Renders the relation as ARFF text: the relation line, one line per
# attribute (via the attribute's to_s), the data marker and one line per
# instance.
#
# sparse:: when false (the default) each row lists every value, with nil
#          written as MISSING. When true each row is written in sparse form,
#          "{<index> <value>, ...}", leaving out nil cells and numeric cells
#          equal to 0. The braces are what parse looks for to recognise a
#          sparse row.
#
# Values in STRING columns that contain a comma, whitespace or '+' are
# wrapped in single quotes; values in DATE columns are wrapped in double
# quotes. Cells are converted with to_s for this, so non-String cells are
# accepted.
#
# Returns the ARFF text as a String (no trailing newline).
def to_arff(sparse=false)
  rows = @instances.map do |inst|
    cells = inst.each_with_index.map do |col, i|
      type = @attributes[i].type

      # First pass - quote strings with spaces, and dates
      # TODO: Doesn't handle cases in which strings already contain
      # quotes or are already quoted.
      unless col.nil?
        if type =~ /^#{ATTRIBUTE_STRING}$/i
          col = "'#{col}'" if col.to_s =~ /[,\s+]/
        elsif type =~ /^#{ATTRIBUTE_DATE}/i  ## Hack comparison. Ugh.
          col = "\"#{col}\""
        end
      end

      # Do the final output
      if sparse
        omitted = col.nil? || (type =~ /^#{ATTRIBUTE_NUMERIC}$/i && col == 0)
        omitted ? nil : "#{i} #{col}"
      else
        col.nil? ? MISSING : col
      end
    end

    sparse ? "{#{cells.compact.join(', ')}}" : cells.join(", ")
  end

  RELATION_MARKER + " #{@name}\n" +
    @attributes.join("\n") +
    "\n" +
    DATA_MARKER + "\n" +
    rows.join("\n")
end
to_s() click to toggle source
# File lib/rarff.rb, line 301
# String form of the relation: the same text to_arff produces when called
# without arguments.
def to_s; to_arff; end