class Rover::DataFrame

Public Class Methods

new(*args) click to toggle source
# File lib/rover/data_frame.rb, line 3
# Builds a data frame from one optional positional data argument plus an
# optional trailing options hash (validated by process_args; :types maps
# column names to the type passed on to the vector constructors).
#
# Accepted data:
# - DataFrame: its vectors are reused as-is (types is not applied)
# - Hash: column name => values; values that do not respond to to_a are
#   treated as scalars and expanded to the size of the first vector (or 1)
# - Array of hashes: one hash per row; keys missing from a row become nil
# - ActiveRecord relation or model class (only when ActiveRecord is defined)
#
# Raises ArgumentError for any other data, for array elements that are not
# hashes, and when the resulting columns differ in size. Every column name
# is also passed to check_key.
def initialize(*args)
  data, options = process_args(args)

  @vectors = {}
  types = options[:types] || {}

  if data.is_a?(DataFrame)
    data.vectors.each { |name, vector| @vectors[name] = vector }
  elsif data.is_a?(Hash)
    data.to_h.each do |name, value|
      @vectors[name] = value.respond_to?(:to_a) ? Vector.new(value, type: types[name]) : value
    end

    # expand scalars to the size of the first real vector (1 if there is none)
    target_size = @vectors.values.find { |value| value.is_a?(Vector) }&.size || 1
    @vectors.each_key do |name|
      @vectors[name] = to_vector(@vectors[name], size: target_size, type: types[name])
    end
  elsif data.is_a?(Array)
    raise ArgumentError, "Array elements must be hashes" unless data.all? { |row| row.is_a?(Hash) }

    # pivot the rows into one array per column, keeping first-seen key order
    names = data.flat_map(&:keys).uniq
    columns = names.map { |name| [name, data.map { |row| row[name] }] }.to_h
    columns.each do |name, values|
      @vectors[name] = to_vector(values, type: types[name])
    end
  elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
    result = data.connection.select_all(data.all.to_sql)
    result.columns.each_with_index do |name, index|
      @vectors[name] = to_vector(result.rows.map { |row| row[index] }, type: types[name])
    end
  else
    raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
  end

  # check keys
  @vectors.each_key { |name| check_key(name) }

  # check sizes
  sizes = @vectors.values.map(&:size).uniq
  raise ArgumentError, "Different sizes: #{sizes}" if sizes.size > 1
end

Public Instance Methods

+(other) click to toggle source
# File lib/rover/data_frame.rb, line 319
# Returns the result of concatenating other onto a dup of this data frame
# (concat is called on the copy, not on the receiver).
def +(other)
  combined = dup
  combined.concat(other)
end
==(other) click to toggle source

don't check types

# File lib/rover/data_frame.rb, line 360
# Two data frames are equal when they have the same number of rows, the same
# keys in the same order, and every pair of columns compares equal through
# to_numo. Column types are not checked.
#
# Returns false when other is not a data frame (e.g. nil) instead of raising
# NoMethodError while probing it for size/keys.
def ==(other)
  other.is_a?(DataFrame) &&
    size == other.size &&
    keys == other.keys &&
    keys.all? { |k| self[k].to_numo == other[k].to_numo }
end
[](where) click to toggle source
# File lib/rover/data_frame.rb, line 64
# Selects rows or columns depending on the argument:
# - Vector backed by Numo::Bit, Numeric, Range, or Array of integers:
#   indexes every column with the argument and returns a new data frame
# - any other Array: treats the elements as column names (each validated
#   with check_column) and returns a new data frame with those columns
# - anything else: returns the column stored under that key
def [](where)
  bit_mask = where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)
  integer_list = where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) }

  if bit_mask || where.is_a?(Numeric) || where.is_a?(Range) || integer_list
    DataFrame.new(@vectors.map { |name, vector| [name, vector[where]] }.to_h)
  elsif where.is_a?(Array)
    # multiple columns
    where.each_with_object(DataFrame.new) do |name, subset|
      check_column(name, true)
      subset[name] = @vectors[name]
    end
  else
    # single column
    @vectors[where]
  end
end
[]=(k, v) click to toggle source
# File lib/rover/data_frame.rb, line 102
# Adds or replaces the column stored under k.
# k is validated with check_key; v is converted with to_vector, passing the
# current row count as the size. Raises ArgumentError when the frame already
# has columns and the converted column's size differs from the frame's size.
def []=(k, v)
  check_key(k)
  column = to_vector(v, size: size)
  if @vectors.any? && column.size != size
    raise ArgumentError, "Size mismatch: expected #{size}, got #{column.size}"
  end
  @vectors[k] = column
end
any?() click to toggle source

should this check for columns as well?

# File lib/rover/data_frame.rb, line 116
# True when the data frame has at least one row (columns are not considered).
def any?
  size.positive?
end
clear() click to toggle source
# File lib/rover/data_frame.rb, line 125
# Removes every column. Returns the (now empty) internal column hash.
def clear
  @vectors.clear
  @vectors
end
concat(other) click to toggle source

In-place, like Array#concat. TODO: make more performant.

# File lib/rover/data_frame.rb, line 325
# Appends the rows of other to this data frame in place and returns self.
# Columns missing from other are padded with nil for its rows; columns that
# exist only in other are added with nil for the rows already present.
# Every affected column is rebuilt with Vector.new from plain arrays.
# Raises ArgumentError unless other is a DataFrame.
def concat(other)
  raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

  # remember the row count before any column grows
  original_size = size
  vectors.each do |name, vector|
    incoming = other[name]
    appended = incoming ? incoming.to_a : [nil] * other.size
    @vectors[name] = Vector.new(vector.to_a + appended)
  end

  new_names = other.vector_names - vector_names
  new_names.each do |name|
    @vectors[name] = Vector.new([nil] * original_size + other[name].to_a)
  end
  self
end
count()
Alias for: size
delete(key) click to toggle source
# File lib/rover/data_frame.rb, line 139
# Removes the column stored under key from the internal column hash and
# returns whatever Hash#delete returns for it.
def delete(key)
  @vectors.delete(key)
end
dup() click to toggle source
# File lib/rover/data_frame.rb, line 311
# Returns a new data frame holding the same columns. Each vector is handed
# to []= on the copy as-is; it is not duplicated here.
def dup
  @vectors.each_with_object(DataFrame.new) do |(name, vector), copy|
    copy[name] = vector
  end
end
each_row() { |row| ... } click to toggle source
# File lib/rover/data_frame.rb, line 85
# Yields one hash per row, mapping each column name to that row's value.
# Returns an enumerator when no block is given.
def each_row
  return enum_for(:each_row) unless block_given?

  size.times do |index|
    row = @vectors.each_with_object({}) do |(name, vector), cells|
      cells[name] = vector[index]
    end
    yield row
  end
end
empty?() click to toggle source

should this check for columns as well?

# File lib/rover/data_frame.rb, line 121
# True when the data frame has no rows (columns are not considered).
def empty?
  size.zero?
end
except(*keys) click to toggle source
# File lib/rover/data_frame.rb, line 143
# Returns a dup of this data frame with the given columns removed via except!.
def except(*keys)
  copy = dup
  copy.except!(*keys)
end
except!(*keys) click to toggle source
# File lib/rover/data_frame.rb, line 147
# Deletes each of the given columns from this data frame and returns self.
def except!(*keys)
  keys.each { |key| delete(key) }
  self
end
first(n = 1) click to toggle source
# File lib/rover/data_frame.rb, line 166
# Returns a new data frame built from first(n) of every column
# (n defaults to 1).
def first(n = 1)
  DataFrame.new(@vectors.map { |name, vector| [name, vector.first(n)] }.to_h)
end
group(*columns) click to toggle source
# File lib/rover/data_frame.rb, line 300
# Builds a Group for this data frame over the given columns; nested arrays
# of column names are flattened first.
def group(*columns)
  names = columns.flatten
  Group.new(self, names)
end
head(n = 5) click to toggle source
# File lib/rover/data_frame.rb, line 158
# Same as first, but n defaults to 5.
def head(n = 5)
  first(n)
end
include?(key) click to toggle source
# File lib/rover/data_frame.rb, line 154
# True when a column is stored under key.
def include?(key)
  @vectors.key?(key)
end
inner_join(other, on: nil) click to toggle source

see join for options

# File lib/rover/data_frame.rb, line 350
# Delegates to join with how: "inner"; on is passed through unchanged.
def inner_join(other, on: nil)
  join(other, on: on, how: "inner")
end
inspect() click to toggle source

TODO handle long text better

# File lib/rover/data_frame.rb, line 250
def inspect
  return "#<Rover::DataFrame>" if keys.empty?

  lines = []
  line_start = 0
  spaces = 2

  summarize = size >= 30

  @vectors.each do |k, v|
    v = summarize ? v.first(5).to_a + ["..."] + v.last(5).to_a : v.to_a
    width = ([k] + v).map(&:to_s).map(&:size).max
    width = 3 if width < 3

    if lines.empty? || lines[-2].map { |l| l.size + spaces }.sum + width > 120
      line_start = lines.size
      lines << []
      v.size.times do |i|
        lines << []
      end
      lines << []
    end

    lines[line_start] << "%#{width}s" % k.to_s
    v.each_with_index do |v2, i|
      lines[line_start + 1 + i] << "%#{width}s" % v2.to_s
    end
  end

  lines.pop
  lines.map { |l| l.join(" " * spaces) }.join("\n")
end
Also aliased as: to_s
keys() click to toggle source
# File lib/rover/data_frame.rb, line 133
# Returns the column names, i.e. the keys of the internal column hash.
def keys
  @vectors.keys
end
Also aliased as: names, vector_names
last(n = 1) click to toggle source
# File lib/rover/data_frame.rb, line 174
# Returns a new data frame built from last(n) of every column
# (n defaults to 1).
def last(n = 1)
  DataFrame.new(@vectors.map { |name, vector| [name, vector.last(n)] }.to_h)
end
left_join(other, on: nil) click to toggle source

see join for options

# File lib/rover/data_frame.rb, line 355
# Delegates to join with how: "left"; on is passed through unchanged.
def left_join(other, on: nil)
  join(other, on: on, how: "left")
end
length()
Alias for: size
merge(other) click to toggle source
# File lib/rover/data_frame.rb, line 338
# Returns a dup of this data frame with the columns of other merged in
# via merge!.
def merge(other)
  merged = dup
  merged.merge!(other)
end
merge!(other) click to toggle source
# File lib/rover/data_frame.rb, line 342
# Assigns every column of other onto this data frame through []=, so
# columns with the same name are replaced. Returns self.
def merge!(other)
  other.vectors.each { |name, vector| self[name] = vector }
  self
end
names()
Alias for: keys
one_hot(drop: false) click to toggle source

TODO raise error when collision

# File lib/rover/data_frame.rb, line 209
# Returns a new data frame in which every column backed by Numo::RObject is
# replaced by the result of the vector's own one_hot (prefixed with
# "<column>_"); all other columns are carried over unchanged.
# drop is forwarded to Vector#one_hot.
#
# TODO raise error when collision
def one_hot(drop: false)
  encoded = DataFrame.new
  vectors.each do |name, vector|
    unless vector.to_numo.is_a?(Numo::RObject)
      encoded[name] = vector
      next
    end

    encoded.merge!(vector.one_hot(drop: drop, prefix: "#{name}_"))
  end
  encoded
rescue ArgumentError => e
  # better error message
  raise ArgumentError, "All elements must be numeric or strings" if e.message == "All elements must be strings"

  raise e
end
plot(x = nil, y = nil, type: nil) click to toggle source
# File lib/rover/data_frame.rb, line 366
# Builds a Vega-Lite chart for two columns (requires the vega gem).
#
# x, y: column names; may be omitted only when the frame has exactly two
#       columns, in which case the first and second key are used
# type: "line", "area", "pie", "column", "bar" or "scatter"; when omitted it
#       is "scatter" for two numeric columns and "column" for an :object x
#       column with a numeric y column
#
# Raises ArgumentError when columns must be specified or the type is not one
# of the above, and RuntimeError when the type cannot be determined.
def plot(x = nil, y = nil, type: nil)
  require "vega"

  raise ArgumentError, "Must specify columns" if keys.size != 2 && !(x && y)
  x ||= keys[0]
  y ||= keys[1]

  unless type
    type =
      if self[x].numeric? && self[y].numeric?
        "scatter"
      elsif types[x] == :object && self[y].numeric?
        "column"
      else
        raise "Cannot determine type. Use the type option."
      end
  end

  frame = self[[x, y]]
  measure = {field: y, type: "quantitative"}
  axis = {labelFontSize: 12}

  case type
  when "line", "area"
    x_kind =
      if frame[x].numeric?
        "quantitative"
      elsif frame[x].all? { |v| v.is_a?(Date) || v.is_a?(Time) }
        "temporal"
      else
        "nominal"
      end

    # temporal axes are rendered with a UTC scale
    x_scale = x_kind == "temporal" ? {type: "utc"} : {}

    Vega.lite
      .data(frame)
      .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
      .encoding(
        x: {field: x, type: x_kind, scale: x_scale},
        y: measure
      )
      .config(axis: axis)
  when "pie"
    Vega.lite
      .data(frame)
      .mark(type: "arc", tooltip: true)
      .encoding(
        color: {field: x, type: "nominal", sort: "none", axis: {title: nil}, legend: {labelFontSize: 12}},
        theta: measure
      )
      .view(stroke: nil)
  when "column"
    Vega.lite
      .data(frame)
      .mark(type: "bar", tooltip: true)
      .encoding(
        # TODO determine label angle
        x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
        y: measure
      )
      .config(axis: axis)
  when "bar"
    # same as "column" with the axes swapped
    Vega.lite
      .data(frame)
      .mark(type: "bar", tooltip: true)
      .encoding(
        # TODO determine label angle
        y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
        x: measure
      )
      .config(axis: axis)
  when "scatter"
    Vega.lite
      .data(frame)
      .mark(type: "circle", tooltip: true)
      .encoding(
        x: {field: x, type: "quantitative", scale: {zero: false}},
        y: {field: y, type: "quantitative", scale: {zero: false}},
        size: {value: 60}
      )
      .config(axis: axis)
  else
    raise ArgumentError, "Invalid type: #{type}"
  end
end
sample(*args, **kwargs) click to toggle source
# File lib/rover/data_frame.rb, line 182
# Picks random row indexes with Array#sample (all arguments are forwarded to
# it) and returns the result of indexing this data frame with them.
def sample(*args, **kwargs)
  # TODO make more efficient
  row_indexes = size.times.to_a
  self[row_indexes.sample(*args, **kwargs)]
end
shape() click to toggle source
# File lib/rover/data_frame.rb, line 129
# Returns [number of rows, number of columns].
def shape
  row_count = size
  column_count = @vectors.size
  [row_count, column_count]
end
size() click to toggle source
# File lib/rover/data_frame.rb, line 109
# Number of rows: the size of the first column, or 0 when there are no
# columns.
def size
  first_vector = @vectors.each_value.first
  first_vector&.size || 0
end
Also aliased as: length, count
sort_by(&block) click to toggle source
# File lib/rover/data_frame.rb, line 296
# Returns a dup of this data frame sorted via sort_by! with the given block.
def sort_by(&block)
  sorted = dup
  sorted.sort_by!(&block)
end
sort_by!() { |row| ... } click to toggle source
# File lib/rover/data_frame.rb, line 284
# Reorders the rows in place by the value the block returns for each row
# (the block receives a hash of column name => value). Returns self.
def sort_by!
  order = size.times.sort_by do |index|
    yield @vectors.each_with_object({}) { |(name, vector), row| row[name] = vector[index] }
  end

  # rebuild every column in the sorted row order
  @vectors.each { |name, vector| self[name] = vector.to_numo.at(order) }
  self
end
tail(n = 5) click to toggle source
# File lib/rover/data_frame.rb, line 162
# Same as last, but n defaults to 5.
def tail(n = 5)
  last(n)
end
to_a() click to toggle source
# File lib/rover/data_frame.rb, line 188
# Returns an array with one hash per row, as yielded by each_row.
def to_a
  each_row.to_a
end
to_csv() click to toggle source
# File lib/rover/data_frame.rb, line 227
# Serializes the data frame to a CSV string: a header line with the column
# names followed by one line per row. The csv library is loaded on demand.
def to_csv
  require "csv"
  columns = vectors.values.map(&:to_numo)
  CSV.generate do |csv|
    csv << keys
    size.times do |row|
      csv << columns.map { |column| column[row] }
    end
  end
end
to_h() click to toggle source
# File lib/rover/data_frame.rb, line 196
# Returns a hash mapping each column name to that column's values as an array.
def to_h
  @vectors.map { |name, vector| [name, vector.to_a] }.to_h
end
to_html() click to toggle source

for IRuby

# File lib/rover/data_frame.rb, line 239
# Renders the data frame as an HTML table for IRuby (loaded on demand).
# Frames with more than 7 rows are rendered from their leading and trailing
# rows only, with maxrows: 7.
def to_html
  require "iruby"
  return IRuby::HTML.table(to_h) unless size > 7

  # hand over more rows than maxrows so that maxrows is applied
  preview = self[0..4] + self[-4..-1]
  IRuby::HTML.table(preview.to_h, maxrows: 7)
end
to_numo() click to toggle source
# File lib/rover/data_frame.rb, line 204
# Stacks the to_numo form of every column side by side with
# Numo::NArray.column_stack and returns the result.
def to_numo
  columns = vectors.values.map(&:to_numo)
  Numo::NArray.column_stack(columns)
end
to_s()
Alias for: inspect
types() click to toggle source
# File lib/rover/data_frame.rb, line 98
# Returns a hash mapping each column name to its vector's type.
def types
  @vectors.each_with_object({}) do |(name, vector), result|
    result[name] = vector.type
  end
end
vector_names()
Alias for: keys
vectors() click to toggle source

dup to prevent direct modification of keys

# File lib/rover/data_frame.rb, line 94
# Returns a shallow copy of the internal column hash, so that adding or
# removing keys on the result does not affect this data frame.
def vectors
  @vectors.dup
end

Private Instance Methods

check_column(key, did_you_mean = false) click to toggle source

TODO in 0.3.0 always use did_you_mean

# File lib/rover/data_frame.rb, line 512
# Raises when no column is stored under key; returns nil otherwise.
# With did_you_mean set, a KeyError is raised (carrying receiver and key on
# Ruby 2.6 and later); otherwise an ArgumentError.
#
# TODO in 0.3.0 always use did_you_mean
def check_column(key, did_you_mean = false)
  return if include?(key)

  message = "Missing column: #{key}"
  raise ArgumentError, message unless did_you_mean

  # receiver:/key: are only passed on Ruby 2.6+
  if RUBY_VERSION.to_f >= 2.6
    raise KeyError.new(message, receiver: self, key: key)
  else
    raise KeyError.new(message)
  end
end
check_join_keys(df, keys) click to toggle source
# File lib/rover/data_frame.rb, line 504
# Validates join keys against df: raises ArgumentError when keys is empty or
# when df does not include some of them (the message lists the missing keys).
def check_join_keys(df, keys)
  raise ArgumentError, "No keys" if keys.empty?

  missing_keys = keys.reject { |k| df.include?(k) }
  raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" if missing_keys.any?
end
check_key(key) click to toggle source
# File lib/rover/data_frame.rb, line 450
# Raises ArgumentError unless key is a String or a Symbol.
def check_key(key)
  return if key.is_a?(String) || key.is_a?(Symbol)

  raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}"
end
join(other, how:, on: nil) click to toggle source

TODO: make more efficient. TODO: add option to prefix/suffix keys? Supported forms of the on option:

  • on: :key

  • on: [:key1, :key2]

  • on: {key1a: :key1b, key2a: :key2b}

# File lib/rover/data_frame.rb, line 460
# Joins this data frame with other and returns a new data frame.
#
# how: "left" keeps rows of self that have no match in other; any other
#      value drops them (inner join)
# on:  a single key, an array of keys, or a hash mapping keys of self to
#      keys of other; defaults to the keys both frames share
#
# For matched rows, the value from other is used unless it is nil, in which
# case the value from self is used. false is a legitimate value and is kept
# (a plain `||` would silently replace it).
#
# Raises ArgumentError (through check_join_keys) when there are no join keys
# or a join key is missing from either frame.
#
# TODO make more efficient
# TODO add option to prefix/suffix keys?
def join(other, how:, on: nil)
  self_on, other_on =
    if on.is_a?(Hash)
      [on.keys, on.values]
    else
      on ||= keys & other.keys
      on = [on] unless on.is_a?(Array)
      [on, on]
    end

  check_join_keys(self, self_on)
  check_join_keys(other, other_on)

  # index the rows of other by their join key values; unmatched lookups
  # return the shared empty default, which is only ever read
  indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
  indexed.default = []

  left = how == "left"

  all_keys = (keys + other.keys).uniq
  vectors = all_keys.map { |k| [k, []] }.to_h

  each_row do |r|
    matches = indexed[r.values_at(*self_on)]
    if matches.empty?
      all_keys.each { |k| vectors[k] << r[k] } if left
    else
      matches.each do |r2|
        all_keys.each do |k|
          value = r2[k]
          # fall back to self only for nil, never for false
          vectors[k] << (value.nil? ? r[k] : value)
        end
      end
    end
  end

  DataFrame.new(vectors)
end
process_args(args) click to toggle source

Can't use `data = {}` together with keyword arguments, as this causes an unknown keyword error when data is passed as `DataFrame.new({a: …, b: …})`.

At the moment, there doesn't appear to be a way to distinguish between `DataFrame.new({types: …})`, which should set data, and `DataFrame.new(types: …)`, which should set options (see bugs.ruby-lang.org/issues/16891).

There aren't currently any options that should be used without data. If this is ever the case, we should still require data to prevent new options from breaking existing code.

# File lib/rover/data_frame.rb, line 561
# Splits the constructor arguments into [data, options].
# data is the first argument (or {} when it is missing, nil or false).
# options is the trailing Hash when more than one argument is given
# (it is popped off args), otherwise {}.
# Raises ArgumentError when more than one argument remains or when options
# contains a key other than :types.
def process_args(args)
  data = args[0] || {}

  options = {}
  options = args.pop if args.size > 1 && args.last.is_a?(Hash)
  if args.size > 1
    raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0..1)"
  end

  unknown_keywords = options.keys - [:types]
  raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" if unknown_keywords.any?

  [data, options]
end
to_vector(v, size: nil, type: nil) click to toggle source
# File lib/rover/data_frame.rb, line 526
# Converts v into a Vector.
# - A Vector is returned as-is, or converted with Vector#to when a type is
#   given that differs from its current type.
# - When size is given and v does not respond to to_a, v is treated as a
#   scalar and repeated size times (Numo::Int64 for integers, Numo::DFloat
#   for other numerics, Numo::Bit for true/false, a plain array otherwise).
# - Everything else is passed to Vector.new together with type.
def to_vector(v, size: nil, type: nil)
  if v.is_a?(Vector)
    return (type && v.type != type) ? v.to(type) : v
  end

  scalar = size && !v.respond_to?(:to_a)
  if scalar
    v =
      case v
      when Integer
        Numo::Int64.new(size).fill(v)
      when Numeric
        Numo::DFloat.new(size).fill(v)
      else
        if v == true || v == false
          Numo::Bit.new(size).fill(v)
        else
          # TODO make more efficient
          [v] * size
        end
      end
  end

  Vector.new(v, type: type)
end