class Rover::DataFrame
Public Class Methods
# File lib/rover/data_frame.rb, line 3
# Builds a data frame from one optional positional data argument plus an
# optional trailing options hash (both are split apart by process_args).
# The only option read here is :types, a hash of column name => type.
#
# Accepted data:
# - DataFrame: its vectors are reused, cast through to_vector when :types
#   names the column
# - Hash: column name => values; values that do not respond to to_a are
#   treated as scalars and expanded to the size of the first vector (or 1)
# - Array of hashes: one hash per row; keys missing from a row become nil
# - ActiveRecord::Relation or ActiveRecord::Base subclass (only when
#   ActiveRecord is defined): the query result columns become vectors
#
# Raises ArgumentError for any other data, when array elements are not all
# hashes, for keys rejected by check_key, and when the resulting columns
# have different sizes.
def initialize(*args)
  data, options = process_args(args)

  @vectors = {}
  types = options[:types] || {}

  if data.is_a?(DataFrame)
    data.vectors.each do |k, v|
      # honor :types here as well, like every other branch does
      @vectors[k] = to_vector(v, type: types[k])
    end
  elsif data.is_a?(Hash)
    data.to_h.each do |k, v|
      @vectors[k] =
        if v.respond_to?(:to_a)
          Vector.new(v, type: types[k])
        else
          v
        end
    end

    # handle scalars
    size = @vectors.values.find { |v| v.is_a?(Vector) }&.size || 1
    @vectors.each_key do |k|
      @vectors[k] = to_vector(@vectors[k], size: size, type: types[k])
    end
  elsif data.is_a?(Array)
    vectors = {}
    raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }

    # union of all row keys, in first-seen order
    keys = data.flat_map(&:keys).uniq
    keys.each do |k|
      vectors[k] = []
    end
    data.each do |d|
      keys.each do |k|
        vectors[k] << d[k]
      end
    end
    vectors.each do |k, v|
      @vectors[k] = to_vector(v, type: types[k])
    end
  elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
    result = data.connection.select_all(data.all.to_sql)
    result.columns.each_with_index do |k, i|
      @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
    end
  else
    raise ArgumentError, "Cannot cast to data frame: #{data.class.name}"
  end

  # check keys
  @vectors.each_key do |k|
    check_key(k)
  end

  # check sizes
  sizes = @vectors.values.map(&:size).uniq
  if sizes.size > 1
    raise ArgumentError, "Different sizes: #{sizes}"
  end
end
Public Instance Methods
# File lib/rover/data_frame.rb, line 319
# Returns the result of calling concat(other) on a dup of this frame, so the
# receiver itself is not the object being concatenated onto.
def +(other)
  combined = dup
  combined.concat(other)
end
don't check types
# File lib/rover/data_frame.rb, line 360
# Two data frames are equal when they have the same size, the same keys in
# the same order, and equal to_numo data for every column. Column types are
# not compared directly.
#
# Returns false (rather than raising NoMethodError) when +other+ is not a
# DataFrame, e.g. `df == nil`.
def ==(other)
  return false unless other.is_a?(DataFrame)

  size == other.size &&
    keys == other.keys &&
    keys.all? { |k| self[k].to_numo == other[k].to_numo }
end
# File lib/rover/data_frame.rb, line 64
# Indexes the frame.
#
# - A Vector whose to_numo is a Numo::Bit, a Numeric, a Range, or an Array
#   containing only Integers is applied to every vector (v[where]) and the
#   results are wrapped in a new DataFrame.
# - Any other Array is treated as a list of column names; each is validated
#   with check_column(k, true) and copied into a new DataFrame.
# - Anything else is looked up as a single column and the stored vector
#   (or nil when absent) is returned.
def [](where)
  row_selector =
    (where.is_a?(Vector) && where.to_numo.is_a?(Numo::Bit)) ||
    where.is_a?(Numeric) ||
    where.is_a?(Range) ||
    (where.is_a?(Array) && where.all? { |v| v.is_a?(Integer) })

  if row_selector
    selected = @vectors.map { |k, v| [k, v[where]] }.to_h
    return DataFrame.new(selected)
  end

  # single column
  return @vectors[where] unless where.is_a?(Array)

  # multiple columns
  where.each_with_object(DataFrame.new) do |k, df|
    check_column(k, true)
    df[k] = @vectors[k]
  end
end
# File lib/rover/data_frame.rb, line 102
# Sets column +k+ to +v+. The key is validated with check_key and the value
# is converted with to_vector, using the current frame size for scalars.
#
# Raises ArgumentError when the frame already has columns and the new
# vector's size differs from the frame's size.
def []=(k, v)
  check_key(k)

  column = to_vector(v, size: size)
  if @vectors.any? && column.size != size
    raise ArgumentError, "Size mismatch: expected #{size}, got #{column.size}"
  end

  @vectors[k] = column
end
should this check for columns as well?
# File lib/rover/data_frame.rb, line 116
# True when size is greater than zero. Only the row count is consulted,
# not the number of columns.
def any?
  size.positive?
end
# File lib/rover/data_frame.rb, line 125
# Removes every column from the frame by clearing the internal vectors hash.
# Returns that (now empty) hash.
def clear
  columns = @vectors
  columns.clear
end
In-place, like Array#concat. TODO: make more performant.
# File lib/rover/data_frame.rb, line 325
# Appends the rows of +other+ to this frame in place and returns self.
#
# Columns present here but not in +other+ are padded with nil for each of
# other's rows; columns present only in +other+ are added, padded with nil
# for each of this frame's original rows.
#
# Raises ArgumentError unless +other+ is a DataFrame.
def concat(other)
  raise ArgumentError, "Must be a data frame" unless other.is_a?(DataFrame)

  # remember the row count before any column is extended
  original_size = size

  vectors.each do |k, v|
    appended = other[k] ? other[k].to_a : [nil] * other.size
    @vectors[k] = Vector.new(v.to_a + appended)
  end

  new_names = other.vector_names - vector_names
  new_names.each do |k|
    @vectors[k] = Vector.new([nil] * original_size + other[k].to_a)
  end

  self
end
# File lib/rover/data_frame.rb, line 139
# Removes the column stored under +key+ and returns what was stored there,
# or nil when the frame has no such column.
def delete(key)
  removed = @vectors.delete(key)
  removed
end
# File lib/rover/data_frame.rb, line 311
# Returns a new DataFrame holding the same columns. Each column is assigned
# through []= on the new frame, in the order stored in this one.
def dup
  @vectors.each_with_object(DataFrame.new) do |(k, v), copy|
    copy[k] = v
  end
end
# File lib/rover/data_frame.rb, line 85
# Yields each row as a hash of column name => value at that row index.
# Without a block, returns an enumerator over the same rows.
def each_row
  return enum_for(:each_row) unless block_given?

  size.times do |i|
    row = {}
    @vectors.each { |k, v| row[k] = v[i] }
    yield row
  end
end
should this check for columns as well?
# File lib/rover/data_frame.rb, line 121
# True when size is zero. Only the row count is consulted, not the number
# of columns.
def empty?
  size.zero?
end
# File lib/rover/data_frame.rb, line 143
# Non-destructive counterpart of except!: applies except! with the given
# keys to a dup of this frame and returns the result.
def except(*keys)
  copy = dup
  copy.except!(*keys)
end
# File lib/rover/data_frame.rb, line 147
# Deletes each of the given columns from this frame (via delete) and
# returns self.
def except!(*keys)
  keys.each { |key| delete(key) }
  self
end
# File lib/rover/data_frame.rb, line 166
# Returns a new DataFrame built from first(n) of every column.
# +n+ defaults to 1.
def first(n = 1)
  leading = @vectors.transform_values { |v| v.first(n) }
  DataFrame.new(leading)
end
# File lib/rover/data_frame.rb, line 300
# Creates a Group for this frame over the given columns. Columns may be
# passed individually or as (nested) arrays; they are flattened first.
def group(*columns)
  flat_columns = columns.flatten
  Group.new(self, flat_columns)
end
# File lib/rover/data_frame.rb, line 158
# Delegates to first, with +n+ defaulting to 5 instead of 1.
def head(n = 5)
  row_count = n
  first(row_count)
end
# File lib/rover/data_frame.rb, line 154
# True when the frame has a column stored under +key+.
def include?(key)
  @vectors.key?(key)
end
see join for options
# File lib/rover/data_frame.rb, line 350
# Delegates to join with how: "inner"; +on+ is passed through unchanged.
def inner_join(other, on: nil)
  join(other, how: "inner", on: on)
end
TODO handle long text better
# File lib/rover/data_frame.rb, line 250
# Renders the frame as right-aligned text columns.
#
# A frame with no keys renders as "#<Rover::DataFrame>". Frames with 30 or
# more rows show only the first and last five values of each column with a
# "..." row in between. Columns are at least 3 characters wide and are
# separated by two spaces; when adding a column would push the current
# section past 120 characters, a new section is started below a blank line.
def inspect
  return "#<Rover::DataFrame>" if keys.empty?

  rows = []
  section_start = 0
  gap = 2
  truncated = size >= 30

  @vectors.each do |name, vector|
    cells = truncated ? vector.first(5).to_a + ["..."] + vector.last(5).to_a : vector.to_a
    width = [([name] + cells).map { |cell| cell.to_s.size }.max, 3].max

    # rows[-2] is the last data row of the current section (rows[-1] is the
    # blank separator), so its length measures the section's current width
    if rows.empty? || rows[-2].sum { |cell| cell.size + gap } + width > 120
      section_start = rows.size
      # header row, one row per value, then a blank separator row
      (cells.size + 2).times { rows << [] }
    end

    rows[section_start] << "%#{width}s" % name.to_s
    cells.each_with_index do |cell, i|
      rows[section_start + 1 + i] << "%#{width}s" % cell.to_s
    end
  end

  # drop the trailing separator row
  rows.pop
  rows.map { |row| row.join(" " * gap) }.join("\n")
end
# File lib/rover/data_frame.rb, line 133
# Returns the column names as a new array, in stored order.
def keys
  column_names = @vectors.keys
  column_names
end
# File lib/rover/data_frame.rb, line 174
# Returns a new DataFrame built from last(n) of every column.
# +n+ defaults to 1.
def last(n = 1)
  trailing = @vectors.transform_values { |v| v.last(n) }
  DataFrame.new(trailing)
end
see join for options
# File lib/rover/data_frame.rb, line 355
# Delegates to join with how: "left"; +on+ is passed through unchanged.
def left_join(other, on: nil)
  join(other, how: "left", on: on)
end
# File lib/rover/data_frame.rb, line 338
# Non-destructive counterpart of merge!: applies merge!(other) to a dup of
# this frame and returns the result.
def merge(other)
  copy = dup
  copy.merge!(other)
end
# File lib/rover/data_frame.rb, line 342
# Assigns every column of +other+ into this frame through []=, so columns
# with the same key are replaced. Returns self.
def merge!(other)
  other.vectors.each { |k, v| self[k] = v }
  self
end
TODO raise error when collision
# File lib/rover/data_frame.rb, line 209
# Returns a new DataFrame in which every column whose to_numo is a
# Numo::RObject is replaced by the result of the vector's own one_hot
# (called with the given +drop+ and a "<key>_" prefix); all other columns
# are copied as-is.
#
# An ArgumentError with the message "All elements must be strings" is
# re-raised with a message that also mentions numeric values; any other
# ArgumentError is re-raised unchanged.
def one_hot(drop: false)
  vectors.each_with_object(DataFrame.new) do |(k, v), encoded|
    if v.to_numo.is_a?(Numo::RObject)
      encoded.merge!(v.one_hot(drop: drop, prefix: "#{k}_"))
    else
      encoded[k] = v
    end
  end
rescue ArgumentError => e
  # better error message
  raise ArgumentError, "All elements must be numeric or strings" if e.message == "All elements must be strings"

  raise e
end
# File lib/rover/data_frame.rb, line 366
# Builds a Vega-Lite chart of column +y+ against column +x+ (vega is
# required lazily on first use).
#
# +x+ and +y+ default to the first and second key, which is only allowed
# when the frame has exactly two columns. When +type+ is omitted it becomes
# "scatter" if both columns are numeric, "column" if x has type :object and
# y is numeric; otherwise a RuntimeError is raised.
#
# Supported types: "line", "area", "pie", "column", "bar", "scatter".
# Raises ArgumentError when columns cannot be defaulted or the type is not
# one of the supported strings.
def plot(x = nil, y = nil, type: nil)
  require "vega"

  raise ArgumentError, "Must specify columns" if keys.size != 2 && !(x && y)

  x ||= keys[0]
  y ||= keys[1]

  unless type
    type =
      if self[x].numeric? && self[y].numeric?
        "scatter"
      elsif types[x] == :object && self[y].numeric?
        "column"
      else
        raise "Cannot determine type. Use the type option."
      end
  end

  data = self[[x, y]]

  case type
  when "line", "area"
    x_type =
      if data[x].numeric?
        "quantitative"
      elsif data[x].all? { |v| v.is_a?(Date) || v.is_a?(Time) }
        "temporal"
      else
        "nominal"
      end

    scale = x_type == "temporal" ? {type: "utc"} : {}

    Vega.lite
      .data(data)
      .mark(type: type, tooltip: true, interpolate: "cardinal", point: {size: 60})
      .encoding(
        x: {field: x, type: x_type, scale: scale},
        y: {field: y, type: "quantitative"}
      )
      .config(axis: {labelFontSize: 12})
  when "pie"
    Vega.lite
      .data(data)
      .mark(type: "arc", tooltip: true)
      .encoding(
        color: {field: x, type: "nominal", sort: "none", axis: {title: nil}, legend: {labelFontSize: 12}},
        theta: {field: y, type: "quantitative"}
      )
      .view(stroke: nil)
  when "column"
    Vega.lite
      .data(data)
      .mark(type: "bar", tooltip: true)
      .encoding(
        # TODO determine label angle
        x: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
        y: {field: y, type: "quantitative"}
      )
      .config(axis: {labelFontSize: 12})
  when "bar"
    # same as "column" but with the axes swapped
    Vega.lite
      .data(data)
      .mark(type: "bar", tooltip: true)
      .encoding(
        # TODO determine label angle
        y: {field: x, type: "nominal", sort: "none", axis: {labelAngle: 0}},
        x: {field: y, type: "quantitative"}
      )
      .config(axis: {labelFontSize: 12})
  when "scatter"
    Vega.lite
      .data(data)
      .mark(type: "circle", tooltip: true)
      .encoding(
        x: {field: x, type: "quantitative", scale: {zero: false}},
        y: {field: y, type: "quantitative", scale: {zero: false}},
        size: {value: 60}
      )
      .config(axis: {labelFontSize: 12})
  else
    raise ArgumentError, "Invalid type: #{type}"
  end
end
# File lib/rover/data_frame.rb, line 182
# Picks row indexes by forwarding all arguments to Array#sample on the list
# of row numbers 0...size, then indexes the frame with the result.
def sample(*args, **kwargs)
  # TODO make more efficient
  row_numbers = (0...size).to_a
  chosen = row_numbers.sample(*args, **kwargs)
  self[chosen]
end
# File lib/rover/data_frame.rb, line 129
# Returns [row count, column count].
def shape
  row_count = size
  column_count = @vectors.size
  [row_count, column_count]
end
# File lib/rover/data_frame.rb, line 109
# Number of rows, taken from the size of the first stored column.
# A frame without columns has size 0.
def size
  first_column = @vectors.values.first
  (first_column && first_column.size) || 0
end
# File lib/rover/data_frame.rb, line 296
# Non-destructive counterpart of sort_by!: sorts a dup of this frame with
# the given block and returns it.
def sort_by(&block)
  copy = dup
  copy.sort_by!(&block)
end
# File lib/rover/data_frame.rb, line 284
# Reorders the rows in place and returns self. The block receives each row
# as a hash of column name => value and returns that row's sort key.
def sort_by!
  order = size.times.sort_by do |i|
    row = {}
    @vectors.each { |k, v| row[k] = v[i] }
    yield row
  end

  # re-assign every column with its values picked in the sorted order
  @vectors.each do |k, v|
    self[k] = v.to_numo.at(order)
  end

  self
end
# File lib/rover/data_frame.rb, line 162
# Delegates to last, with +n+ defaulting to 5 instead of 1.
def tail(n = 5)
  row_count = n
  last(row_count)
end
# File lib/rover/data_frame.rb, line 188
# Returns an array with one hash per row, exactly as yielded by each_row.
def to_a
  each_row.to_a
end
# File lib/rover/data_frame.rb, line 227
# Serializes the frame to a CSV string: a header row with the keys followed
# by one line per row. csv is required lazily on first use.
def to_csv
  require "csv"

  columns = vectors.values.map(&:to_numo)

  CSV.generate do |csv|
    csv << keys
    size.times do |i|
      csv << columns.map { |column| column[i] }
    end
  end
end
# File lib/rover/data_frame.rb, line 196
# Returns a new hash of column name => column.to_a, in stored order.
def to_h
  @vectors.transform_values(&:to_a)
end
for IRuby
# File lib/rover/data_frame.rb, line 239
# HTML table rendering (iruby is required lazily on first use).
# Frames with at most 7 rows are rendered in full. Larger frames are
# rendered from rows 0..4 plus rows -4..-1 with maxrows: 7.
def to_html
  require "iruby"

  return IRuby::HTML.table(to_h) unless size > 7

  # pass more rows than maxrows so that maxrows is applied
  preview = self[0..4] + self[-4..-1]
  IRuby::HTML.table(preview.to_h, maxrows: 7)
end
# File lib/rover/data_frame.rb, line 204
# Converts every column with to_numo and combines them with
# Numo::NArray.column_stack.
def to_numo
  columns = vectors.values.map(&:to_numo)
  Numo::NArray.column_stack(columns)
end
# File lib/rover/data_frame.rb, line 98
# Returns a hash of column name => that column's type.
def types
  @vectors.transform_values(&:type)
end
dup to prevent direct modification of keys
# File lib/rover/data_frame.rb, line 94
# Returns a shallow copy of the internal column hash, so adding or removing
# keys on the result does not affect the frame. The column objects
# themselves are shared, not copied.
def vectors
  snapshot = @vectors.dup
  snapshot
end
Private Instance Methods
TODO in 0.3.0 always use did_you_mean
# File lib/rover/data_frame.rb, line 512
# Ensures the frame has a column +key+; returns nil when it does.
#
# When the column is missing: raises ArgumentError by default, or KeyError
# when +did_you_mean+ is truthy. On Ruby 2.6 and later the KeyError also
# carries receiver and key.
def check_column(key, did_you_mean = false)
  return if include?(key)

  raise ArgumentError, "Missing column: #{key}" unless did_you_mean

  if RUBY_VERSION.to_f >= 2.6
    raise KeyError.new("Missing column: #{key}", receiver: self, key: key)
  else
    raise KeyError.new("Missing column: #{key}")
  end
end
# File lib/rover/data_frame.rb, line 504
# Validates the join keys for one side of a join.
#
# df   - object answering include?(key) for its columns
# keys - array of column names to join on
#
# Raises ArgumentError when +keys+ is empty or when any key is not included
# in +df+ (the message lists the missing keys). Returns nil otherwise.
def check_join_keys(df, keys)
  raise ArgumentError, "No keys" if keys.empty?

  missing_keys = keys.reject { |k| df.include?(k) }
  # empty? rather than any?: any? ignores nil/false entries, which would let
  # a missing nil or false key slip through unnoticed
  raise ArgumentError, "Missing keys: #{missing_keys.join(", ")}" unless missing_keys.empty?
end
# File lib/rover/data_frame.rb, line 450
# Ensures a column key is a String or a Symbol; returns nil when it is.
# Raises ArgumentError (including the key's inspect output) otherwise.
def check_key(key)
  return if key.is_a?(String) || key.is_a?(Symbol)

  raise ArgumentError, "Key must be a string or symbol, got #{key.inspect}"
end
TODO: make more efficient.
TODO: add option to prefix/suffix keys?
Supports:
- on: :key
- on: [:key1, :key2]
- on: {key1a: :key1b, key2a: :key2b}
# File lib/rover/data_frame.rb, line 460
# Joins this frame (left) with +other+ (right) and returns a new DataFrame.
#
# how - "left" keeps left rows that have no match (right-only columns are
#       filled from the left row, i.e. nil); any other value drops them
# on  - nil (join on the keys common to both frames), a single key, an
#       array of keys, or a hash mapping left keys to right keys
#
# The result has the left keys followed by any right-only keys. For matched
# rows each value comes from the right row, falling back to the left row
# only when the right row has no value (nil) for that key.
#
# Raises ArgumentError (from check_join_keys) when there are no join keys
# or a join key is missing on either side.
def join(other, how:, on: nil)
  self_on, other_on =
    if on.is_a?(Hash)
      [on.keys, on.values]
    else
      on ||= keys & other.keys
      on = [on] unless on.is_a?(Array)
      [on, on]
    end

  check_join_keys(self, self_on)
  check_join_keys(other, other_on)

  # right rows grouped by their join-key values; unmatched lookups give []
  indexed = other.to_a.group_by { |r| r.values_at(*other_on) }
  indexed.default = []

  left = how == "left"

  all_keys = (keys + other.keys).uniq
  vectors = all_keys.map { |k| [k, []] }.to_h

  each_row do |r|
    matches = indexed[r.values_at(*self_on)]
    if matches.empty?
      if left
        all_keys.each do |k|
          vectors[k] << r[k]
        end
      end
    else
      matches.each do |r2|
        all_keys.each do |k|
          value = r2[k]
          # explicit nil check: `r2[k] || r[k]` also threw away a legitimate
          # `false` coming from the right-hand row
          vectors[k] << (value.nil? ? r[k] : value)
        end
      end
    end
  end

  DataFrame.new(vectors)
end
Can't use data = {} and keyword arguments, as this causes an unknown keyword error when data is passed as DataFrame.new({a: …, b: …}).
At the moment, there doesn't appear to be a way to distinguish between DataFrame.new({types: …}), which should set data, and DataFrame.new(types: …), which should set options (see bugs.ruby-lang.org/issues/16891).
There aren't currently options that should be used without data. If this is ever the case, we should still require data to prevent new options from breaking existing code.
# File lib/rover/data_frame.rb, line 561
# Splits the constructor's argument list into [data, options].
#
# args - array of positional arguments; when it has more than one element
#        and the last is a Hash, that hash is popped off (mutating +args+)
#        and used as the options
#
# data is args[0], or {} when that is nil or false. The only accepted
# option key is :types.
#
# Raises ArgumentError when more than one positional argument remains after
# the options are removed, or when the options contain any other key.
def process_args(args)
  data = args[0] || {}
  options = args.size > 1 && args.last.is_a?(Hash) ? args.pop : {}
  raise ArgumentError, "wrong number of arguments (given #{args.size}, expected 0..1)" if args.size > 1

  known_keywords = [:types]
  unknown_keywords = options.keys - known_keywords
  # empty? rather than any?: any? ignores nil/false entries, so a nil or
  # false option key used to pass validation
  raise ArgumentError, "unknown keywords: #{unknown_keywords.join(", ")}" unless unknown_keywords.empty?

  [data, options]
end
# File lib/rover/data_frame.rb, line 526
# Coerces +v+ into a Vector.
#
# - A Vector is returned as-is, or converted with to(type) when +type+ is
#   given and differs from the vector's type.
# - When +size+ is given and +v+ does not respond to to_a, +v+ is treated
#   as a scalar and repeated +size+ times: Integers fill a Numo::Int64,
#   other Numerics a Numo::DFloat, true/false a Numo::Bit, and anything
#   else a plain array.
# - The (possibly expanded) value is then wrapped with Vector.new.
def to_vector(v, size: nil, type: nil)
  if v.is_a?(Vector)
    return v unless type && v.type != type

    return v.to(type)
  end

  scalar = size && !v.respond_to?(:to_a)
  if scalar
    v =
      if v.is_a?(Integer)
        Numo::Int64.new(size).fill(v)
      elsif v.is_a?(Numeric)
        Numo::DFloat.new(size).fill(v)
      elsif v == true || v == false
        Numo::Bit.new(size).fill(v)
      else
        # TODO make more efficient
        [v] * size
      end
  end

  Vector.new(v, type: type)
end