class RdbmsSampler::TableSample

Attributes

pending_dependencies[R]

Public Class Methods

new(connection, schema_name, table_name, size = 1000) click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 11
def initialize(connection, schema_name, table_name, size = 1000)
  @schema = schema_name
  @table = table_name
  @connection = connection
  @size = size
  @pending_dependencies = Set.new
  @sample = Set.new
  @sampled = false
  @sampled_ids = Set.new
end

Public Instance Methods

add(row) click to toggle source

Add a row to the table sample. Returns number of new dependencies introduced.

# File lib/rdbms_sampler/table_sample.rb, line 57
def add(row)
  return 0 unless @sample.add? row
  @sampled_ids.add row['id'] if row['id']
  dependencies_for(row).collect { |dep|
    1 if @pending_dependencies.add?(dep)
  }.compact.sum
end
ensure_referential_integrity(sample) click to toggle source

@param [Sample] sample

# File lib/rdbms_sampler/table_sample.rb, line 66
def ensure_referential_integrity(sample)
  dependencies_in_progress = @pending_dependencies
  @pending_dependencies = Set.new
  dependencies_in_progress.map { |dependency|
    dependency_sample = sample.table_sample_for_dependency(dependency)
    dependency_sample.fulfil(dependency)
  }.compact.sum
end
fulfil(dependency) click to toggle source

Add the given dependency to the sample @param [Dependency] dependency

# File lib/rdbms_sampler/table_sample.rb, line 37
def fulfil(dependency)
  return 0 if fulfilled?(dependency)
  quoted_column = @connection.quote_column_name dependency.child_key
  quoted_value = @connection.quote dependency.value
  sql = "SELECT * FROM #{quoted_name} WHERE #{quoted_column} = #{quoted_value}"
  row = @connection.select_one(sql)
  raise "Could not fulfil #{dependency} using query [#{sql}]" if row.nil?
  add row
end
fulfilled?(dependency) click to toggle source

@param [Dependency] dependency

# File lib/rdbms_sampler/table_sample.rb, line 48
def fulfilled?(dependency)
  # FIXME: Only handles `id` column
  return false if dependency.child_key != 'id'

  @sampled_ids.include?(dependency.value)
end
identifier() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 31
def identifier
  "#{@schema}.#{@table}"
end
quoted_name() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 92
def quoted_name
  @connection.quote_table_name(@schema)+'.'+@connection.quote_table_name(@table)
end
sample!() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 22
def sample!
  fetch(@size) unless @sampled
  @sample
end
size() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 27
def size
  @sampled ? @sample.size : @size
end
to_sql() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 75
def to_sql
  ret = "\n-- Sample from #{quoted_name} (#{@sample.count} rows)\n"
  unless @sample.empty?
    quoted_cols = @sample.first.keys.collect { |col| @connection.quote_column_name col }
    # INSERT in batches to reduce the likelihood of hitting `max_allowed_packet`
    @sample.each_slice(250) do |rows|
      values = rows.collect { |row|
        row.values.map { |val|
          @connection.quote(val)
        } * ','
      } * "),\n  ("
      ret << "INSERT INTO #{quoted_name} \n  (#{quoted_cols * ','}) \nVALUES \n  (#{values});\n"
    end
  end
  ret
end

Protected Instance Methods

dependencies_for(row) click to toggle source

@param [Array] row

# File lib/rdbms_sampler/table_sample.rb, line 119
def dependencies_for(row)
  foreign_keys.collect { |fk| dependency_for(fk, row) }.compact
end
dependency_for(fk, row) click to toggle source

@param [ForeignKey] fk @param [Array] row

# File lib/rdbms_sampler/table_sample.rb, line 112
def dependency_for(fk, row)
  unless (value = row[fk.key]).nil?
    Dependency.new(fk.schema, fk.table, fk.key, fk.referenced_schema, fk.referenced_table, fk.referenced_key, value)
  end
end
discover_foreign_keys() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 127
    def discover_foreign_keys
      quoted_schema = @connection.quote @schema
      quoted_table = @connection.quote @table

      sql = <<SQL
      SELECT
        fk.constraint_name,
        fk.table_schema,
        fk.table_name,
        fk.column_name,
        fk.referenced_table_schema,
        fk.referenced_table_name,
        fk.referenced_column_name
      FROM information_schema.key_column_usage fk
      WHERE fk.referenced_column_name IS NOT NULL
      AND fk.table_schema = #{quoted_schema}
      AND fk.table_name = #{quoted_table}
SQL

      @connection.execute(sql).map do |row|
        ForeignKey.new(*row)
      end
    end
fetch(count = 1000) click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 98
def fetch(count = 1000)
  sql = "SELECT * FROM #{quoted_name}"
  unless (pks = self.primary_keys).count == 0
    order_by = @connection.quote_column_name(pks.first)
    sql += " ORDER BY #{order_by} DESC"
  end
  sql += " LIMIT #{count}"
  warn "  Sampling #{count} rows from #{quoted_name}..."
  @connection.select_all(sql).each { |row| add(row) }
  @sampled = true
end
foreign_keys() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 123
def foreign_keys
  @fks ||= discover_foreign_keys
end
primary_keys() click to toggle source
# File lib/rdbms_sampler/table_sample.rb, line 151
    def primary_keys
      quoted_schema = @connection.quote @schema
      quoted_table = @connection.quote @table

      sql = <<SQL
      SELECT column_name
      FROM information_schema.key_column_usage
      WHERE constraint_name = 'PRIMARY'
      AND table_schema = #{quoted_schema}
      AND table_name = #{quoted_table}
SQL

      @connection.execute(sql).map do |row|
        row.first
      end
    end