class ComputeUnit::NvidiaGpu

Constants

MAKE
NVIDIA_PROC_PATH
NVIDIA_SMI
SUBTYPE
VENDOR_ID

Public Class Methods

blank_data() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 41
# Placeholder readings in the same shape as a parsed nvidia-smi row.
# The hash is built on the first call and memoized, so every later call
# returns the very same object.
#
# @return [Hash{String => String, Integer}] every reading is the String '0'
#   except 'pstate', which is the Integer 7
def self.blank_data
  return @blank_data if @blank_data

  columns = [
    'memory.used [MiB]',
    'memory.free [MiB]',
    'memory.total [MiB]',
    'utilization.gpu [%]',
    'temperature.gpu',
    'power.draw [W]',
    'power.limit [W]',
    'power.max_limit [W]',
    'pstate',
    'fan.speed [%]',
    'clocks.current.memory [MHz]',
    'clocks.current.sm [MHz]'
  ]
  @blank_data = columns.each_with_object({}) do |column, readings|
    readings[column] = column == 'pstate' ? 7 : '0'
  end
end
create_from_path(device_path, index, use_opencl = false) click to toggle source

@param device_path [String] - the device path of the device
@param index [Integer] - the index of the device relative to other devices of the same class, e.g. GPU0

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 174
# Builds an instance for the device found at +device_path+.
# The PCI identifiers are looked up through the class-level helpers
# device_class, device, device_vendor, subsystem_vendor and subsystem_device.
#
# @param device_path [String] the device path of the device
# @param index [Integer] the index of the device relative to other devices
# @param use_opencl [Boolean] forwarded untouched as the :use_opencl option
# @return [Object] whatever +new+ returns for the path and collected options
def self.create_from_path(device_path, index, use_opencl = false)
  new(
    device_path,
    {
      device_class_id: device_class(device_path),
      device_id: device(device_path),
      device_vendor_id: device_vendor(device_path),
      subsystem_vendor_id: subsystem_vendor(device_path),
      subsystem_device_id: subsystem_device(device_path),
      use_opencl: use_opencl,
      index: index
    }
  )
end
devices() click to toggle source

@return [Array] - returns a list of device paths of all devices specific to the vendor id

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 168
# Narrows ComputeUnit::Gpu.devices down to the entries whose vendor id
# (as reported by device_vendor) equals VENDOR_ID.
#
# @return [Array] the matching device paths
def self.devices
  ComputeUnit::Gpu.devices.select do |path|
    device_vendor(path) == VENDOR_ID
  end
end
find_all(use_opencl = false) click to toggle source

@return [Array] - returns an array of gpu instances of NVIDIA type only

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 188
# Creates one instance per path returned by +devices+.
# Each instance's index is the position of its path inside
# ComputeUnit::Gpu.found_devices, not its position among +devices+.
#
# @param use_opencl [Boolean] passed through to create_from_path
# @return [Array] the results of create_from_path, in +devices+ order
def self.find_all(use_opencl = false)
  devices.map do |path|
    position = ComputeUnit::Gpu.found_devices.index(path)
    create_from_path(path, position, use_opencl)
  end
end
new(device_path, opts = {}) click to toggle source
Calls superclass method ComputeUnit::Gpu::new
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 13
# Combines the key/value pairs from read_information_file with +opts+
# (entries in +opts+ win on a key clash), adds the alias keys :pci_loc,
# :busid, :bios and :uuid, then hands everything to the superclass.
#
# @param device_path [String] the device path of the device
# @param opts [Hash] extra attributes merged over the information file data
def initialize(device_path, opts = {})
  info = self.class.read_information_file(device_path).merge(opts)
  info[:pci_loc] = device_path
  info[:busid] = info[:bus_location]
  # :bios is only added when a video BIOS value is present
  video_bios = info[:video_bios]
  info[:bios] = video_bios.upcase if video_bios
  info[:uuid] = info[:gpu_uuid]
  super(device_path, info)
end
read_information_file(device_path) click to toggle source

@return [Hash] - hash of card info given by the kernel, e.g. {:model=>"GeForce GTX 1070",

:irq=>"130",
:gpu_uuid=>"GPU-0116fb5c-66f4-1cba-c216-97f4600a8152",
:video_bios=>"86.04.50.40.4a",
:bus_type=>"PCIe",
:dma_size=>"47 bits",
:dma_mask=>"0x7fffffffffff",
:bus_location=>"0000:0d:00.0",
:device_minor=>"7"}
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 211
# Reads the 'information' file kept under NVIDIA_PROC_PATH for the device
# and turns its "Key: value" lines into a hash.
#
# The file is parsed one line at a time so that a blank line can never leak
# into the following key and a key with an empty value can never swallow the
# next line as its value. Lines that do not look like "Key: value" are skipped.
#
# @param device_path [String] the device path; only its basename is used
# @return [Hash{Symbol => String}] keys are downcased with spaces replaced by
#   underscores (e.g. "Bus Location" becomes :bus_location); values have
#   surrounding whitespace removed and are '' when the line has no value
# @raise [SystemCallError] if the information file cannot be read
def self.read_information_file(device_path)
  information_file = File.join(NVIDIA_PROC_PATH, File.basename(device_path), 'information')

  File.read(information_file).each_line.with_object({}) do |line, info|
    # The key stops at the first colon that is followed by whitespace or the
    # end of the line, so values such as "0000:0d:00.0" stay in one piece.
    entry = line.strip.match(/\A([\w ]+?)\s*:(?:\s+(.*))?\z/)
    next unless entry

    info[entry[1].downcase.tr(' ', '_').to_sym] = entry[2].to_s
  end
end

Public Instance Methods

core_clock() click to toggle source

@return [Integer] the current core clock speed

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 94
# @return [Integer] the 'clocks.current.sm [MHz]' reading from +meta+,
#   converted with to_i (so a missing reading yields 0)
def core_clock
  reading = meta['clocks.current.sm [MHz]']
  reading.to_i
end
fan() click to toggle source

@return [Integer] the fan speed

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 99
# @return [Integer] the 'fan.speed [%]' reading from +meta+, converted with
#   to_i (so a missing reading yields 0)
def fan
  reading = meta['fan.speed [%]']
  reading.to_i
end
information_file() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 160
# Path of this device's 'information' file under NVIDIA_PROC_PATH.
# Computed from the basename of +device_path+ on first use, then memoized.
#
# @return [String] NVIDIA_PROC_PATH/<basename of device_path>/information
def information_file
  return @information_file if @information_file

  @information_file = File.join(NVIDIA_PROC_PATH, File.basename(device_path), 'information')
end
memory_clock() click to toggle source

@return [Integer] the current memory clock speed

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 89
# @return [Integer] the 'clocks.current.memory [MHz]' reading from +meta+,
#   converted with to_i (so a missing reading yields 0)
def memory_clock
  reading = meta['clocks.current.memory [MHz]']
  reading.to_i
end
memory_free() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 152
# Returns the 'memory.free [MiB]' reading exactly as stored in +meta+;
# no numeric conversion is applied.
#
# @return [String, nil] the raw reading, nil when the key is absent
def memory_free
  readings = meta
  readings['memory.free [MiB]']
end
memory_total() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 144
# Returns the 'memory.total [MiB]' reading exactly as stored in +meta+;
# no numeric conversion is applied.
#
# @return [String, nil] the raw reading, nil when the key is absent
def memory_total
  readings = meta
  readings['memory.total [MiB]']
end
memory_used() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 148
# Returns the 'memory.used [MiB]' reading exactly as stored in +meta+;
# no numeric conversion is applied.
#
# @return [String, nil] the raw reading, nil when the key is absent
def memory_used
  readings = meta
  readings['memory.used [MiB]']
end
meta() click to toggle source

@return [Hash] the metadata from the nvidia-smi tool; returns the cached data, or fetches new data when the cache has expired

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 32
# Cached access to the readings produced by +metadata+.
# While expired_metadata? is false the cached hash is reused (and filled on
# first use); once it reports true, a debug line is logged and the cache is
# replaced with a fresh +metadata+ result.
#
# @return [Hash] the current readings
def meta
  return @meta ||= metadata unless expired_metadata?

  logger.debug("Expired Nvidia Data for #{uuid} ")
  @meta = metadata
end
metadata() click to toggle source

@note data returned from nvidia-smi @return [Hash]

 "name": "GeForce GTX 1070 Ti",
 "vbios_version": "86.04.85.00.63",
 "uuid": "GPU-a583cb04-f9b5-68f3-50b9-2b4ba1c7d14e",
"memory.used [MiB]": "2578 MiB",
"memory.free [MiB]": "5534 MiB",
"memory.total [MiB]": "8112 MiB",
"utilization.gpu [%]": "100",
"temperature.gpu": "53",
"power.draw [W]": "129.21",
"power.limit [W]": "130.00",
"power.max_limit [W]": "217.00",
"pstate": 2,
"fan.speed [%]": "75"
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 73
# Queries NVIDIA_SMI for this GPU (selected with -i +index+) and parses the
# CSV reply into a hash keyed by the CSV header names.
#
# The blank placeholder from +self.class.blank_data+ is returned both when
# the command exits unsuccessfully (its output is logged as an error) and
# when the reply contains no data row, so callers always receive a Hash.
#
# @return [Hash] the first data row of the reply with header names and
#   values stripped of surrounding whitespace, or the blank placeholder
def metadata
  logger.debug("Calling #{NVIDIA_SMI}")
  fields = %w[
    gpu_name vbios_version uuid memory.used memory.free memory.total
    utilization.gpu temperature.gpu power.draw power.limit power.max_limit
    fan.speed pstate clocks.current.memory clocks.current.sm
  ].join(',')
  data = `#{NVIDIA_SMI} --query-gpu=#{fields} -i #{index} --format=csv,nounits 2>&1`
  unless $CHILD_STATUS.success?
    # e.g. error code 15; stderr is folded into data by the 2>&1 above
    logger.error(data.delete("\n"))
    return self.class.blank_data
  end

  # nvidia-smi pads every column with a space after the comma
  strip = ->(field) { field&.strip }
  cards = CSV.parse(data, headers: true, header_converters: strip, converters: strip).map(&:to_h)
  cards.first || self.class.blank_data
end
power() click to toggle source

@return [Float] the power being used by the gpu

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 104
# @return [Float] the 'power.draw [W]' reading from +meta+ (stripped and
#   converted with to_f) plus +power_offset+
def power
  draw = meta['power.draw [W]'].strip.to_f
  draw + power_offset
end
power_limit() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 116
# @return [Float] the 'power.limit [W]' reading from +meta+, stripped and
#   converted with to_f
def power_limit
  limit = meta['power.limit [W]']
  limit.strip.to_f
end
power_limit=(value) click to toggle source

@param value [Numeric] power in watts to set the gpu limit to

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 125
# Asks NVIDIA_SMI to apply a new power limit (-pl) to this GPU (-i +index+).
# A failing command is logged as a warning and does not raise.
#
# @param value [Numeric] power in watts to set the gpu limit to
# @return [Integer] +value+ converted with to_i
# @raise [ArgumentError] when value.to_i is not between 1 and the integer
#   part of +power_max_limit+
def power_limit=(value)
  watts = value.to_i
  max_watts = power_max_limit.to_i
  unless watts.between?(1, max_watts)
    raise ArgumentError, "Power value #{watts} must be between 1 and #{max_watts} Watts"
  end

  # Numbers go to the shell untouched (so fractional watts still work); for
  # anything else only the validated integer is interpolated, which keeps
  # arbitrary text out of the command line.
  requested = value.is_a?(Numeric) ? value : watts
  output = `#{NVIDIA_SMI} -i #{index} -pl #{requested}`
  if $CHILD_STATUS.success?
    logger.info("GPU#{index} power set to #{requested} Watts")
  else
    logger.warn("GPU#{index} failed setting power to #{requested}\n#{output}")
  end
  watts
end
power_max_limit() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 120
# @return [Float] the 'power.max_limit [W]' reading from +meta+, stripped
#   and converted with to_f
def power_max_limit
  ceiling = meta['power.max_limit [W]']
  ceiling.strip.to_f
end
pstate() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 112
# The performance state number taken from the 'pstate' reading in +meta+.
#
# nvidia-smi reports the state with a leading letter (e.g. "P2"), which a
# plain to_i would turn into 0, so the first run of digits is extracted
# instead. Integer readings and bare digit strings pass through unchanged.
#
# @return [Integer] the state number, or 0 when the reading has no digits
def pstate
  meta['pstate'].to_s[/\d+/].to_i
end
reset_metadata() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 26
# Drops the cached readings by setting @meta to nil.
#
# @return [nil]
def reset_metadata
  @meta = nil
end
set_fan_limit(_value, _type = 'current') click to toggle source

@param value [Numeric] - the fan limit that should be applied to the gpu as a percentage @return [Numeric] - original passed in value after being set

# File lib/compute_unit/gpus/nvidia_gpu.rb, line 140
# Fan limits are not supported here: both arguments are ignored and the
# call always raises.
#
# @param _value [Numeric] ignored
# @param _type [String] ignored
# @raise [NotImplementedError] always
def set_fan_limit(_value, _type = 'current')
  raise NotImplementedError, 'Not implemented for Nvidia'
end
set_mem_clock_and_vddc(_mem_clock, _mem_volt) click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 195
# Placeholder: no clock or voltage is changed. When experimental_on? is
# true a warning is logged; otherwise nothing happens.
#
# @param _mem_clock ignored
# @param _mem_volt ignored
# @return [Object, nil] the result of logger.warn, or nil when
#   experimental_on? is false
def set_mem_clock_and_vddc(_mem_clock, _mem_volt)
  logger.warn('Feature not enabled for nvidia') if experimental_on?
end
subtype() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 22
# @return [Object] the value of the SUBTYPE constant
def subtype
  SUBTYPE
end
temp() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 108
# @return [Integer] the 'temperature.gpu' reading from +meta+, converted
#   with to_i (so a missing reading yields 0)
def temp
  reading = meta['temperature.gpu']
  reading.to_i
end
utilization() click to toggle source
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 156
# @return [Integer] the 'utilization.gpu [%]' reading from +meta+ with the
#   first '%' sign removed, converted with to_i
def utilization
  reading = meta['utilization.gpu [%]']
  reading.sub('%', '').to_i
end