class ComputeUnit::NvidiaGpu
Constants
- MAKE
- NVIDIA_PROC_PATH
- NVIDIA_SMI
- SUBTYPE
- VENDOR_ID
Public Class Methods
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 41
# Placeholder metadata with every reading zeroed out.
# The hash is built once and memoized, so every caller receives the same
# (mutable) Hash instance.
#
# @return [Hash] zero-valued readings keyed by the nvidia-smi CSV column names
def self.blank_data
  @blank_data ||= {
    'memory.used [MiB]'           => '0',
    'memory.free [MiB]'           => '0',
    'memory.total [MiB]'          => '0',
    'utilization.gpu [%]'         => '0',
    'temperature.gpu'             => '0',
    'power.draw [W]'              => '0',
    'power.limit [W]'             => '0',
    'power.max_limit [W]'         => '0',
    'pstate'                      => 7,
    'fan.speed [%]'               => '0',
    'clocks.current.memory [MHz]' => '0',
    'clocks.current.sm [MHz]'     => '0'
  }
end
@param device_path [String] - the device path of the device @param index [Integer] - the index of the device relative to other devices of the same class, e.g. GPU0
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 174
# Builds an instance for the device at +device_path+, collecting the PCI
# identifiers through the class-level lookup helpers.
#
# @param device_path [String] the device path of the device
# @param index [Integer] index of the device, forwarded as the :index option
# @param use_opencl [Boolean] forwarded unchanged as the :use_opencl option
# @return [Object] whatever +new+ returns for the path and assembled options
def self.create_from_path(device_path, index, use_opencl = false)
  new(
    device_path,
    {
      device_class_id: device_class(device_path),
      device_id: device(device_path),
      device_vendor_id: device_vendor(device_path),
      subsystem_vendor_id: subsystem_vendor(device_path),
      subsystem_device_id: subsystem_device(device_path),
      use_opencl: use_opencl,
      index: index
    }
  )
end
@return [Array] - returns a list of device paths of all devices specific to the vendor id
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 168
# Filters the GPU device list down to entries whose vendor id equals
# VENDOR_ID.
#
# @return [Array] the entries of ComputeUnit::Gpu.devices matching VENDOR_ID
def self.devices
  ComputeUnit::Gpu.devices.select do |candidate|
    device_vendor(candidate) == VENDOR_ID
  end
end
@return [Array] - returns an array of GPU instances of NVIDIA type only
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 188
# Creates one instance per path returned by +devices+.
# The index handed to each instance is the path's position within
# ComputeUnit::Gpu.found_devices, not its position among +devices+; it is
# nil when the path is not present in that list.
#
# @param use_opencl [Boolean] forwarded to create_from_path
# @return [Array] one created instance per device path
def self.find_all(use_opencl = false)
  devices.map do |device_path|
    position = ComputeUnit::Gpu.found_devices.index(device_path)
    create_from_path(device_path, position, use_opencl)
  end
end
ComputeUnit::Gpu::new
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 13
# Reads the driver information file for the device, overlays the supplied
# options on top of it, and adds alias keys before delegating to the
# superclass constructor.
#
# @param device_path [String] the device path; also stored under :pci_loc
# @param opts [Hash] extra attributes; they win over values read from the
#   information file when keys collide
def initialize(device_path, opts = {})
  info = self.class.read_information_file(device_path).merge(opts)
  firmware = info[:video_bios]
  info[:pci_loc] = device_path
  info[:busid] = info[:bus_location]
  # :bios is only set when a video BIOS string is available.
  info[:bios] = firmware.upcase if firmware
  info[:uuid] = info[:gpu_uuid]
  super(device_path, info)
end
@return [Hash] - hash of card info given by the kernel {:model=>"GeForce GTX 1070",
:irq=>"130", :gpu_uuid=>"GPU-0116fb5c-66f4-1cba-c216-97f4600a8152", :video_bios=>"86.04.50.40.4a", :bus_type=>"PCIe", :dma_size=>"47 bits", :dma_mask=>"0x7fffffffffff", :bus_location=>"0000:0d:00.0", :device_minor=>"7"}
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 211
# Parses the "information" file found under NVIDIA_PROC_PATH in the
# directory named after the basename of +device_path+.
# Each "Label: value" pair becomes a symbol key (label downcased, spaces
# replaced by underscores) mapped to the value string.
#
# @param device_path [String] device path whose basename names the directory
# @return [Hash{Symbol => String}] the parsed pairs
def self.read_information_file(device_path)
  info_path = File.join(NVIDIA_PROC_PATH, File.basename(device_path), 'information')
  pairs = File.read(info_path).scan(/\n?([\w\s]*):\s+(.*)/)
  pairs.each_with_object({}) do |(label, value), info|
    info[label.downcase.tr(' ', '_').to_sym] = value
  end
end
Public Instance Methods
@return [Integer] the current core clock speed
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 94
# Reads the 'clocks.current.sm [MHz]' entry from the metadata.
#
# @return [Integer] the entry converted with #to_i
def core_clock
  sm_clock = meta['clocks.current.sm [MHz]']
  sm_clock.to_i
end
@return [Integer] the fan speed
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 99
# Reads the 'fan.speed [%]' entry from the metadata.
#
# @return [Integer] the entry converted with #to_i
def fan
  fan_speed = meta['fan.speed [%]']
  fan_speed.to_i
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 160
# Path of this device's "information" file: NVIDIA_PROC_PATH, then the
# basename of the device path, then "information". Memoized after the
# first call.
#
# @return [String] the joined path
def information_file
  @information_file ||= File.join(NVIDIA_PROC_PATH, File.basename(device_path), 'information')
end
@return [Integer] the current memory clock speed
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 89
# Reads the 'clocks.current.memory [MHz]' entry from the metadata.
#
# @return [Integer] the entry converted with #to_i
def memory_clock
  mem_clock = meta['clocks.current.memory [MHz]']
  mem_clock.to_i
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 152
# Returns the 'memory.free [MiB]' metadata entry exactly as stored; unlike
# the clock and temperature readers, no numeric conversion is applied.
#
# @return [Object, nil] the raw metadata value
def memory_free
  free_mib = meta['memory.free [MiB]']
  free_mib
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 144
# Returns the 'memory.total [MiB]' metadata entry exactly as stored; no
# numeric conversion is applied.
#
# @return [Object, nil] the raw metadata value
def memory_total
  total_mib = meta['memory.total [MiB]']
  total_mib
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 148
# Returns the 'memory.used [MiB]' metadata entry exactly as stored; no
# numeric conversion is applied.
#
# @return [Object, nil] the raw metadata value
def memory_used
  used_mib = meta['memory.used [MiB]']
  used_mib
end
@return [Hash] the metadata from the nvidia-smi tool; returns the cached data, or fetches new data when the cache has expired
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 32
# Cached accessor around +metadata+.
# While the cache is not expired the stored value is reused (and fetched
# on first use); once +expired_metadata?+ reports true a fresh fetch
# replaces it.
#
# @return [Hash] the current metadata
def meta
  return @meta ||= metadata unless expired_metadata?

  logger.debug("Expired Nvidia Data for #{uuid} ")
  @meta = metadata
end
@note data returned from nvidia-smi @return [Hash]
"name": "GeForce GTX 1070 Ti", "vbios_version": "86.04.85.00.63", "uuid": "GPU-a583cb04-f9b5-68f3-50b9-2b4ba1c7d14e", "memory.used [MiB]": "2578 MiB", "memory.free [MiB]": "5534 MiB", "memory.total [MiB]": "8112 MiB", "utilization.gpu [%]": "100", "temperature.gpu": "53", "power.draw [W]": "129.21", "power.limit [W]": "130.00", "power.max_limit [W]": "217.00", "pstate": 2, "fan.speed [%]": "75"
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 73
# Queries nvidia-smi for this device (selected with -i index) and returns
# the first CSV data row as a Hash keyed by the stripped CSV headers.
#
# Falls back to the class-level blank_data when the command exits
# unsuccessfully, or when it succeeds without printing any data row, so
# callers always receive a Hash instead of nil.
#
# @return [Hash] the parsed readings, or blank_data when none are available
def metadata
  logger.debug("Calling #{NVIDIA_SMI}")
  fields = %w[
    gpu_name vbios_version uuid memory.used memory.free memory.total
    utilization.gpu temperature.gpu power.draw power.limit power.max_limit
    fan.speed pstate clocks.current.memory clocks.current.sm
  ].join(',')
  # stderr is merged into stdout so a failure message can be logged below.
  data = `#{NVIDIA_SMI} --query-gpu=#{fields} -i #{index} --format=csv,nounits 2>&1`
  unless $CHILD_STATUS.success?
    # NOTE(review): the original comment here read "error code 15" - confirm
    # which failure that refers to.
    logger.error(data.delete("\n"))
    return self.class.blank_data
  end

  # Backticks always return a String, so the output is parsed unconditionally.
  cards = CSV.parse(
    data,
    headers: true,
    header_converters: ->(f) { f.strip },
    converters: ->(f) { f ? f.strip : nil }
  ).map(&:to_h)
  # No data row (e.g. empty output) would otherwise yield nil.
  cards.first || self.class.blank_data
end
@return [Float] the power being used by the gpu
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 104
# Reads 'power.draw [W]' from the metadata, converts it to a Float and adds
# +power_offset+.
#
# @return [Float] the stripped reading as a Float plus power_offset
def power
  draw = meta['power.draw [W]'].strip
  draw.to_f + power_offset
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 116
# Reads 'power.limit [W]' from the metadata.
#
# @return [Float] the stripped entry converted with #to_f
def power_limit
  limit = meta['power.limit [W]'].strip
  limit.to_f
end
@param value [Numeric] power in watts to set the gpu limit to
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 125
# Asks nvidia-smi to apply a new power limit to this device.
#
# The value is placed on a shell command line, so it is only accepted when
# its string form is a plain non-negative decimal number (digits with an
# optional fractional part). Anything else - including text carrying shell
# metacharacters - is rejected before a command is run.
#
# A failing nvidia-smi call is logged as a warning, not raised.
#
# @param value [Numeric, String] power in watts to set the gpu limit to
# @return [Integer] the requested value converted with #to_i
# @raise [ArgumentError] when the value is not a plain decimal number or its
#   integer part is outside 1..power_max_limit
def power_limit=(value)
  watts = value.to_s.strip
  # Must be a bare number and at least 1 watt, at most the card's maximum.
  in_range = value.to_i.between?(1, power_max_limit.to_i)
  unless watts.match?(/\A\d+(?:\.\d+)?\z/) && in_range
    raise ArgumentError, "Power value #{value.to_i} cannot exceed #{power_max_limit}"
  end

  # stderr is merged so the warning below carries the tool's error text.
  output = `#{NVIDIA_SMI} -i #{index} -pl #{watts} 2>&1`
  if $CHILD_STATUS.success?
    logger.info("GPU#{index} power set to #{value} Watts")
  else
    logger.warn("GPU#{index} failed setting power to #{value}\n#{output}")
  end
  value.to_i
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 120
# Reads 'power.max_limit [W]' from the metadata.
#
# @return [Float] the stripped entry converted with #to_f
def power_max_limit
  ceiling = meta['power.max_limit [W]'].strip
  ceiling.to_f
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 112
# Reads the 'pstate' entry from the metadata.
#
# @return [Integer] the entry converted with #to_i
def pstate
  state = meta['pstate']
  state.to_i
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 26
# Clears the cached metadata by setting @meta to nil.
#
# @return [nil]
def reset_metadata
  @meta = nil
end
@param value [Numeric] - the fan limit that should be applied to the gpu as a percentage @return [Numeric] - original passed in value after being set
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 140
# Always raises; both arguments are ignored.
# NotImplementedError descends from ScriptError, so a bare +rescue+ does
# not catch it.
#
# @param _value [Numeric] ignored
# @param _type [String] ignored
# @raise [NotImplementedError] on every call
def set_fan_limit(_value, _type = 'current')
  raise NotImplementedError, 'Not implemented for Nvidia'
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 195
# Changes nothing on the device: when +experimental_on?+ is true a warning
# is logged, otherwise the call does nothing. Both arguments are ignored.
#
# @param _mem_clock [Object] ignored
# @param _mem_volt [Object] ignored
# @return [Object, nil] the logger's return value, or nil when not experimental
def set_mem_clock_and_vddc(_mem_clock, _mem_volt)
  logger.warn('Feature not enabled for nvidia') if experimental_on?
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 22
# @return [Object] the SUBTYPE constant
def subtype
  SUBTYPE
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 108
# Reads the 'temperature.gpu' entry from the metadata.
#
# @return [Integer] the entry converted with #to_i
def temp
  temperature = meta['temperature.gpu']
  temperature.to_i
end
# File lib/compute_unit/gpus/nvidia_gpu.rb, line 156
# Reads 'utilization.gpu [%]' from the metadata, removes the first percent
# sign if one is present, and converts the rest with #to_i.
#
# @return [Integer] the utilization reading
def utilization
  reading = meta['utilization.gpu [%]']
  reading.sub('%', '').to_i
end