From 0f232c8ec9d8732d29d7cd04c503425cdff9a548 Mon Sep 17 00:00:00 2001 From: "maofeng.huang" Date: Wed, 5 Mar 2025 15:05:56 +0800 Subject: [PATCH] Add the RunDiag API and fix the bug related to affinity retrieval --- README.md | 31 + pkg/ixdcgm/const.go | 1183 +++++++++++++++++++++++--- pkg/ixdcgm/device_common.go | 2 - pkg/ixdcgm/device_info.go | 48 +- pkg/ixdcgm/diag.go | 238 ++++++ pkg/ixdcgm/fields.go | 55 +- pkg/ixdcgm/include/dcgm_agent.h | 26 +- pkg/ixdcgm/include/dcgm_structs.h | 86 +- pkg/ixdcgm/include/ixdcgmApiExport.h | 17 - pkg/ixdcgm/include/ixdcgmFields.h | 502 ----------- pkg/ixdcgm/include/ixdcgmStructs.h | 1125 ++---------------------- pkg/ixdcgm/process_info.go | 1 - pkg/ixdcgm/types.go | 40 - pkg/ixdcgm/utils.go | 9 +- samples/diag/main.go | 46 + 15 files changed, 1604 insertions(+), 1805 deletions(-) create mode 100644 pkg/ixdcgm/diag.go delete mode 100644 pkg/ixdcgm/include/ixdcgmFields.h delete mode 100644 pkg/ixdcgm/types.go create mode 100644 samples/diag/main.go diff --git a/README.md b/README.md index bb8924b..00f5ed1 100644 --- a/README.md +++ b/README.md @@ -208,6 +208,37 @@ Data : {61} ``` `Note`: Press Ctrl+C to stop the iteration output. + +To execute a diagnostic on the system and devices, run the following command: +``` +$ go run samples/diag/main.go + +# sample output + +Software: + + presence of drivers on the denylist (e.g. nouveau) pass + presence (and version) of NVML lib pass + presence (and version) of CUDA lib pass + presence (and version) of CUDA RT lib pass + character device permissions pass + +GPU: 0 + + Memory notrun + Diagnostic notrun + PCIe notrun + SM Stress notrun + Targeted Stress notrun + Targeted Power notrun + Memory Bandwidth notrun + Memtest notrun + Pulse Test notrun + EUD Test notrun + CPU EUD Test notrun + +``` + ## License Copyright (c) 2024 Iluvatar CoreX. All rights reserved. 
This project has an Apache-2.0 license, as diff --git a/pkg/ixdcgm/const.go b/pkg/ixdcgm/const.go index 5eafe6b..7df455f 100644 --- a/pkg/ixdcgm/const.go +++ b/pkg/ixdcgm/const.go @@ -23,127 +23,963 @@ package ixdcgm */ import "C" +type Short C.ushort + +type FieldValue_v1 struct { + Version uint + FieldId uint + FieldType uint + Status int + Ts int64 + Value [4096]byte +} + const ( - DCGM_FI_UNKNOWN Short = C.DCGM_FI_UNKNOWN - DCGM_FI_DRIVER_VERSION Short = C.DCGM_FI_DRIVER_VERSION - DCGM_FI_NVML_VERSION Short = C.DCGM_FI_NVML_VERSION - DCGM_FI_PROCESS_NAME Short = C.DCGM_FI_PROCESS_NAME - DCGM_FI_DEV_COUNT Short = C.DCGM_FI_DEV_COUNT - DCGM_FI_CUDA_DRIVER_VERSION Short = C.DCGM_FI_CUDA_DRIVER_VERSION - DCGM_FI_DEV_NAME Short = C.DCGM_FI_DEV_NAME - DCGM_FI_DEV_BRAND Short = C.DCGM_FI_DEV_BRAND - DCGM_FI_DEV_NVML_INDEX Short = C.DCGM_FI_DEV_NVML_INDEX - DCGM_FI_DEV_SERIAL Short = C.DCGM_FI_DEV_SERIAL - DCGM_FI_DEV_UUID Short = C.DCGM_FI_DEV_UUID - DCGM_FI_DEV_MINOR_NUMBER Short = C.DCGM_FI_DEV_MINOR_NUMBER - DCGM_FI_DEV_OEM_INFOROM_VER Short = C.DCGM_FI_DEV_OEM_INFOROM_VER - DCGM_FI_DEV_PCI_BUSID Short = C.DCGM_FI_DEV_PCI_BUSID - DCGM_FI_DEV_PCI_COMBINED_ID Short = C.DCGM_FI_DEV_PCI_COMBINED_ID - DCGM_FI_DEV_PCI_SUBSYS_ID Short = C.DCGM_FI_DEV_PCI_SUBSYS_ID - DCGM_FI_GPU_TOPOLOGY_PCI Short = C.DCGM_FI_GPU_TOPOLOGY_PCI - DCGM_FI_GPU_TOPOLOGY_NVLINK Short = C.DCGM_FI_GPU_TOPOLOGY_NVLINK - DCGM_FI_GPU_TOPOLOGY_AFFINITY Short = C.DCGM_FI_GPU_TOPOLOGY_AFFINITY - DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY Short = C.DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY - DCGM_FI_DEV_COMPUTE_MODE Short = C.DCGM_FI_DEV_COMPUTE_MODE - DCGM_FI_DEV_PERSISTENCE_MODE Short = C.DCGM_FI_DEV_PERSISTENCE_MODE - DCGM_FI_DEV_MIG_MODE Short = C.DCGM_FI_DEV_MIG_MODE - DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR Short = C.DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR - DCGM_FI_DEV_MIG_MAX_SLICES Short = C.DCGM_FI_DEV_MIG_MAX_SLICES - DCGM_FI_DEV_CPU_AFFINITY_0 Short = C.DCGM_FI_DEV_CPU_AFFINITY_0 - DCGM_FI_DEV_CPU_AFFINITY_1 
Short = C.DCGM_FI_DEV_CPU_AFFINITY_1 - DCGM_FI_DEV_CPU_AFFINITY_2 Short = C.DCGM_FI_DEV_CPU_AFFINITY_2 - DCGM_FI_DEV_CPU_AFFINITY_3 Short = C.DCGM_FI_DEV_CPU_AFFINITY_3 - DCGM_FI_DEV_CC_MODE Short = C.DCGM_FI_DEV_CC_MODE - DCGM_FI_DEV_MIG_ATTRIBUTES Short = C.DCGM_FI_DEV_MIG_ATTRIBUTES - DCGM_FI_DEV_MIG_GI_INFO Short = C.DCGM_FI_DEV_MIG_GI_INFO - DCGM_FI_DEV_MIG_CI_INFO Short = C.DCGM_FI_DEV_MIG_CI_INFO - DCGM_FI_DEV_ECC_INFOROM_VER Short = C.DCGM_FI_DEV_ECC_INFOROM_VER - DCGM_FI_DEV_POWER_INFOROM_VER Short = C.DCGM_FI_DEV_POWER_INFOROM_VER - DCGM_FI_DEV_INFOROM_IMAGE_VER Short = C.DCGM_FI_DEV_INFOROM_IMAGE_VER - DCGM_FI_DEV_INFOROM_CONFIG_CHECK Short = C.DCGM_FI_DEV_INFOROM_CONFIG_CHECK - DCGM_FI_DEV_INFOROM_CONFIG_VALID Short = C.DCGM_FI_DEV_INFOROM_CONFIG_VALID - DCGM_FI_DEV_VBIOS_VERSION Short = C.DCGM_FI_DEV_VBIOS_VERSION - DCGM_FI_DEV_MEM_AFFINITY_0 Short = C.DCGM_FI_DEV_MEM_AFFINITY_0 - DCGM_FI_DEV_MEM_AFFINITY_1 Short = C.DCGM_FI_DEV_MEM_AFFINITY_1 - DCGM_FI_DEV_MEM_AFFINITY_2 Short = C.DCGM_FI_DEV_MEM_AFFINITY_2 - DCGM_FI_DEV_MEM_AFFINITY_3 Short = C.DCGM_FI_DEV_MEM_AFFINITY_3 - DCGM_FI_DEV_BAR1_TOTAL Short = C.DCGM_FI_DEV_BAR1_TOTAL - DCGM_FI_SYNC_BOOST Short = C.DCGM_FI_SYNC_BOOST - DCGM_FI_DEV_BAR1_USED Short = C.DCGM_FI_DEV_BAR1_USED - DCGM_FI_DEV_BAR1_FREE Short = C.DCGM_FI_DEV_BAR1_FREE - DCGM_FI_DEV_SM_CLOCK Short = C.DCGM_FI_DEV_SM_CLOCK - DCGM_FI_DEV_MEM_CLOCK Short = C.DCGM_FI_DEV_MEM_CLOCK - DCGM_FI_DEV_VIDEO_CLOCK Short = C.DCGM_FI_DEV_VIDEO_CLOCK - DCGM_FI_DEV_APP_SM_CLOCK Short = C.DCGM_FI_DEV_APP_SM_CLOCK - DCGM_FI_DEV_APP_MEM_CLOCK Short = C.DCGM_FI_DEV_APP_MEM_CLOCK - DCGM_FI_DEV_CLOCK_THROTTLE_REASONS Short = C.DCGM_FI_DEV_CLOCK_THROTTLE_REASONS - DCGM_FI_DEV_MAX_SM_CLOCK Short = C.DCGM_FI_DEV_MAX_SM_CLOCK - DCGM_FI_DEV_MAX_MEM_CLOCK Short = C.DCGM_FI_DEV_MAX_MEM_CLOCK - DCGM_FI_DEV_MAX_VIDEO_CLOCK Short = C.DCGM_FI_DEV_MAX_VIDEO_CLOCK - DCGM_FI_DEV_AUTOBOOST Short = C.DCGM_FI_DEV_AUTOBOOST - DCGM_FI_DEV_SUPPORTED_CLOCKS Short = 
C.DCGM_FI_DEV_SUPPORTED_CLOCKS - DCGM_FI_DEV_MEMORY_TEMP Short = C.DCGM_FI_DEV_MEMORY_TEMP - DCGM_FI_DEV_GPU_TEMP Short = C.DCGM_FI_DEV_GPU_TEMP - DCGM_FI_DEV_MEM_MAX_OP_TEMP Short = C.DCGM_FI_DEV_MEM_MAX_OP_TEMP - DCGM_FI_DEV_GPU_MAX_OP_TEMP Short = C.DCGM_FI_DEV_GPU_MAX_OP_TEMP - DCGM_FI_DEV_POWER_USAGE Short = C.DCGM_FI_DEV_POWER_USAGE - DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Short = C.DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION - DCGM_FI_DEV_POWER_USAGE_INSTANT Short = C.DCGM_FI_DEV_POWER_USAGE_INSTANT - DCGM_FI_DEV_SLOWDOWN_TEMP Short = C.DCGM_FI_DEV_SLOWDOWN_TEMP - DCGM_FI_DEV_SHUTDOWN_TEMP Short = C.DCGM_FI_DEV_SHUTDOWN_TEMP - DCGM_FI_DEV_POWER_MGMT_LIMIT Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT - DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN - DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX - DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF Short = C.DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF - DCGM_FI_DEV_ENFORCED_POWER_LIMIT Short = C.DCGM_FI_DEV_ENFORCED_POWER_LIMIT - DCGM_FI_DEV_PSTATE Short = C.DCGM_FI_DEV_PSTATE - DCGM_FI_DEV_FAN_SPEED Short = C.DCGM_FI_DEV_FAN_SPEED - DCGM_FI_DEV_PCIE_TX_THROUGHPUT Short = C.DCGM_FI_DEV_PCIE_TX_THROUGHPUT - DCGM_FI_DEV_PCIE_RX_THROUGHPUT Short = C.DCGM_FI_DEV_PCIE_RX_THROUGHPUT - DCGM_FI_DEV_PCIE_REPLAY_COUNTER Short = C.DCGM_FI_DEV_PCIE_REPLAY_COUNTER - DCGM_FI_DEV_GPU_UTIL Short = C.DCGM_FI_DEV_GPU_UTIL - DCGM_FI_DEV_MEM_COPY_UTIL Short = C.DCGM_FI_DEV_MEM_COPY_UTIL - DCGM_FI_DEV_ACCOUNTING_DATA Short = C.DCGM_FI_DEV_ACCOUNTING_DATA - DCGM_FI_DEV_ENC_UTIL Short = C.DCGM_FI_DEV_ENC_UTIL - DCGM_FI_DEV_DEC_UTIL Short = C.DCGM_FI_DEV_DEC_UTIL - DCGM_FI_DEV_XID_ERRORS Short = C.DCGM_FI_DEV_XID_ERRORS - DCGM_FI_DEV_PCIE_MAX_LINK_GEN Short = C.DCGM_FI_DEV_PCIE_MAX_LINK_GEN - DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH Short = C.DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH - DCGM_FI_DEV_PCIE_LINK_GEN Short = C.DCGM_FI_DEV_PCIE_LINK_GEN - DCGM_FI_DEV_PCIE_LINK_WIDTH Short = C.DCGM_FI_DEV_PCIE_LINK_WIDTH - 
DCGM_FI_DEV_POWER_VIOLATION Short = C.DCGM_FI_DEV_POWER_VIOLATION - DCGM_FI_DEV_THERMAL_VIOLATION Short = C.DCGM_FI_DEV_THERMAL_VIOLATION - DCGM_FI_DEV_SYNC_BOOST_VIOLATION Short = C.DCGM_FI_DEV_SYNC_BOOST_VIOLATION - DCGM_FI_DEV_BOARD_LIMIT_VIOLATION Short = C.DCGM_FI_DEV_BOARD_LIMIT_VIOLATION - DCGM_FI_DEV_LOW_UTIL_VIOLATION Short = C.DCGM_FI_DEV_LOW_UTIL_VIOLATION - DCGM_FI_DEV_RELIABILITY_VIOLATION Short = C.DCGM_FI_DEV_RELIABILITY_VIOLATION - DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION Short = C.DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION - DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION Short = C.DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION - DCGM_FI_DEV_FB_TOTAL Short = C.DCGM_FI_DEV_FB_TOTAL - DCGM_FI_DEV_FB_FREE Short = C.DCGM_FI_DEV_FB_FREE - DCGM_FI_DEV_FB_USED Short = C.DCGM_FI_DEV_FB_USED - DCGM_FI_DEV_FB_RESERVED Short = C.DCGM_FI_DEV_FB_RESERVED - DCGM_FI_DEV_FB_USED_PERCENT Short = C.DCGM_FI_DEV_FB_USED_PERCENT - DCGM_FI_DEV_ECC_CURRENT Short = C.DCGM_FI_DEV_ECC_CURRENT - DCGM_FI_DEV_ECC_PENDING Short = C.DCGM_FI_DEV_ECC_PENDING - DCGM_FI_DEV_ECC_SBE_VOL_DEV Short = C.DCGM_FI_DEV_ECC_SBE_VOL_DEV - DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = C.DCGM_FI_DEV_ECC_DBE_VOL_DEV - DCGM_FI_INTERNAL_FIELDS_0_START Short = C.DCGM_FI_INTERNAL_FIELDS_0_START - DCGM_FI_INTERNAL_FIELDS_0_END Short = C.DCGM_FI_INTERNAL_FIELDS_0_END - DCGM_FI_PROF_SM_ACTIVE Short = C.DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_SM_OCCUPANCY Short = C.DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_DRAM_ACTIVE Short = C.DCGM_FI_PROF_DRAM_ACTIVE - - DCGM_FI_MAX_FIELDS Short = C.DCGM_FI_MAX_FIELDS + DCGM_FT_BINARY = uint('b') + DCGM_FT_DOUBLE = uint('d') + DCGM_FT_INT64 = uint('i') + DCGM_FT_STRING = uint('s') + DCGM_FT_TIMESTAMP = uint('t') + DCGM_FT_INT32_BLANK = int64(2147483632) + DCGM_FT_INT32_NOT_FOUND = int64(DCGM_FT_INT32_BLANK + 1) + DCGM_FT_INT32_NOT_SUPPORTED = int64(DCGM_FT_INT32_BLANK + 2) + DCGM_FT_INT32_NOT_PERMISSIONED = int64(DCGM_FT_INT32_BLANK + 3) + DCGM_FT_INT64_BLANK = int64(9223372036854775792) + 
DCGM_FT_INT64_NOT_FOUND = int64(DCGM_FT_INT64_BLANK + 1) + DCGM_FT_INT64_NOT_SUPPORTED = int64(DCGM_FT_INT64_BLANK + 2) + DCGM_FT_INT64_NOT_PERMISSIONED = int64(DCGM_FT_INT64_BLANK + 3) + DCGM_FT_FP64_BLANK = 140737488355328.0 + DCGM_FT_FP64_NOT_FOUND = float64(DCGM_FT_FP64_BLANK + 1.0) + DCGM_FT_FP64_NOT_SUPPORTED = float64(DCGM_FT_FP64_BLANK + 2.0) + DCGM_FT_FP64_NOT_PERMISSIONED = float64(DCGM_FT_FP64_BLANK + 3.0) + DCGM_FT_STR_BLANK = "<<<NULL>>>" + DCGM_FT_STR_NOT_FOUND = "<<<NOT_FOUND>>>" + DCGM_FT_STR_NOT_SUPPORTED = "<<<NOT_SUPPORTED>>>" + DCGM_FT_STR_NOT_PERMISSIONED = "<<<NOT_PERM>>>" + + DCGM_FI_UNKNOWN = 0 + DCGM_FI_DRIVER_VERSION = 1 + DCGM_FI_NVML_VERSION = 2 + DCGM_FI_PROCESS_NAME = 3 + DCGM_FI_DEV_COUNT = 4 + DCGM_FI_CUDA_DRIVER_VERSION = 5 + DCGM_FI_DEV_NAME = 50 + DCGM_FI_DEV_BRAND = 51 + DCGM_FI_DEV_NVML_INDEX = 52 + DCGM_FI_DEV_SERIAL = 53 + DCGM_FI_DEV_UUID = 54 + DCGM_FI_DEV_MINOR_NUMBER = 55 + DCGM_FI_DEV_OEM_INFOROM_VER = 56 + DCGM_FI_DEV_PCI_BUSID = 57 + DCGM_FI_DEV_PCI_COMBINED_ID = 58 + DCGM_FI_DEV_PCI_SUBSYS_ID = 59 + DCGM_FI_GPU_TOPOLOGY_PCI = 60 + DCGM_FI_GPU_TOPOLOGY_NVLINK = 61 + DCGM_FI_GPU_TOPOLOGY_AFFINITY = 62 + DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY = 63 + DCGM_FI_DEV_COMPUTE_MODE = 65 + DCGM_FI_DEV_PERSISTENCE_MODE = 66 + DCGM_FI_DEV_MIG_MODE = 67 + DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR = 68 + DCGM_FI_DEV_MIG_MAX_SLICES = 69 + DCGM_FI_DEV_CPU_AFFINITY_0 = 70 + DCGM_FI_DEV_CPU_AFFINITY_1 = 71 + DCGM_FI_DEV_CPU_AFFINITY_2 = 72 + DCGM_FI_DEV_CPU_AFFINITY_3 = 73 + DCGM_FI_DEV_CC_MODE = 74 + DCGM_FI_DEV_MIG_ATTRIBUTES = 75 + DCGM_FI_DEV_MIG_GI_INFO = 76 + DCGM_FI_DEV_MIG_CI_INFO = 77 + DCGM_FI_DEV_ECC_INFOROM_VER = 80 + DCGM_FI_DEV_POWER_INFOROM_VER = 81 + DCGM_FI_DEV_INFOROM_IMAGE_VER = 82 + DCGM_FI_DEV_INFOROM_CONFIG_CHECK = 83 + DCGM_FI_DEV_INFOROM_CONFIG_VALID = 84 + DCGM_FI_DEV_VBIOS_VERSION = 85 + DCGM_FI_DEV_MEM_AFFINITY_0 = 86 + DCGM_FI_DEV_MEM_AFFINITY_1 = 87 + DCGM_FI_DEV_MEM_AFFINITY_2 = 88 + DCGM_FI_DEV_MEM_AFFINITY_3 = 89 + DCGM_FI_DEV_BAR1_TOTAL = 90 +
DCGM_FI_SYNC_BOOST = 91 + DCGM_FI_DEV_BAR1_USED = 92 + DCGM_FI_DEV_BAR1_FREE = 93 + DCGM_FI_DEV_SM_CLOCK = 100 + DCGM_FI_DEV_MEM_CLOCK = 101 + DCGM_FI_DEV_VIDEO_CLOCK = 102 + DCGM_FI_DEV_APP_SM_CLOCK = 110 + DCGM_FI_DEV_APP_MEM_CLOCK = 111 + DCGM_FI_DEV_CLOCK_THROTTLE_REASONS = 112 + DCGM_FI_DEV_MAX_SM_CLOCK = 113 + DCGM_FI_DEV_MAX_MEM_CLOCK = 114 + DCGM_FI_DEV_MAX_VIDEO_CLOCK = 115 + DCGM_FI_DEV_AUTOBOOST = 120 + DCGM_FI_DEV_SUPPORTED_CLOCKS = 130 + DCGM_FI_DEV_MEMORY_TEMP = 140 + DCGM_FI_DEV_GPU_TEMP = 150 + DCGM_FI_DEV_MEM_MAX_OP_TEMP = 151 + DCGM_FI_DEV_GPU_MAX_OP_TEMP = 152 + DCGM_FI_DEV_POWER_USAGE = 155 + DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION = 156 + DCGM_FI_DEV_POWER_USAGE_INSTANT = 157 + DCGM_FI_DEV_SLOWDOWN_TEMP = 158 + DCGM_FI_DEV_SHUTDOWN_TEMP = 159 + DCGM_FI_DEV_POWER_MGMT_LIMIT = 160 + DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN = 161 + DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX = 162 + DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF = 163 + DCGM_FI_DEV_ENFORCED_POWER_LIMIT = 164 + DCGM_FI_DEV_PSTATE = 190 + DCGM_FI_DEV_FAN_SPEED = 191 + DCGM_FI_DEV_PCIE_TX_THROUGHPUT = 200 + DCGM_FI_DEV_PCIE_RX_THROUGHPUT = 201 + DCGM_FI_DEV_PCIE_REPLAY_COUNTER = 202 + DCGM_FI_DEV_GPU_UTIL = 203 + DCGM_FI_DEV_MEM_COPY_UTIL = 204 + DCGM_FI_DEV_ACCOUNTING_DATA = 205 + DCGM_FI_DEV_ENC_UTIL = 206 + DCGM_FI_DEV_DEC_UTIL = 207 + DCGM_FI_DEV_XID_ERRORS = 230 + DCGM_FI_DEV_PCIE_MAX_LINK_GEN = 235 + DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH = 236 + DCGM_FI_DEV_PCIE_LINK_GEN = 237 + DCGM_FI_DEV_PCIE_LINK_WIDTH = 238 + DCGM_FI_DEV_POWER_VIOLATION = 240 + DCGM_FI_DEV_THERMAL_VIOLATION = 241 + DCGM_FI_DEV_SYNC_BOOST_VIOLATION = 242 + DCGM_FI_DEV_BOARD_LIMIT_VIOLATION = 243 + DCGM_FI_DEV_LOW_UTIL_VIOLATION = 244 + DCGM_FI_DEV_RELIABILITY_VIOLATION = 245 + DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION = 246 + DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION = 247 + DCGM_FI_DEV_FB_TOTAL = 250 + DCGM_FI_DEV_FB_FREE = 251 + DCGM_FI_DEV_FB_USED = 252 + DCGM_FI_DEV_FB_RESERVED = 253 + DCGM_FI_DEV_FB_USED_PERCENT = 254 + DCGM_FI_DEV_C2C_LINK_COUNT = 
285 + DCGM_FI_DEV_C2C_LINK_STATUS = 286 + DCGM_FI_DEV_C2C_MAX_BANDWIDTH = 287 + DCGM_FI_DEV_ECC_CURRENT = 300 + DCGM_FI_DEV_ECC_PENDING = 301 + DCGM_FI_DEV_ECC_SBE_VOL_TOTAL = 310 + DCGM_FI_DEV_ECC_DBE_VOL_TOTAL = 311 + DCGM_FI_DEV_ECC_SBE_AGG_TOTAL = 312 + DCGM_FI_DEV_ECC_DBE_AGG_TOTAL = 313 + DCGM_FI_DEV_ECC_SBE_VOL_L1 = 314 + DCGM_FI_DEV_ECC_DBE_VOL_L1 = 315 + DCGM_FI_DEV_ECC_SBE_VOL_L2 = 316 + DCGM_FI_DEV_ECC_DBE_VOL_L2 = 317 + DCGM_FI_DEV_ECC_SBE_VOL_DEV = 318 + DCGM_FI_DEV_ECC_DBE_VOL_DEV = 319 + DCGM_FI_DEV_ECC_SBE_VOL_REG = 320 + DCGM_FI_DEV_ECC_DBE_VOL_REG = 321 + DCGM_FI_DEV_ECC_SBE_VOL_TEX = 322 + DCGM_FI_DEV_ECC_DBE_VOL_TEX = 323 + DCGM_FI_DEV_ECC_SBE_AGG_L1 = 324 + DCGM_FI_DEV_ECC_DBE_AGG_L1 = 325 + DCGM_FI_DEV_ECC_SBE_AGG_L2 = 326 + DCGM_FI_DEV_ECC_DBE_AGG_L2 = 327 + DCGM_FI_DEV_ECC_SBE_AGG_DEV = 328 + DCGM_FI_DEV_ECC_DBE_AGG_DEV = 329 + DCGM_FI_DEV_ECC_SBE_AGG_REG = 330 + DCGM_FI_DEV_ECC_DBE_AGG_REG = 331 + DCGM_FI_DEV_ECC_SBE_AGG_TEX = 332 + DCGM_FI_DEV_ECC_DBE_AGG_TEX = 333 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX = 385 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH = 386 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL = 387 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW = 388 + DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE = 389 + DCGM_FI_DEV_RETIRED_SBE = 390 + DCGM_FI_DEV_RETIRED_DBE = 391 + DCGM_FI_DEV_RETIRED_PENDING = 392 + DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS = 393 + DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS = 394 + DCGM_FI_DEV_ROW_REMAP_FAILURE = 395 + DCGM_FI_DEV_ROW_REMAP_PENDING = 396 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 = 400 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 = 401 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 = 402 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 = 403 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 = 404 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 = 405 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL = 409 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 = 410 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 = 411 + 
DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 = 412 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 = 413 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 = 414 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 = 415 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL = 419 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 = 420 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 = 421 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 = 422 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 = 423 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 = 424 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 = 425 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL = 429 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 = 430 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 = 431 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 = 432 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 = 433 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 = 434 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 = 435 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL = 439 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L0 = 440 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L1 = 441 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L2 = 442 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L3 = 443 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L4 = 444 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L5 = 445 + DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL = 449 + DCGM_FI_DEV_GPU_NVLINK_ERRORS = 450 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 = 451 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 = 452 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 = 453 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 = 454 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 = 455 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 = 456 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 = 457 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 = 458 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 = 459 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 = 460 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 = 461 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 = 462 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 = 463 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 = 464 + 
DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 = 465 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 = 466 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 = 467 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 = 468 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 = 469 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 = 470 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 = 471 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 = 472 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 = 473 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 = 474 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L6 = 475 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L7 = 476 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L8 = 477 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L9 = 478 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L10 = 479 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L11 = 480 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 = 406 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 = 407 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 = 408 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 = 481 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 = 482 + DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 = 483 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 = 416 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 = 417 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 = 418 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 = 484 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 = 485 + DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 = 486 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 = 426 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 = 427 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 = 428 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 = 487 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 = 488 + DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 = 489 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 = 436 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 = 437 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 = 438 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 = 491 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 = 492 + DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 = 493 + 
DCGM_FI_DEV_NVLINK_BANDWIDTH_L12 = 446 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L13 = 447 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L14 = 448 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L15 = 494 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L16 = 495 + DCGM_FI_DEV_NVLINK_BANDWIDTH_L17 = 496 + DCGM_FI_DEV_VIRTUAL_MODE = 500 + DCGM_FI_DEV_SUPPORTED_TYPE_INFO = 501 + DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS = 502 + DCGM_FI_DEV_VGPU_INSTANCE_IDS = 503 + DCGM_FI_DEV_VGPU_UTILIZATIONS = 504 + DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION = 505 + DCGM_FI_DEV_ENC_STATS = 506 + DCGM_FI_DEV_FBC_STATS = 507 + DCGM_FI_DEV_FBC_SESSIONS_INFO = 508 + DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS = 509 + DCGM_FI_DEV_VGPU_TYPE_INFO = 510 + DCGM_FI_DEV_VGPU_TYPE_NAME = 511 + DCGM_FI_DEV_VGPU_TYPE_CLASS = 512 + DCGM_FI_DEV_VGPU_TYPE_LICENSE = 513 + DCGM_FI_DEV_VGPU_VM_ID = 520 + DCGM_FI_DEV_VGPU_VM_NAME = 521 + DCGM_FI_DEV_VGPU_TYPE = 522 + DCGM_FI_DEV_VGPU_UUID = 523 + DCGM_FI_DEV_VGPU_DRIVER_VERSION = 524 + DCGM_FI_DEV_VGPU_MEMORY_USAGE = 525 + DCGM_FI_DEV_VGPU_LICENSE_STATUS = 526 + DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT = 527 + DCGM_FI_DEV_VGPU_ENC_STATS = 528 + DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO = 529 + DCGM_FI_DEV_VGPU_FBC_STATS = 530 + DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO = 531 + DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE = 532 + DCGM_FI_DEV_VGPU_PCI_ID = 533 + DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID = 534 + DCGM_FI_INTERNAL_FIELDS_0_START = 600 + DCGM_FI_INTERNAL_FIELDS_0_END = 699 + DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT = 701 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ = 702 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV = 703 + DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD = 704 + DCGM_FI_DEV_NVSWITCH_POWER_VDD = 705 + DCGM_FI_DEV_NVSWITCH_POWER_DVDD = 706 + DCGM_FI_DEV_NVSWITCH_POWER_HVDD = 707 + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX = 780 + DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX = 781 + DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS = 782 + DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS = 783 + DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS = 784 + DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS = 
785 + DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS = 786 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS = 787 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS = 788 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 = 789 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 = 790 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 = 791 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 = 792 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 = 793 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 = 794 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 = 795 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 = 796 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 = 797 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 = 798 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 = 799 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 = 800 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 = 801 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 = 802 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 = 803 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 = 804 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 = 805 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 = 806 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 = 807 + DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 = 808 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 = 809 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 = 810 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 = 811 + DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 = 812 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 = 813 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 = 814 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 = 815 + DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 = 816 + DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS = 856 + DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS = 857 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT = 858 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN = 859 + DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN = 860 + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX = 861 + DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX = 862 + DCGM_FI_DEV_NVSWITCH_PHYS_ID = 863 + DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED = 864 + 
DCGM_FI_DEV_NVSWITCH_LINK_ID = 865 + DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN = 866 + DCGM_FI_DEV_NVSWITCH_PCIE_BUS = 867 + DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE = 868 + DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION = 869 + DCGM_FI_DEV_NVSWITCH_LINK_STATUS = 870 + DCGM_FI_DEV_NVSWITCH_LINK_TYPE = 871 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN = 872 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS = 873 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE = 874 + DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION = 875 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID = 876 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID = 877 + DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID = 878 + DCGM_FI_PROF_GR_ENGINE_ACTIVE = 1001 + DCGM_FI_PROF_SM_ACTIVE = 1002 + DCGM_FI_PROF_SM_OCCUPANCY = 1003 + DCGM_FI_PROF_PIPE_TENSOR_ACTIVE = 1004 + DCGM_FI_PROF_DRAM_ACTIVE = 1005 + DCGM_FI_PROF_PIPE_FP64_ACTIVE = 1006 + DCGM_FI_PROF_PIPE_FP32_ACTIVE = 1007 + DCGM_FI_PROF_PIPE_FP16_ACTIVE = 1008 + DCGM_FI_PROF_PCIE_TX_BYTES = 1009 + DCGM_FI_PROF_PCIE_RX_BYTES = 1010 + DCGM_FI_PROF_NVLINK_TX_BYTES = 1011 + DCGM_FI_PROF_NVLINK_RX_BYTES = 1012 + DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE = 1013 + DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE = 1014 + DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE = 1015 + DCGM_FI_PROF_PIPE_INT_ACTIVE = 1016 + DCGM_FI_PROF_NVDEC0_ACTIVE = 1017 + DCGM_FI_PROF_NVDEC1_ACTIVE = 1018 + DCGM_FI_PROF_NVDEC2_ACTIVE = 1019 + DCGM_FI_PROF_NVDEC3_ACTIVE = 1020 + DCGM_FI_PROF_NVDEC4_ACTIVE = 1021 + DCGM_FI_PROF_NVDEC5_ACTIVE = 1022 + DCGM_FI_PROF_NVDEC6_ACTIVE = 1023 + DCGM_FI_PROF_NVDEC7_ACTIVE = 1024 + DCGM_FI_PROF_NVJPG0_ACTIVE = 1025 + DCGM_FI_PROF_NVJPG1_ACTIVE = 1026 + DCGM_FI_PROF_NVJPG2_ACTIVE = 1027 + DCGM_FI_PROF_NVJPG3_ACTIVE = 1028 + DCGM_FI_PROF_NVJPG4_ACTIVE = 1029 + DCGM_FI_PROF_NVJPG5_ACTIVE = 1030 + DCGM_FI_PROF_NVJPG6_ACTIVE = 1031 + DCGM_FI_PROF_NVJPG7_ACTIVE = 1032 + DCGM_FI_PROF_NVOFA0_ACTIVE = 1033 + DCGM_FI_PROF_NVLINK_L0_TX_BYTES = 1040 + DCGM_FI_PROF_NVLINK_L0_RX_BYTES = 1041 + DCGM_FI_PROF_NVLINK_L1_TX_BYTES = 1042 + 
DCGM_FI_PROF_NVLINK_L1_RX_BYTES = 1043 + DCGM_FI_PROF_NVLINK_L2_TX_BYTES = 1044 + DCGM_FI_PROF_NVLINK_L2_RX_BYTES = 1045 + DCGM_FI_PROF_NVLINK_L3_TX_BYTES = 1046 + DCGM_FI_PROF_NVLINK_L3_RX_BYTES = 1047 + DCGM_FI_PROF_NVLINK_L4_TX_BYTES = 1048 + DCGM_FI_PROF_NVLINK_L4_RX_BYTES = 1049 + DCGM_FI_PROF_NVLINK_L5_TX_BYTES = 1050 + DCGM_FI_PROF_NVLINK_L5_RX_BYTES = 1051 + DCGM_FI_PROF_NVLINK_L6_TX_BYTES = 1052 + DCGM_FI_PROF_NVLINK_L6_RX_BYTES = 1053 + DCGM_FI_PROF_NVLINK_L7_TX_BYTES = 1054 + DCGM_FI_PROF_NVLINK_L7_RX_BYTES = 1055 + DCGM_FI_PROF_NVLINK_L8_TX_BYTES = 1056 + DCGM_FI_PROF_NVLINK_L8_RX_BYTES = 1057 + DCGM_FI_PROF_NVLINK_L9_TX_BYTES = 1058 + DCGM_FI_PROF_NVLINK_L9_RX_BYTES = 1059 + DCGM_FI_PROF_NVLINK_L10_TX_BYTES = 1060 + DCGM_FI_PROF_NVLINK_L10_RX_BYTES = 1061 + DCGM_FI_PROF_NVLINK_L11_TX_BYTES = 1062 + DCGM_FI_PROF_NVLINK_L11_RX_BYTES = 1063 + DCGM_FI_PROF_NVLINK_L12_TX_BYTES = 1064 + DCGM_FI_PROF_NVLINK_L12_RX_BYTES = 1065 + DCGM_FI_PROF_NVLINK_L13_TX_BYTES = 1066 + DCGM_FI_PROF_NVLINK_L13_RX_BYTES = 1067 + DCGM_FI_PROF_NVLINK_L14_TX_BYTES = 1068 + DCGM_FI_PROF_NVLINK_L14_RX_BYTES = 1069 + DCGM_FI_PROF_NVLINK_L15_TX_BYTES = 1070 + DCGM_FI_PROF_NVLINK_L15_RX_BYTES = 1071 + DCGM_FI_PROF_NVLINK_L16_TX_BYTES = 1072 + DCGM_FI_PROF_NVLINK_L16_RX_BYTES = 1073 + DCGM_FI_PROF_NVLINK_L17_TX_BYTES = 1074 + DCGM_FI_PROF_NVLINK_L17_RX_BYTES = 1075 + DCGM_FI_DEV_CPU_UTIL_TOTAL = 1100 + DCGM_FI_DEV_CPU_UTIL_USER = 1101 + DCGM_FI_DEV_CPU_UTIL_NICE = 1102 + DCGM_FI_DEV_CPU_UTIL_SYS = 1103 + DCGM_FI_DEV_CPU_UTIL_IRQ = 1104 + DCGM_FI_DEV_CPU_TEMP_CURRENT = 1110 + DCGM_FI_DEV_CPU_TEMP_WARNING = 1111 + DCGM_FI_DEV_CPU_TEMP_CRITICAL = 1112 + DCGM_FI_DEV_CPU_CLOCK_CURRENT = 1120 + DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT = 1130 + DCGM_FI_DEV_CPU_POWER_LIMIT = 1131 + DCGM_FI_DEV_CPU_VENDOR = 1140 + DCGM_FI_DEV_CPU_MODEL = 1141 + DCGM_FI_MAX_FIELDS = 1142 + + DCGM_ST_OK = 0 + DCGM_ST_BADPARAM = -1 + DCGM_ST_GENERIC_ERROR = -3 + DCGM_ST_MEMORY = -4 + DCGM_ST_NOT_CONFIGURED = -5 + 
DCGM_ST_NOT_SUPPORTED = -6 + DCGM_ST_INIT_ERROR = -7 + DCGM_ST_NVML_ERROR = -8 + DCGM_ST_PENDING = -9 + DCGM_ST_UNINITIALIZED = -10 + DCGM_ST_TIMEOUT = -11 + DCGM_ST_VER_MISMATCH = -12 + DCGM_ST_UNKNOWN_FIELD = -13 + DCGM_ST_NO_DATA = -14 + DCGM_ST_STALE_DATA = -15 + DCGM_ST_NOT_WATCHED = -16 + DCGM_ST_NO_PERMISSION = -17 + DCGM_ST_GPU_IS_LOST = -18 + DCGM_ST_RESET_REQUIRED = -19 + DCGM_ST_FUNCTION_NOT_FOUND = -20 + DCGM_ST_CONNECTION_NOT_VALID = -21 + DCGM_ST_GPU_NOT_SUPPORTED = -22 + DCGM_ST_GROUP_INCOMPATIBLE = -23 + DCGM_ST_MAX_LIMIT = -24 + DCGM_ST_LIBRARY_NOT_FOUND = -25 + DCGM_ST_DUPLICATE_KEY = -26 + DCGM_ST_GPU_IN_SYNC_BOOST_GROUP = -27 + DCGM_ST_GPU_NOT_IN_SYNC_BOOST_GROUP = -28 + DCGM_ST_REQUIRES_ROOT = -29 + DCGM_ST_NVVS_ERROR = -30 + DCGM_ST_INSUFFICIENT_SIZE = -31 + DCGM_ST_FIELD_UNSUPPORTED_BY_API = -32 + DCGM_ST_MODULE_NOT_LOADED = -33 + DCGM_ST_IN_USE = -34 + DCGM_ST_GROUP_IS_EMPTY = -35 + DCGM_ST_PROFILING_NOT_SUPPORTED = -36 + DCGM_ST_PROFILING_LIBRARY_ERROR = -37 + DCGM_ST_PROFILING_MULTI_PASS = -38 + DCGM_ST_DIAG_ALREADY_RUNNING = -39 + DCGM_ST_DIAG_BAD_JSON = -40 + DCGM_ST_DIAG_BAD_LAUNCH = -41 + DCGM_ST_DIAG_UNUSED = -42 + DCGM_ST_DIAG_THRESHOLD_EXCEEDED = -43 + DCGM_ST_INSUFFICIENT_DRIVER_VERSION = -44 + DCGM_ST_INSTANCE_NOT_FOUND = -45 + DCGM_ST_COMPUTE_INSTANCE_NOT_FOUND = -46 + DCGM_ST_CHILD_NOT_KILLED = -47 + DCGM_ST_3RD_PARTY_LIBRARY_ERROR = -48 + DCGM_ST_INSUFFICIENT_RESOURCES = -49 + DCGM_ST_PLUGIN_EXCEPTION = -50 + DCGM_ST_NVVS_ISOLATE_ERROR = -51 + DCGM_ST_NVVS_BINARY_NOT_FOUND = -52 + DCGM_ST_NVVS_KILLED = -53 + DCGM_ST_PAUSED = -54 + DCGM_ST_ALREADY_INITIALIZED = -55 +) + +var DCGM_FI = map[string]Short{ + "DCGM_FT_BINARY": Short('b'), + "DCGM_FT_DOUBLE": Short('d'), + "DCGM_FT_INT64": Short('i'), + "DCGM_FT_STRING": Short('s'), + "DCGM_FT_TIMESTAMP": Short('t'), + + "DCGM_FI_UNKNOWN": 0, + "DCGM_FI_DRIVER_VERSION": 1, + "DCGM_FI_NVML_VERSION": 2, + "DCGM_FI_PROCESS_NAME": 3, + "DCGM_FI_DEV_COUNT": 4, + 
"DCGM_FI_CUDA_DRIVER_VERSION": 5, + "DCGM_FI_DEV_NAME": 50, + "DCGM_FI_DEV_BRAND": 51, + "DCGM_FI_DEV_NVML_INDEX": 52, + "DCGM_FI_DEV_SERIAL": 53, + "DCGM_FI_DEV_UUID": 54, + "DCGM_FI_DEV_MINOR_NUMBER": 55, + "DCGM_FI_DEV_OEM_INFOROM_VER": 56, + "DCGM_FI_DEV_PCI_BUSID": 57, + "DCGM_FI_DEV_PCI_COMBINED_ID": 58, + "DCGM_FI_DEV_PCI_SUBSYS_ID": 59, + "DCGM_FI_GPU_TOPOLOGY_PCI": 60, + "DCGM_FI_GPU_TOPOLOGY_NVLINK": 61, + "DCGM_FI_GPU_TOPOLOGY_AFFINITY": 62, + "DCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY": 63, + "DCGM_FI_DEV_COMPUTE_MODE": 65, + "DCGM_FI_DEV_PERSISTENCE_MODE": 66, + "DCGM_FI_DEV_MIG_MODE": 67, + "DCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR": 68, + "DCGM_FI_DEV_MIG_MAX_SLICES": 69, + "DCGM_FI_DEV_CPU_AFFINITY_0": 70, + "DCGM_FI_DEV_CPU_AFFINITY_1": 71, + "DCGM_FI_DEV_CPU_AFFINITY_2": 72, + "DCGM_FI_DEV_CPU_AFFINITY_3": 73, + "DCGM_FI_DEV_CC_MODE": 74, + "DCGM_FI_DEV_MIG_ATTRIBUTES": 75, + "DCGM_FI_DEV_MIG_GI_INFO": 76, + "DCGM_FI_DEV_MIG_CI_INFO": 77, + "DCGM_FI_DEV_ECC_INFOROM_VER": 80, + "DCGM_FI_DEV_POWER_INFOROM_VER": 81, + "DCGM_FI_DEV_INFOROM_IMAGE_VER": 82, + "DCGM_FI_DEV_INFOROM_CONFIG_CHECK": 83, + "DCGM_FI_DEV_INFOROM_CONFIG_VALID": 84, + "DCGM_FI_DEV_VBIOS_VERSION": 85, + "DCGM_FI_DEV_MEM_AFFINITY_0": 86, + "DCGM_FI_DEV_MEM_AFFINITY_1": 87, + "DCGM_FI_DEV_MEM_AFFINITY_2": 88, + "DCGM_FI_DEV_MEM_AFFINITY_3": 89, + "DCGM_FI_DEV_BAR1_TOTAL": 90, + "DCGM_FI_SYNC_BOOST": 91, + "DCGM_FI_DEV_BAR1_USED": 92, + "DCGM_FI_DEV_BAR1_FREE": 93, + "DCGM_FI_DEV_SM_CLOCK": 100, + "DCGM_FI_DEV_MEM_CLOCK": 101, + "DCGM_FI_DEV_VIDEO_CLOCK": 102, + "DCGM_FI_DEV_APP_SM_CLOCK": 110, + "DCGM_FI_DEV_APP_MEM_CLOCK": 111, + "DCGM_FI_DEV_CLOCK_THROTTLE_REASONS": 112, + "DCGM_FI_DEV_MAX_SM_CLOCK": 113, + "DCGM_FI_DEV_MAX_MEM_CLOCK": 114, + "DCGM_FI_DEV_MAX_VIDEO_CLOCK": 115, + "DCGM_FI_DEV_AUTOBOOST": 120, + "DCGM_FI_DEV_SUPPORTED_CLOCKS": 130, + "DCGM_FI_DEV_MEMORY_TEMP": 140, + "DCGM_FI_DEV_GPU_TEMP": 150, + "DCGM_FI_DEV_MEM_MAX_OP_TEMP": 151, + "DCGM_FI_DEV_GPU_MAX_OP_TEMP": 152, + 
"DCGM_FI_DEV_POWER_USAGE": 155, + "DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION": 156, + "DCGM_FI_DEV_POWER_USAGE_INSTANT": 157, + "DCGM_FI_DEV_SLOWDOWN_TEMP": 158, + "DCGM_FI_DEV_SHUTDOWN_TEMP": 159, + "DCGM_FI_DEV_POWER_MGMT_LIMIT": 160, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MIN": 161, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_MAX": 162, + "DCGM_FI_DEV_POWER_MGMT_LIMIT_DEF": 163, + "DCGM_FI_DEV_ENFORCED_POWER_LIMIT": 164, + "DCGM_FI_DEV_PSTATE": 190, + "DCGM_FI_DEV_FAN_SPEED": 191, + "DCGM_FI_DEV_PCIE_TX_THROUGHPUT": 200, + "DCGM_FI_DEV_PCIE_RX_THROUGHPUT": 201, + "DCGM_FI_DEV_PCIE_REPLAY_COUNTER": 202, + "DCGM_FI_DEV_GPU_UTIL": 203, + "DCGM_FI_DEV_MEM_COPY_UTIL": 204, + "DCGM_FI_DEV_ACCOUNTING_DATA": 205, + "DCGM_FI_DEV_ENC_UTIL": 206, + "DCGM_FI_DEV_DEC_UTIL": 207, + "DCGM_FI_DEV_XID_ERRORS": 230, + "DCGM_FI_DEV_PCIE_MAX_LINK_GEN": 235, + "DCGM_FI_DEV_PCIE_MAX_LINK_WIDTH": 236, + "DCGM_FI_DEV_PCIE_LINK_GEN": 237, + "DCGM_FI_DEV_PCIE_LINK_WIDTH": 238, + "DCGM_FI_DEV_POWER_VIOLATION": 240, + "DCGM_FI_DEV_THERMAL_VIOLATION": 241, + "DCGM_FI_DEV_SYNC_BOOST_VIOLATION": 242, + "DCGM_FI_DEV_BOARD_LIMIT_VIOLATION": 243, + "DCGM_FI_DEV_LOW_UTIL_VIOLATION": 244, + "DCGM_FI_DEV_RELIABILITY_VIOLATION": 245, + "DCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION": 246, + "DCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION": 247, + "DCGM_FI_DEV_FB_TOTAL": 250, + "DCGM_FI_DEV_FB_FREE": 251, + "DCGM_FI_DEV_FB_USED": 252, + "DCGM_FI_DEV_FB_RESERVED": 253, + "DCGM_FI_DEV_FB_USED_PERCENT": 254, + "DCGM_FI_DEV_C2C_LINK_COUNT": 285, + "DCGM_FI_DEV_C2C_LINK_STATUS": 286, + "DCGM_FI_DEV_C2C_MAX_BANDWIDTH": 287, + "DCGM_FI_DEV_ECC_CURRENT": 300, + "DCGM_FI_DEV_ECC_PENDING": 301, + "DCGM_FI_DEV_ECC_SBE_VOL_TOTAL": 310, + "DCGM_FI_DEV_ECC_DBE_VOL_TOTAL": 311, + "DCGM_FI_DEV_ECC_SBE_AGG_TOTAL": 312, + "DCGM_FI_DEV_ECC_DBE_AGG_TOTAL": 313, + "DCGM_FI_DEV_ECC_SBE_VOL_L1": 314, + "DCGM_FI_DEV_ECC_DBE_VOL_L1": 315, + "DCGM_FI_DEV_ECC_SBE_VOL_L2": 316, + "DCGM_FI_DEV_ECC_DBE_VOL_L2": 317, + "DCGM_FI_DEV_ECC_SBE_VOL_DEV": 318, + 
"DCGM_FI_DEV_ECC_DBE_VOL_DEV": 319, + "DCGM_FI_DEV_ECC_SBE_VOL_REG": 320, + "DCGM_FI_DEV_ECC_DBE_VOL_REG": 321, + "DCGM_FI_DEV_ECC_SBE_VOL_TEX": 322, + "DCGM_FI_DEV_ECC_DBE_VOL_TEX": 323, + "DCGM_FI_DEV_ECC_SBE_AGG_L1": 324, + "DCGM_FI_DEV_ECC_DBE_AGG_L1": 325, + "DCGM_FI_DEV_ECC_SBE_AGG_L2": 326, + "DCGM_FI_DEV_ECC_DBE_AGG_L2": 327, + "DCGM_FI_DEV_ECC_SBE_AGG_DEV": 328, + "DCGM_FI_DEV_ECC_DBE_AGG_DEV": 329, + "DCGM_FI_DEV_ECC_SBE_AGG_REG": 330, + "DCGM_FI_DEV_ECC_DBE_AGG_REG": 331, + "DCGM_FI_DEV_ECC_SBE_AGG_TEX": 332, + "DCGM_FI_DEV_ECC_DBE_AGG_TEX": 333, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX": 385, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH": 386, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL": 387, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW": 388, + "DCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE": 389, + "DCGM_FI_DEV_RETIRED_SBE": 390, + "DCGM_FI_DEV_RETIRED_DBE": 391, + "DCGM_FI_DEV_RETIRED_PENDING": 392, + "DCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS": 393, + "DCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS": 394, + "DCGM_FI_DEV_ROW_REMAP_FAILURE": 395, + "DCGM_FI_DEV_ROW_REMAP_PENDING": 396, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0": 400, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1": 401, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2": 402, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3": 403, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4": 404, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5": 405, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL": 409, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0": 410, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1": 411, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2": 412, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3": 413, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4": 414, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5": 415, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL": 419, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0": 420, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1": 421, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2": 422, 
+ "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3": 423, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4": 424, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5": 425, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL": 429, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0": 430, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1": 431, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2": 432, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3": 433, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4": 434, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5": 435, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL": 439, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L0": 440, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L1": 441, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L2": 442, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L3": 443, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L4": 444, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L5": 445, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL": 449, + "DCGM_FI_DEV_GPU_NVLINK_ERRORS": 450, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6": 451, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7": 452, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8": 453, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9": 454, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10": 455, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11": 456, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6": 457, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7": 458, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8": 459, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9": 460, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10": 461, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11": 462, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6": 463, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7": 464, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8": 465, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9": 466, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10": 467, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11": 468, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6": 469, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7": 470, + 
"DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8": 471, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9": 472, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10": 473, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11": 474, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L6": 475, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L7": 476, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L8": 477, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L9": 478, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L10": 479, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L11": 480, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12": 406, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13": 407, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14": 408, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15": 481, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16": 482, + "DCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17": 483, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12": 416, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13": 417, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14": 418, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15": 484, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16": 485, + "DCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17": 486, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12": 426, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13": 427, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14": 428, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15": 487, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16": 488, + "DCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17": 489, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12": 436, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13": 437, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14": 438, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15": 491, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16": 492, + "DCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17": 493, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L12": 446, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L13": 447, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L14": 448, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L15": 494, + "DCGM_FI_DEV_NVLINK_BANDWIDTH_L16": 495, + 
"DCGM_FI_DEV_NVLINK_BANDWIDTH_L17": 496, + "DCGM_FI_DEV_VIRTUAL_MODE": 500, + "DCGM_FI_DEV_SUPPORTED_TYPE_INFO": 501, + "DCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS": 502, + "DCGM_FI_DEV_VGPU_INSTANCE_IDS": 503, + "DCGM_FI_DEV_VGPU_UTILIZATIONS": 504, + "DCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION": 505, + "DCGM_FI_DEV_ENC_STATS": 506, + "DCGM_FI_DEV_FBC_STATS": 507, + "DCGM_FI_DEV_FBC_SESSIONS_INFO": 508, + "DCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS": 509, + "DCGM_FI_DEV_VGPU_TYPE_INFO": 510, + "DCGM_FI_DEV_VGPU_TYPE_NAME": 511, + "DCGM_FI_DEV_VGPU_TYPE_CLASS": 512, + "DCGM_FI_DEV_VGPU_TYPE_LICENSE": 513, + "DCGM_FI_DEV_VGPU_VM_ID": 520, + "DCGM_FI_DEV_VGPU_VM_NAME": 521, + "DCGM_FI_DEV_VGPU_TYPE": 522, + "DCGM_FI_DEV_VGPU_UUID": 523, + "DCGM_FI_DEV_VGPU_DRIVER_VERSION": 524, + "DCGM_FI_DEV_VGPU_MEMORY_USAGE": 525, + "DCGM_FI_DEV_VGPU_LICENSE_STATUS": 526, + "DCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT": 527, + "DCGM_FI_DEV_VGPU_ENC_STATS": 528, + "DCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO": 529, + "DCGM_FI_DEV_VGPU_FBC_STATS": 530, + "DCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO": 531, + "DCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE": 532, + "DCGM_FI_DEV_VGPU_PCI_ID": 533, + "DCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID": 534, + "DCGM_FI_INTERNAL_FIELDS_0_START": 600, + "DCGM_FI_INTERNAL_FIELDS_0_END": 699, + "DCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT": 701, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ": 702, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV": 703, + "DCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD": 704, + "DCGM_FI_DEV_NVSWITCH_POWER_VDD": 705, + "DCGM_FI_DEV_NVSWITCH_POWER_DVDD": 706, + "DCGM_FI_DEV_NVSWITCH_POWER_HVDD": 707, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX": 780, + "DCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX": 781, + "DCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS": 782, + "DCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS": 783, + "DCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS": 784, + "DCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS": 785, + "DCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS": 786, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS": 787, + 
"DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS": 788, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0": 789, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1": 790, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2": 791, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3": 792, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0": 793, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1": 794, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2": 795, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3": 796, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0": 797, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1": 798, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2": 799, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3": 800, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0": 801, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1": 802, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2": 803, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3": 804, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0": 805, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1": 806, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2": 807, + "DCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3": 808, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0": 809, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1": 810, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2": 811, + "DCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3": 812, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0": 813, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1": 814, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2": 815, + "DCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3": 816, + "DCGM_FI_DEV_NVSWITCH_FATAL_ERRORS": 856, + "DCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS": 857, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT": 858, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN": 859, + "DCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN": 860, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX": 861, + "DCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX": 862, + "DCGM_FI_DEV_NVSWITCH_PHYS_ID": 863, + "DCGM_FI_DEV_NVSWITCH_RESET_REQUIRED": 864, + 
"DCGM_FI_DEV_NVSWITCH_LINK_ID": 865, + "DCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN": 866, + "DCGM_FI_DEV_NVSWITCH_PCIE_BUS": 867, + "DCGM_FI_DEV_NVSWITCH_PCIE_DEVICE": 868, + "DCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION": 869, + "DCGM_FI_DEV_NVSWITCH_LINK_STATUS": 870, + "DCGM_FI_DEV_NVSWITCH_LINK_TYPE": 871, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN": 872, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS": 873, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE": 874, + "DCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION": 875, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID": 876, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID": 877, + "DCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID": 878, + "DCGM_FI_PROF_GR_ENGINE_ACTIVE": 1001, + "DCGM_FI_PROF_SM_ACTIVE": 1002, + "DCGM_FI_PROF_SM_OCCUPANCY": 1003, + "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE": 1004, + "DCGM_FI_PROF_DRAM_ACTIVE": 1005, + "DCGM_FI_PROF_PIPE_FP64_ACTIVE": 1006, + "DCGM_FI_PROF_PIPE_FP32_ACTIVE": 1007, + "DCGM_FI_PROF_PIPE_FP16_ACTIVE": 1008, + "DCGM_FI_PROF_PCIE_TX_BYTES": 1009, + "DCGM_FI_PROF_PCIE_RX_BYTES": 1010, + "DCGM_FI_PROF_NVLINK_TX_BYTES": 1011, + "DCGM_FI_PROF_NVLINK_RX_BYTES": 1012, + "DCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE": 1013, + "DCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE": 1014, + "DCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE": 1015, + "DCGM_FI_PROF_PIPE_INT_ACTIVE": 1016, + "DCGM_FI_PROF_NVDEC0_ACTIVE": 1017, + "DCGM_FI_PROF_NVDEC1_ACTIVE": 1018, + "DCGM_FI_PROF_NVDEC2_ACTIVE": 1019, + "DCGM_FI_PROF_NVDEC3_ACTIVE": 1020, + "DCGM_FI_PROF_NVDEC4_ACTIVE": 1021, + "DCGM_FI_PROF_NVDEC5_ACTIVE": 1022, + "DCGM_FI_PROF_NVDEC6_ACTIVE": 1023, + "DCGM_FI_PROF_NVDEC7_ACTIVE": 1024, + "DCGM_FI_PROF_NVJPG0_ACTIVE": 1025, + "DCGM_FI_PROF_NVJPG1_ACTIVE": 1026, + "DCGM_FI_PROF_NVJPG2_ACTIVE": 1027, + "DCGM_FI_PROF_NVJPG3_ACTIVE": 1028, + "DCGM_FI_PROF_NVJPG4_ACTIVE": 1029, + "DCGM_FI_PROF_NVJPG5_ACTIVE": 1030, + "DCGM_FI_PROF_NVJPG6_ACTIVE": 1031, + "DCGM_FI_PROF_NVJPG7_ACTIVE": 1032, + "DCGM_FI_PROF_NVOFA0_ACTIVE": 1033, + "DCGM_FI_PROF_NVLINK_L0_TX_BYTES": 
1040, + "DCGM_FI_PROF_NVLINK_L0_RX_BYTES": 1041, + "DCGM_FI_PROF_NVLINK_L1_TX_BYTES": 1042, + "DCGM_FI_PROF_NVLINK_L1_RX_BYTES": 1043, + "DCGM_FI_PROF_NVLINK_L2_TX_BYTES": 1044, + "DCGM_FI_PROF_NVLINK_L2_RX_BYTES": 1045, + "DCGM_FI_PROF_NVLINK_L3_TX_BYTES": 1046, + "DCGM_FI_PROF_NVLINK_L3_RX_BYTES": 1047, + "DCGM_FI_PROF_NVLINK_L4_TX_BYTES": 1048, + "DCGM_FI_PROF_NVLINK_L4_RX_BYTES": 1049, + "DCGM_FI_PROF_NVLINK_L5_TX_BYTES": 1050, + "DCGM_FI_PROF_NVLINK_L5_RX_BYTES": 1051, + "DCGM_FI_PROF_NVLINK_L6_TX_BYTES": 1052, + "DCGM_FI_PROF_NVLINK_L6_RX_BYTES": 1053, + "DCGM_FI_PROF_NVLINK_L7_TX_BYTES": 1054, + "DCGM_FI_PROF_NVLINK_L7_RX_BYTES": 1055, + "DCGM_FI_PROF_NVLINK_L8_TX_BYTES": 1056, + "DCGM_FI_PROF_NVLINK_L8_RX_BYTES": 1057, + "DCGM_FI_PROF_NVLINK_L9_TX_BYTES": 1058, + "DCGM_FI_PROF_NVLINK_L9_RX_BYTES": 1059, + "DCGM_FI_PROF_NVLINK_L10_TX_BYTES": 1060, + "DCGM_FI_PROF_NVLINK_L10_RX_BYTES": 1061, + "DCGM_FI_PROF_NVLINK_L11_TX_BYTES": 1062, + "DCGM_FI_PROF_NVLINK_L11_RX_BYTES": 1063, + "DCGM_FI_PROF_NVLINK_L12_TX_BYTES": 1064, + "DCGM_FI_PROF_NVLINK_L12_RX_BYTES": 1065, + "DCGM_FI_PROF_NVLINK_L13_TX_BYTES": 1066, + "DCGM_FI_PROF_NVLINK_L13_RX_BYTES": 1067, + "DCGM_FI_PROF_NVLINK_L14_TX_BYTES": 1068, + "DCGM_FI_PROF_NVLINK_L14_RX_BYTES": 1069, + "DCGM_FI_PROF_NVLINK_L15_TX_BYTES": 1070, + "DCGM_FI_PROF_NVLINK_L15_RX_BYTES": 1071, + "DCGM_FI_PROF_NVLINK_L16_TX_BYTES": 1072, + "DCGM_FI_PROF_NVLINK_L16_RX_BYTES": 1073, + "DCGM_FI_PROF_NVLINK_L17_TX_BYTES": 1074, + "DCGM_FI_PROF_NVLINK_L17_RX_BYTES": 1075, + "DCGM_FI_DEV_CPU_UTIL_TOTAL": 1100, + "DCGM_FI_DEV_CPU_UTIL_USER": 1101, + "DCGM_FI_DEV_CPU_UTIL_NICE": 1102, + "DCGM_FI_DEV_CPU_UTIL_SYS": 1103, + "DCGM_FI_DEV_CPU_UTIL_IRQ": 1104, + "DCGM_FI_DEV_CPU_TEMP_CURRENT": 1110, + "DCGM_FI_DEV_CPU_TEMP_WARNING": 1111, + "DCGM_FI_DEV_CPU_TEMP_CRITICAL": 1112, + "DCGM_FI_DEV_CPU_CLOCK_CURRENT": 1120, + "DCGM_FI_DEV_CPU_POWER_UTIL_CURRENT": 1130, + "DCGM_FI_DEV_CPU_POWER_LIMIT": 1131, + "DCGM_FI_DEV_CPU_VENDOR": 1140, + 
"DCGM_FI_DEV_CPU_MODEL": 1141, + "DCGM_FI_MAX_FIELDS": 1142, +} + +const ( + DCGM_FV_FLAG_LIVE_DATA = uint(0x00000001) ) type HealthSystem uint const ( - DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 - DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 - DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 - DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF // All watches only support PCIE watch now + DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 + DCGM_HEALTH_WATCH_NVLINK HealthSystem = 0x2 + DCGM_HEALTH_WATCH_PMU HealthSystem = 0x4 + DCGM_HEALTH_WATCH_MCU HealthSystem = 0x8 + DCGM_HEALTH_WATCH_MEM HealthSystem = 0x10 + DCGM_HEALTH_WATCH_SM HealthSystem = 0x20 + DCGM_HEALTH_WATCH_INFOROM HealthSystem = 0x40 + DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 + DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 + DCGM_HEALTH_WATCH_DRIVER HealthSystem = 0x200 + DCGM_HEALTH_WATCH_NVSWITCH_NONFATAL HealthSystem = 0x400 + DCGM_HEALTH_WATCH_NVSWITCH_FATAL HealthSystem = 0x800 + DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF ) type HealthResult uint @@ -153,3 +989,120 @@ const ( DCGM_HEALTH_RESULT_WARN HealthResult = 10 // A warning has been issued, refer to the response for more information DCGM_HEALTH_RESULT_FAIL HealthResult = 20 // A failure has been issued, refer to the response for more information ) + +// HealthCheckErrorCode error codes for passive and active health checks. 
// HealthCheckErrorCode is the numeric error code attached to passive and
// active health-check results. It is a plain unsigned integer; the named
// values are listed in the const block that follows.
type HealthCheckErrorCode uint

// Health-check error codes, numbered contiguously from 0 (DCGM_FR_OK) to 110
// (DCGM_FR_ERROR_SENTINEL). Each value is written out explicitly rather than
// derived with iota, so a constant keeps its number if entries are ever
// inserted or reordered. The leading number in each trailing comment repeats
// the constant's value.
// NOTE(review): the names look like a mirror of the DCGM_FR_* codes in the C
// headers - confirm the numbering against the header before editing.
const (
	DCGM_FR_OK                              HealthCheckErrorCode = 0   // 0 No error
	DCGM_FR_UNKNOWN                         HealthCheckErrorCode = 1   // 1 Unknown error code
	DCGM_FR_UNRECOGNIZED                    HealthCheckErrorCode = 2   // 2 Unrecognized error code
	DCGM_FR_PCI_REPLAY_RATE                 HealthCheckErrorCode = 3   // 3 Unacceptable rate of PCI errors
	DCGM_FR_VOLATILE_DBE_DETECTED           HealthCheckErrorCode = 4   // 4 Uncorrectable volatile double bit error
	DCGM_FR_VOLATILE_SBE_DETECTED           HealthCheckErrorCode = 5   // 5 Unacceptable rate of volatile single bit errors
	DCGM_FR_PENDING_PAGE_RETIREMENTS        HealthCheckErrorCode = 6   // 6 Pending page retirements detected
	DCGM_FR_RETIRED_PAGES_LIMIT             HealthCheckErrorCode = 7   // 7 Unacceptable total page retirements detected
	DCGM_FR_RETIRED_PAGES_DBE_LIMIT         HealthCheckErrorCode = 8   // 8 Unacceptable total page retirements due to uncorrectable errors
	DCGM_FR_CORRUPT_INFOROM                 HealthCheckErrorCode = 9   // 9 Corrupt inforom found
	DCGM_FR_CLOCK_THROTTLE_THERMAL          HealthCheckErrorCode = 10  // 10 Clocks being throttled due to overheating
	DCGM_FR_POWER_UNREADABLE                HealthCheckErrorCode = 11  // 11 Cannot get a reading for power from NVML
	DCGM_FR_CLOCK_THROTTLE_POWER            HealthCheckErrorCode = 12  // 12 Clock being throttled due to power restrictions
	DCGM_FR_NVLINK_ERROR_THRESHOLD          HealthCheckErrorCode = 13  // 13 Unacceptable rate of NVLink errors
	DCGM_FR_NVLINK_DOWN                     HealthCheckErrorCode = 14  // 14 NVLink is down
	DCGM_FR_NVSWITCH_FATAL_ERROR            HealthCheckErrorCode = 15  // 15 Fatal errors on the NVSwitch
	DCGM_FR_NVSWITCH_NON_FATAL_ERROR        HealthCheckErrorCode = 16  // 16 Non-fatal errors on the NVSwitch
	DCGM_FR_NVSWITCH_DOWN                   HealthCheckErrorCode = 17  // 17 NVSwitch is down - NOT USED: DEPRECATED
	DCGM_FR_NO_ACCESS_TO_FILE               HealthCheckErrorCode = 18  // 18 Cannot access a file
	DCGM_FR_NVML_API                        HealthCheckErrorCode = 19  // 19 Error occurred on an NVML API - NOT USED: DEPRECATED
	DCGM_FR_DEVICE_COUNT_MISMATCH           HealthCheckErrorCode = 20  // 20 Disagreement in GPU count between /dev and NVML
	DCGM_FR_BAD_PARAMETER                   HealthCheckErrorCode = 21  // 21 Bad parameter passed to API
	DCGM_FR_CANNOT_OPEN_LIB                 HealthCheckErrorCode = 22  // 22 Cannot open a library that must be accessed
	DCGM_FR_DENYLISTED_DRIVER               HealthCheckErrorCode = 23  // 23 A driver on the denylist (nouveau) is active
	DCGM_FR_NVML_LIB_BAD                    HealthCheckErrorCode = 24  // 24 NVML library is missing expected functions - NOT USED: DEPRECATED
	DCGM_FR_GRAPHICS_PROCESSES              HealthCheckErrorCode = 25  // 25 Graphics processes are active on this GPU
	DCGM_FR_HOSTENGINE_CONN                 HealthCheckErrorCode = 26  // 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED
	DCGM_FR_FIELD_QUERY                     HealthCheckErrorCode = 27  // 27 Error querying a field from DCGM
	DCGM_FR_BAD_CUDA_ENV                    HealthCheckErrorCode = 28  // 28 The environment has variables that hurt CUDA
	DCGM_FR_PERSISTENCE_MODE                HealthCheckErrorCode = 29  // 29 Persistence mode is disabled
	DCGM_FR_LOW_BANDWIDTH                   HealthCheckErrorCode = 30  // 30 The bandwidth is unacceptably low
	DCGM_FR_HIGH_LATENCY                    HealthCheckErrorCode = 31  // 31 Latency is too high
	DCGM_FR_CANNOT_GET_FIELD_TAG            HealthCheckErrorCode = 32  // 32 Cannot find a tag for a field
	DCGM_FR_FIELD_VIOLATION                 HealthCheckErrorCode = 33  // 33 The value for the specified error field is above 0
	DCGM_FR_FIELD_THRESHOLD                 HealthCheckErrorCode = 34  // 34 The value for the specified field is above the threshold
	DCGM_FR_FIELD_VIOLATION_DBL             HealthCheckErrorCode = 35  // 35 The value for the specified error field is above 0
	DCGM_FR_FIELD_THRESHOLD_DBL             HealthCheckErrorCode = 36  // 36 The value for the specified field is above the threshold
	DCGM_FR_UNSUPPORTED_FIELD_TYPE          HealthCheckErrorCode = 37  // 37 Field type cannot be supported
	DCGM_FR_FIELD_THRESHOLD_TS              HealthCheckErrorCode = 38  // 38 The value for the specified field is above the threshold
	DCGM_FR_FIELD_THRESHOLD_TS_DBL          HealthCheckErrorCode = 39  // 39 The value for the specified field is above the threshold
	DCGM_FR_THERMAL_VIOLATIONS              HealthCheckErrorCode = 40  // 40 Thermal violations detected
	DCGM_FR_THERMAL_VIOLATIONS_TS           HealthCheckErrorCode = 41  // 41 Thermal violations detected with a timestamp
	DCGM_FR_TEMP_VIOLATION                  HealthCheckErrorCode = 42  // 42 Temperature is too high
	DCGM_FR_THROTTLING_VIOLATION            HealthCheckErrorCode = 43  // 43 Non-benign clock throttling is occurring
	DCGM_FR_INTERNAL                        HealthCheckErrorCode = 44  // 44 An internal error was detected
	DCGM_FR_PCIE_GENERATION                 HealthCheckErrorCode = 45  // 45 PCIe generation is too low
	DCGM_FR_PCIE_WIDTH                      HealthCheckErrorCode = 46  // 46 PCIe width is too low
	DCGM_FR_ABORTED                         HealthCheckErrorCode = 47  // 47 Test was aborted by a user signal
	DCGM_FR_TEST_DISABLED                   HealthCheckErrorCode = 48  // 48 This test is disabled for this GPU
	DCGM_FR_CANNOT_GET_STAT                 HealthCheckErrorCode = 49  // 49 Cannot get telemetry for a needed value
	DCGM_FR_STRESS_LEVEL                    HealthCheckErrorCode = 50  // 50 Stress level is too low (bad performance)
	DCGM_FR_CUDA_API                        HealthCheckErrorCode = 51  // 51 Error calling the specified CUDA API
	DCGM_FR_FAULTY_MEMORY                   HealthCheckErrorCode = 52  // 52 Faulty memory detected on this GPU
	DCGM_FR_CANNOT_SET_WATCHES              HealthCheckErrorCode = 53  // 53 Unable to set field watches in DCGM - NOT USED: DEPRECATED
	DCGM_FR_CUDA_UNBOUND                    HealthCheckErrorCode = 54  // 54 CUDA context is no longer bound
	DCGM_FR_ECC_DISABLED                    HealthCheckErrorCode = 55  // 55 ECC memory is disabled right now
	DCGM_FR_MEMORY_ALLOC                    HealthCheckErrorCode = 56  // 56 Cannot allocate memory on the GPU
	DCGM_FR_CUDA_DBE                        HealthCheckErrorCode = 57  // 57 CUDA detected unrecoverable double-bit error
	DCGM_FR_MEMORY_MISMATCH                 HealthCheckErrorCode = 58  // 58 Memory error detected
	DCGM_FR_CUDA_DEVICE                     HealthCheckErrorCode = 59  // 59 No CUDA device discoverable for existing GPU
	DCGM_FR_ECC_UNSUPPORTED                 HealthCheckErrorCode = 60  // 60 ECC memory is unsupported by this SKU
	DCGM_FR_ECC_PENDING                     HealthCheckErrorCode = 61  // 61 ECC memory is in a pending state - NOT USED: DEPRECATED
	DCGM_FR_MEMORY_BANDWIDTH                HealthCheckErrorCode = 62  // 62 Memory bandwidth is too low
	DCGM_FR_TARGET_POWER                    HealthCheckErrorCode = 63  // 63 Cannot hit the target power draw
	DCGM_FR_API_FAIL                        HealthCheckErrorCode = 64  // 64 The specified API call failed
	DCGM_FR_API_FAIL_GPU                    HealthCheckErrorCode = 65  // 65 The specified API call failed for the specified GPU
	DCGM_FR_CUDA_CONTEXT                    HealthCheckErrorCode = 66  // 66 Cannot create a CUDA context on this GPU
	DCGM_FR_DCGM_API                        HealthCheckErrorCode = 67  // 67 DCGM API failure
	DCGM_FR_CONCURRENT_GPUS                 HealthCheckErrorCode = 68  // 68 Need multiple GPUs to run this test
	DCGM_FR_TOO_MANY_ERRORS                 HealthCheckErrorCode = 69  // 69 More errors than fit in the return struct - NOT USED: DEPRECATED
	DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD      HealthCheckErrorCode = 70  // 70 More than 100 CRC errors are happening per second
	DCGM_FR_NVLINK_ERROR_CRITICAL           HealthCheckErrorCode = 71  // 71 NVLink error for a field that should always be 0
	DCGM_FR_ENFORCED_POWER_LIMIT            HealthCheckErrorCode = 72  // 72 The enforced power limit is too low to hit the target
	DCGM_FR_MEMORY_ALLOC_HOST               HealthCheckErrorCode = 73  // 73 Cannot allocate memory on the host
	DCGM_FR_GPU_OP_MODE                     HealthCheckErrorCode = 74  // 74 Bad GPU operating mode for running plugin - NOT USED: DEPRECATED
	DCGM_FR_NO_MEMORY_CLOCKS                HealthCheckErrorCode = 75  // 75 No memory clocks with the needed MHz found - NOT USED: DEPRECATED
	DCGM_FR_NO_GRAPHICS_CLOCKS              HealthCheckErrorCode = 76  // 76 No graphics clocks with the needed MHz found - NOT USED: DEPRECATED
	DCGM_FR_HAD_TO_RESTORE_STATE            HealthCheckErrorCode = 77  // 77 Note that we had to restore a GPU's state
	DCGM_FR_L1TAG_UNSUPPORTED               HealthCheckErrorCode = 78  // 78 L1TAG test is unsupported by this SKU
	DCGM_FR_L1TAG_MISCOMPARE                HealthCheckErrorCode = 79  // 79 L1TAG test failed on a miscompare
	DCGM_FR_ROW_REMAP_FAILURE               HealthCheckErrorCode = 80  // 80 Row remapping failed (Ampere or newer GPUs)
	DCGM_FR_UNCONTAINED_ERROR               HealthCheckErrorCode = 81  // 81 Uncontained error - XID 95
	DCGM_FR_EMPTY_GPU_LIST                  HealthCheckErrorCode = 82  // 82 No GPU information given to plugin
	DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS    HealthCheckErrorCode = 83  // 83 Pending page retirements due to a DBE
	DCGM_FR_UNCORRECTABLE_ROW_REMAP         HealthCheckErrorCode = 84  // 84 Uncorrectable row remapping
	DCGM_FR_PENDING_ROW_REMAP               HealthCheckErrorCode = 85  // 85 Row remapping is pending
	DCGM_FR_BROKEN_P2P_MEMORY_DEVICE        HealthCheckErrorCode = 86  // 86 P2P copy test detected an error writing to this GPU
	DCGM_FR_BROKEN_P2P_WRITER_DEVICE        HealthCheckErrorCode = 87  // 87 P2P copy test detected an error writing from this GPU
	DCGM_FR_NVSWITCH_NVLINK_DOWN            HealthCheckErrorCode = 88  // 88 An NvLink is down for the specified NVSwitch - NOT USED: DEPRECATED
	DCGM_FR_EUD_BINARY_PERMISSIONS          HealthCheckErrorCode = 89  // 89 EUD binary permissions are incorrect
	DCGM_FR_EUD_NON_ROOT_USER               HealthCheckErrorCode = 90  // 90 EUD plugin is not running as root
	DCGM_FR_EUD_SPAWN_FAILURE               HealthCheckErrorCode = 91  // 91 EUD plugin failed to spawn the EUD binary
	DCGM_FR_EUD_TIMEOUT                     HealthCheckErrorCode = 92  // 92 EUD plugin timed out
	DCGM_FR_EUD_ZOMBIE                      HealthCheckErrorCode = 93  // 93 EUD process remains running after the plugin considers it finished
	DCGM_FR_EUD_NON_ZERO_EXIT_CODE          HealthCheckErrorCode = 94  // 94 EUD process exited with a non-zero exit code
	DCGM_FR_EUD_TEST_FAILED                 HealthCheckErrorCode = 95  // 95 EUD test failed
	DCGM_FR_FILE_CREATE_PERMISSIONS         HealthCheckErrorCode = 96  // 96 We cannot create a file in this directory.
	DCGM_FR_PAUSE_RESUME_FAILED             HealthCheckErrorCode = 97  // 97 Pause/Resume failed
	DCGM_FR_PCIE_H_REPLAY_VIOLATION         HealthCheckErrorCode = 98  // 98 PCIe test caught correctable errors
	DCGM_FR_GPU_EXPECTED_NVLINKS_UP         HealthCheckErrorCode = 99  // 99 Expected nvlinks up per gpu
	DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP    HealthCheckErrorCode = 100 // 100 Expected nvlinks up per nvswitch
	DCGM_FR_XID_ERROR                       HealthCheckErrorCode = 101 // 101 XID error detected
	DCGM_FR_SBE_VIOLATION                   HealthCheckErrorCode = 102 // 102 Single bit error detected
	DCGM_FR_DBE_VIOLATION                   HealthCheckErrorCode = 103 // 103 Double bit error detected
	DCGM_FR_PCIE_REPLAY_VIOLATION           HealthCheckErrorCode = 104 // 104 PCIe replay errors detected
	DCGM_FR_SBE_THRESHOLD_VIOLATION         HealthCheckErrorCode = 105 // 105 SBE threshold violated
	DCGM_FR_DBE_THRESHOLD_VIOLATION         HealthCheckErrorCode = 106 // 106 DBE threshold violated
	DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION HealthCheckErrorCode = 107 // 107 PCIE replay count violated
	DCGM_FR_CUDA_FM_NOT_INITIALIZED         HealthCheckErrorCode = 108 // 108 The fabricmanager is not initialized
	DCGM_FR_SXID_ERROR                      HealthCheckErrorCode = 109 // 109 NvSwitch fatal error detected
	DCGM_FR_ERROR_SENTINEL                  HealthCheckErrorCode = 110 // 110 MUST BE THE LAST ERROR CODE
)
DeviceInfo{}, err + fmt.Printf("Error getting cpu affinity, set CPU Affinity to N/A, err: %s", err) } - numaAffinity, err := getNUMAAffinity(gpuId) + numaAffinity, err := getAffinity(gpuId, "NUMA") if err != nil { - return DeviceInfo{}, err + fmt.Printf("Error getting numa affinity, set NUMA Affinity to N/A, err: %s", err) } var topology []P2PLink @@ -220,15 +220,9 @@ func getSupportedDevices() (gpus []uint, err error) { } return } -func getCPUAffinity(gpuId uint) (string, error) { - return getAffinity(gpuId, "CPU") -} - -func getNUMAAffinity(gpuId uint) (string, error) { - return getAffinity(gpuId, "MEM") -} -func getAffinity(gpuId uint, typ string) (string, error) { +// if err is not nil, return "N/A" as result +func getAffinity(gpuId uint, typ string) (result string, err error) { const ( affinity0 int = iota affinity1 @@ -244,7 +238,7 @@ func getAffinity(gpuId uint, typ string) (string, error) { affFields[affinity1] = DCGM_FI_DEV_CPU_AFFINITY_1 affFields[affinity2] = DCGM_FI_DEV_CPU_AFFINITY_2 affFields[affinity3] = DCGM_FI_DEV_CPU_AFFINITY_3 - case "MEM": + case "NUMA": affFields[affinity0] = DCGM_FI_DEV_MEM_AFFINITY_0 affFields[affinity1] = DCGM_FI_DEV_MEM_AFFINITY_1 affFields[affinity2] = DCGM_FI_DEV_MEM_AFFINITY_2 @@ -253,14 +247,14 @@ func getAffinity(gpuId uint, typ string) (string, error) { return "N/A", fmt.Errorf("not supported affinity type: %s", typ) } - fieldGrpName := fmt.Sprintf("affFields%d", gpuId) + fieldGrpName := fmt.Sprintf("%sAffFields%d", typ, gpuId) fieldGrpHdl, err := FieldGroupCreate(fieldGrpName, affFields) if err != nil { return "N/A", err } defer FieldGroupDestroy(fieldGrpHdl) - gpuGrpName := fmt.Sprintf("aff%d", gpuId) + gpuGrpName := fmt.Sprintf("%sAff%d", typ, gpuId) gpuGrpHdl, err := WatchFields([]uint{gpuId}, fieldGrpHdl, gpuGrpName) if err != nil { return "N/A", err @@ -269,16 +263,26 @@ func getAffinity(gpuId uint, typ string) (string, error) { values, err := GetLatestValuesForFields(gpuId, affFields) if err != nil { - return 
"N/A", err + return "N/A", fmt.Errorf("Error getting %s affinity: %s", typ, err) } - bits := make([]uint64, 4) - bits[0] = uint64(values[affinity0].Int64()) - bits[1] = uint64(values[affinity1].Int64()) - bits[2] = uint64(values[affinity2].Int64()) - bits[3] = uint64(values[affinity3].Int64()) + bits := make([]int64, 4) + bits[0] = values[affinity0].Int64() + bits[1] = values[affinity1].Int64() + bits[2] = values[affinity2].Int64() + bits[3] = values[affinity3].Int64() + for _, bit := range bits { + if bit >= DCGM_FT_INT64_BLANK { + // Retrieved affinity value is invalid. + return "N/A", nil + } + } - b := bitset.From(bits) + ubits := make([]uint64, len(bits)) + for i, val := range bits { + ubits[i] = uint64(val) + } + b := bitset.From(ubits) str := convertBitsetStr(b.String()) return str, nil } diff --git a/pkg/ixdcgm/diag.go b/pkg/ixdcgm/diag.go new file mode 100644 index 0000000..90e8eac --- /dev/null +++ b/pkg/ixdcgm/diag.go @@ -0,0 +1,238 @@ +/* +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this file except in compliance with the License. You may obtain +a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package ixdcgm
+
+/*
+#include "include/dcgm_agent.h"
+#include "include/dcgm_structs.h"
+*/
+import "C"
+import (
+	"context"
+	"fmt"
+	"time"
+	"unsafe"
+)
+
+const (
+	DCGM_PER_GPU_TEST_COUNT_V8 = 13 // number of per-GPU result slots RunDiag walks for each GPU
+	DIAG_RESULT_STRING_SIZE    = 1024
+
+	UNDEFINED_SWTEST = "UNDEFINED_SWTEST" // returned by swTestName for an unrecognised index
+	UNUSED_GPUTEST   = "UNUSED_GPUTEST"   // returned by gpuTestName for an unrecognised index
+)
+
+// DiagType selects how thorough a diagnostic run is; diagLevel maps it to
+// the corresponding C.dcgmDiagnosticLevel_t value.
+type DiagType int
+
+const (
+	DiagQuick    DiagType = 1 // run a very basic health check on the system
+	DiagMedium   DiagType = 2 // run a medium-length diagnostic (a few minutes)
+	DiagLong     DiagType = 3 // run an extensive diagnostic (several minutes)
+	DiagExtended DiagType = 4 // run a very extensive diagnostic (many minutes)
+)
+
+// DiagResult is the outcome of a single diagnostic test.
+type DiagResult struct {
+	Status       string // textual status produced by diagResultString ("pass", "fail", ...)
+	TestName     string // human-readable test name
+	TestOutput   string // the test's info text
+	ErrorCode    uint   // code of the first error entry reported for the test
+	ErrorMessage string // message of the first error entry reported for the test
+}
+
+// GpuResult groups the per-test results reported for one GPU.
+type GpuResult struct {
+	GPU         uint // gpuId from the per-GPU response
+	RC          uint // hwDiagnosticReturn from the per-GPU response
+	DiagResults []DiagResult
+}
+
+// DiagResults is the complete result of a diagnostic run: the system-wide
+// software checks followed by the per-GPU tests.
+type DiagResults struct {
+	Software []DiagResult
+	PerGpu   []GpuResult
+	gpuCount uint // gpuCount exactly as reported in the C response
+}
+
+// diagResultString converts a DCGM_DIAG_RESULT_* value to its display string.
+// Unknown values yield the empty string.
+func diagResultString(r int) string {
+	switch r {
+	case C.DCGM_DIAG_RESULT_PASS:
+		return "pass"
+	case C.DCGM_DIAG_RESULT_SKIP:
+		return "skipped"
+	case C.DCGM_DIAG_RESULT_WARN:
+		return "warn"
+	case C.DCGM_DIAG_RESULT_FAIL:
+		return "fail"
+	case C.DCGM_DIAG_RESULT_NOT_RUN:
+		return "notrun"
+	}
+	return ""
+}
+
+// swTestName returns the description of the software (level one) test stored
+// at index t, or UNDEFINED_SWTEST when the index is not a known test.
+func swTestName(t int) string {
+	switch t {
+	case C.DCGM_SWTEST_DENYLIST:
+		return "presence of drivers on the denylist (e.g. nouveau)"
+	case C.DCGM_SWTEST_NVML_LIBRARY:
+		return "presence (and version) of NVML lib"
+	case C.DCGM_SWTEST_CUDA_MAIN_LIBRARY:
+		return "presence (and version) of CUDA lib"
+	case C.DCGM_SWTEST_CUDA_RUNTIME_LIBRARY:
+		return "presence (and version) of CUDA RT lib"
+	case C.DCGM_SWTEST_PERMISSIONS:
+		return "character device permissions"
+	case C.DCGM_SWTEST_PERSISTENCE_MODE:
+		return "persistence mode enabled"
+	case C.DCGM_SWTEST_ENVIRONMENT:
+		return "CUDA environment vars that may slow tests"
+	case C.DCGM_SWTEST_PAGE_RETIREMENT:
+		return "pending frame buffer page retirement"
+	case C.DCGM_SWTEST_GRAPHICS_PROCESSES:
+		return "graphics processes running"
+	case C.DCGM_SWTEST_INFOROM:
+		return "inforom corruption"
+	}
+	return UNDEFINED_SWTEST
+}
+
+// gpuTestName returns the display name of the per-GPU test stored at index t,
+// or UNUSED_GPUTEST when the index is not a known test.
+func gpuTestName(t int) string {
+	switch t {
+	case C.DCGM_MEMORY_INDEX:
+		return "Memory"
+	case C.DCGM_DIAGNOSTIC_INDEX:
+		return "Diagnostic"
+	case C.DCGM_PCI_INDEX:
+		return "PCIe"
+	case C.DCGM_SM_STRESS_INDEX:
+		return "SM Stress"
+	case C.DCGM_TARGETED_STRESS_INDEX:
+		return "Targeted Stress"
+	case C.DCGM_TARGETED_POWER_INDEX:
+		return "Targeted Power"
+	case C.DCGM_MEMORY_BANDWIDTH_INDEX:
+		return "Memory Bandwidth"
+	case C.DCGM_MEMTEST_INDEX:
+		return "Memtest"
+	case C.DCGM_PULSE_TEST_INDEX:
+		return "Pulse Test"
+	case C.DCGM_EUD_TEST_INDEX:
+		return "EUD Test"
+	case C.DCGM_UNUSED3_TEST_INDEX:
+		// NOTE(review): an "unused" slot is labelled "CPU EUD Test" - confirm against the header.
+		return "CPU EUD Test"
+	case C.DCGM_SOFTWARE_INDEX:
+		return "Software"
+	case C.DCGM_CONTEXT_CREATE_INDEX:
+		return "Context Create"
+	}
+	return UNUSED_GPUTEST
+}
+
+// newDiagResult converts one C test result into a DiagResult labelled with
+// testName. Only the first entry of the result's error array is surfaced.
+func newDiagResult(testResult C.dcgmDiagTestResult_v3, testName string) DiagResult {
+	msg := C.GoString((*C.char)(unsafe.Pointer(&testResult.error[0].msg)))
+	info := C.GoString((*C.char)(unsafe.Pointer(&testResult.info)))
+
+	return DiagResult{
+		Status:       diagResultString(int(testResult.status)),
+		TestName:     testName,
+		TestOutput:   info,
+		ErrorCode:    uint(testResult.error[0].code),
+		ErrorMessage: msg,
+	}
+}
+
+// diagLevel maps a DiagType to the C diagnostic level. Values outside the
+// declared DiagType constants map to C.DCGM_DIAG_LVL_INVALID.
+func diagLevel(diagType DiagType) C.dcgmDiagnosticLevel_t {
+	switch diagType {
+	case DiagQuick:
+		return C.DCGM_DIAG_LVL_SHORT
+	case DiagMedium:
+		return C.DCGM_DIAG_LVL_MED
+	case DiagLong:
+		return C.DCGM_DIAG_LVL_LONG
+	case DiagExtended:
+		return C.DCGM_DIAG_LVL_XLONG
+	}
+	return C.DCGM_DIAG_LVL_INVALID
+}
+
+// RunDiagWithTimeout executes a diagnostic with a specified timeout.
+// If the diagnostic does not complete within the timeout duration, the running
+// diagnostic is asked to stop and an error is returned.
+func RunDiagWithTimeout(diagType DiagType, groupId GroupHandle, t time.Duration) (DiagResults, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), t)
+	defer cancel()
+
+	type outcome struct {
+		results DiagResults
+		err     error
+	}
+	// Buffered so the worker can always deliver its outcome and exit, even
+	// when nobody is listening any more because the timeout already fired.
+	done := make(chan outcome, 1)
+
+	// Run the diagnostic in a separate goroutine
+	go func() {
+		results, err := RunDiag(diagType, groupId)
+		done <- outcome{results: results, err: err}
+	}()
+
+	// Wait for the diagnostic to complete or the timeout to occur
+	select {
+	case <-ctx.Done():
+		// Without this the diagnostic would keep running after we report the
+		// timeout. Best effort: the return code is deliberately ignored
+		// because the timeout error below is what the caller needs to see.
+		C.dcgmStopDiagnostic(handle.handle)
+		return DiagResults{}, fmt.Errorf("Error: diagnostic execution timed out after %v", t)
+	case o := <-done:
+		if o.err != nil {
+			return DiagResults{}, o.err
+		}
+		return o.results, nil
+	}
+}
+
+// RunDiag runs a diagnostic of the given type on the GPUs in groupId and
+// blocks until it finishes. It returns the software checks and the per-GPU
+// test results, or a *DcgmError when the library call fails.
+func RunDiag(diagType DiagType, groupId GroupHandle) (DiagResults, error) {
+	var diagResults C.dcgmDiagResponse_v10
+	diagResults.version = makeVersion10(unsafe.Sizeof(diagResults))
+
+	result := C.dcgmRunDiagnostic(handle.handle, groupId.handle, diagLevel(diagType), &diagResults)
+	if err := errorString(result); err != nil {
+		return DiagResults{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
+	}
+	// NOTE(review): this is registered only after the run has already
+	// returned successfully; kept to preserve existing behaviour.
+	defer C.dcgmStopDiagnostic(handle.handle)
+
+	var diagRun DiagResults
+	diagRun.gpuCount = uint(diagResults.gpuCount)
+
+	// Never trust the reported counts beyond the fixed-size C arrays:
+	// indexing past them would panic.
+	swCount := int(diagResults.levelOneTestCount)
+	if swCount > len(diagResults.levelOneResults) {
+		swCount = len(diagResults.levelOneResults)
+	}
+	for i := 0; i < swCount; i++ {
+		testName := swTestName(i)
+		if testName == UNDEFINED_SWTEST {
+			continue
+		}
+		dr := newDiagResult(diagResults.levelOneResults[i], testName)
+		diagRun.Software = append(diagRun.Software, dr)
+	}
+
+	gpuCount := int(diagResults.gpuCount)
+	if gpuCount > len(diagResults.perGpuResponses) {
+		gpuCount = len(diagResults.perGpuResponses)
+	}
+	for i := 0; i < gpuCount; i++ {
+		// Take a pointer: the per-GPU response is a large struct and does
+		// not need to be copied just to be read.
+		r := &diagResults.perGpuResponses[i]
+		gr := GpuResult{GPU: uint(r.gpuId), RC: uint(r.hwDiagnosticReturn)}
+		for j := 0; j < DCGM_PER_GPU_TEST_COUNT_V8; j++ {
+			testName := gpuTestName(j)
+			if testName == UNUSED_GPUTEST {
+				continue
+			}
+			dr := newDiagResult(r.results[j], testName)
+			gr.DiagResults = append(gr.DiagResults, dr)
+		}
+		diagRun.PerGpu = append(diagRun.PerGpu, gr)
+	}
+
+	return diagRun, nil
+}
diff --git a/pkg/ixdcgm/fields.go b/pkg/ixdcgm/fields.go index e52404e..4773acb 100644 --- a/pkg/ixdcgm/fields.go +++ b/pkg/ixdcgm/fields.go @@ -32,10 +32,6 @@ const ( defaultUpdateFreq = 1000000 // usec defaultMaxKeepAge = 0 // sec defaultMaxKeepSamples = 1 // Keep one sample by default since we only ask for latest - - DCGM_INT32_BLANK = int32(2147483632) // 0x7ffffff0 - DCGM_INT64_BLANK = int64(9223372036854775792) // 0x7ffffffffffffff0 - DCGM_FP64_BLANK = float64(140737488355328.0) ) type FieldGrpHandle struct{ handle C.dcgmFieldGrp_t } @@ -104,21 +100,6 @@ func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error) return toFieldValue(values), nil } -func toFieldValue(values []C.dcgmFieldValue_v1) (fields []FieldValue_v1) { - fields = make([]FieldValue_v1, len(values)) - for i, v := range values { - fields[i] = FieldValue_v1{ - Version: uint(v.version), - FieldId: uint(v.fieldId), - FieldType: uint(v.fieldType), - Status: int(v.status), - Ts: int64(v.ts), - Value: v.value, - } - } - return -} - func GetFieldValueStr(fv FieldValue_v1, typ string) string { st := fv.Status if st != C.DCGM_ST_OK { @@ -128,14 +109,14 @@ func GetFieldValueStr(fv FieldValue_v1, typ string) string { switch typ { case "int64": value := *(*int64)(unsafe.Pointer(&fv.Value[0])) - if value >= DCGM_INT64_BLANK { + if value >= DCGM_FT_INT64_BLANK { return "N/A" // indicate the field is not supported }
return fmt.Sprintf("%d", value) case "float64": value := *(*float64)(unsafe.Pointer(&fv.Value[0])) - if value >= DCGM_FP64_BLANK { + if value >= DCGM_FT_FP64_BLANK { return "N/A" // indicate the field is not supported } // sync the precision with the display of ixdcgmi @@ -193,3 +174,35 @@ func (e Field_Entity_Group) String() string { } return "unknown" }
+
+// toFieldValue converts a slice of C field values into the equivalent Go
+// FieldValue_v1 slice, keeping the element order and length.
+func toFieldValue(cfields []C.dcgmFieldValue_v1) []FieldValue_v1 {
+	converted := make([]FieldValue_v1, len(cfields))
+	for idx := range cfields {
+		src := cfields[idx]
+		converted[idx] = FieldValue_v1{
+			Version:   uint(src.version),
+			FieldId:   uint(src.fieldId),
+			FieldType: uint(src.fieldType),
+			Status:    int(src.status),
+			Ts:        int64(src.ts),
+			Value:     src.value,
+		}
+	}
+
+	return converted
+}
+
+// Int64 reinterprets the leading bytes of Value as an int64.
+func (fv FieldValue_v1) Int64() int64 {
+	raw := unsafe.Pointer(&fv.Value[0])
+	return *(*int64)(raw)
+}
+
+// Float64 reinterprets the leading bytes of Value as a float64.
+func (fv FieldValue_v1) Float64() float64 {
+	raw := unsafe.Pointer(&fv.Value[0])
+	return *(*float64)(raw)
+}
+
+// String reads Value as a C string and returns it as a Go string.
+func (fv FieldValue_v1) String() string {
+	cstr := (*C.char)(unsafe.Pointer(&fv.Value[0]))
+	return C.GoString(cstr)
+}
+
+// Blob returns the raw 4096-byte value buffer.
+func (fv FieldValue_v1) Blob() [4096]byte {
+	return fv.Value
+}
diff --git a/pkg/ixdcgm/include/dcgm_agent.h b/pkg/ixdcgm/include/dcgm_agent.h index d126a78..24e3309 100644 --- a/pkg/ixdcgm/include/dcgm_agent.h +++ b/pkg/ixdcgm/include/dcgm_agent.h @@ -1603,7 +1603,7 @@ extern "C" * for details on creating the group. Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform * operation on all the GPUs. * @param validate IN: The validation to perform after the action. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_v10 for details.
* * @return * - \ref DCGM_ST_OK if the call was successful @@ -1618,7 +1618,7 @@ extern "C" dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmPolicyValidation_t validate, - dcgmDiagResponse_t* response); + dcgmDiagResponse_v10* response); /** * Inform the action manager to perform a manual validation of a group of GPUs on the system @@ -1627,7 +1627,7 @@ extern "C" * @param drd IN: Contains the group id, test names, test parameters, struct version, and the * validation that should be performed. Look at \ref dcgmGroupCreate for details on creating the group. * Alternatively, pass in the group id as \a DCGM_GROUP_ALL_GPUS to perform operation on all the GPUs. - * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_t for details. + * @param response OUT: Result of the validation process. Refer to \ref dcgmDiagResponse_v10 for details. * * @return * - \ref DCGM_ST_OK if the call was successful @@ -1638,9 +1638,9 @@ extern "C" * - \ref DCGM_ST_GROUP_INCOMPATIBLE if \a groupId refers to a group of non-homogeneous GPUs. This is * currently not allowed. */ - dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, - dcgmRunDiag_v7* drd, - dcgmDiagResponse_t* response); + dcgmReturn_t DCGM_PUBLIC_API dcgmActionValidate_v2(dcgmHandle_t pDcgmHandle, + dcgmRunDiag_v8* drd, + dcgmDiagResponse_v10* response); /** * Run a diagnostic on a group of GPUs @@ -1667,7 +1667,17 @@ extern "C" dcgmReturn_t DCGM_PUBLIC_API dcgmRunDiagnostic(dcgmHandle_t pDcgmHandle, dcgmGpuGrp_t groupId, dcgmDiagnosticLevel_t diagLevel, - dcgmDiagResponse_t* diagResponse); + dcgmDiagResponse_v10* diagResponse); + /** + * Stop a diagnostic if there is one currently running. 
+ * + * @param pDcgmHandle IN: DCGM Handle + * + * @return + * - \ref DCGM_ST_OK if the call was successful + * - \ref DCGM_ST_BADPARAM if a provided parameter is invalid or missing + */ + dcgmReturn_t DCGM_PUBLIC_API dcgmStopDiagnostic(dcgmHandle_t pDcgmHandle); /** @} */ // Closing for DCGMAPI_PO_MI @@ -2009,8 +2019,6 @@ extern "C" */ dcgmReturn_t DCGM_PUBLIC_API dcgmAddFakeInstances(dcgmHandle_t pDcgmHandle, dcgmMigHierarchy_v2* hierarchy); - dcgmReturn_t DCGM_PUBLIC_API dcgmDeviceOnSameBoard(dcgmHandle_t pixdcgmHandle, unsigned int gpuId1, unsigned int gpuId2, int* onSameBoard); - #ifdef __cplusplus } #endif diff --git a/pkg/ixdcgm/include/dcgm_structs.h b/pkg/ixdcgm/include/dcgm_structs.h index 9b6f169..572a1db 100644 --- a/pkg/ixdcgm/include/dcgm_structs.h +++ b/pkg/ixdcgm/include/dcgm_structs.h @@ -2485,6 +2485,19 @@ extern "C" dcgmDiagTestResult_v2 results[DCGM_PER_GPU_TEST_COUNT_V7]; //!< Array with a result for each per-gpu test } dcgmDiagResponsePerGpu_v3; +#define DCGM_DIAG_AUX_DATA_LEN 2048 + + /** + * Per test aux data structure v1 + * + * Since DCGM 3.3.7 + */ + typedef struct + { + unsigned int version; //!< version number (dcgmDiagTestAuxData_version1) + char data[DCGM_DIAG_AUX_DATA_LEN]; + } dcgmDiagTestAuxData_v1; + #define DCGM_SWTEST_COUNT 10 #define LEVEL_ONE_MAX_RESULTS 16 @@ -2505,6 +2518,27 @@ extern "C" #define DCGM_DEVICE_ID_LEN 5 #define DCGM_VERSION_LEN 12 + /** + * Global diagnostics result structure v10 + * + * Since DCGM 3.3.7 + */ + typedef struct + { + unsigned int version; //!< version number (dcgmDiagResponse_v10) + unsigned int gpuCount; //!< number of valid per GPU results + unsigned int levelOneTestCount; //!< number of valid levelOne results + + dcgmDiagTestResult_v3 levelOneResults[LEVEL_ONE_MAX_RESULTS]; //!< Basic, system-wide test results. 
+ dcgmDiagResponsePerGpu_v5 perGpuResponses[DCGM_MAX_NUM_DEVICES]; //!< per GPU test results + dcgmDiagErrorDetail_v2 systemError; //!< System-wide error reported from NVVS + char devIds[DCGM_MAX_NUM_DEVICES][DCGM_DEVICE_ID_LEN]; //!< The SKU device id for each GPU + char devSerials[DCGM_MAX_NUM_DEVICES][DCGM_MAX_STR_LENGTH]; //!< Serial for the device + char dcgmVersion[DCGM_VERSION_LEN]; //!< A string representing DCGM's version + char driverVersion[DCGM_MAX_STR_LENGTH]; //!< A string representing the driver version + dcgmDiagTestAuxData_v1 auxDataPerTest[DCGM_PER_GPU_TEST_COUNT_V8]; //!< Aux data that each test returned. + } dcgmDiagResponse_v10; + /** * Global diagnostics result structure v9 * @@ -2563,10 +2597,10 @@ extern "C" char _unused[1024]; //!< No longer used } dcgmDiagResponse_v7; - /** - * Typedef for \ref dcgmDiagResponse_v9 - */ - typedef dcgmDiagResponse_v9 dcgmDiagResponse_t; +/** + * Version 10 for \ref dcgmDiagResponse_v10 + */ +#define dcgmDiagResponse_version10 MAKE_DCGM_VERSION(dcgmDiagResponse_v10, 10) /** * Version 9 for \ref dcgmDiagResponse_v9 @@ -2769,7 +2803,9 @@ extern "C" #define DCGM_MAX_TEST_NAMES_LEN 50 #define DCGM_MAX_TEST_PARMS 100 #define DCGM_MAX_TEST_PARMS_LEN 100 +#define DCGM_MAX_TEST_PARMS_LEN_V2 1050 #define DCGM_GPU_LIST_LEN 50 +#define DCGM_EXPECTED_ENTITIES_LEN 50 #define DCGM_FILE_LEN 30 #define DCGM_PATH_LEN 128 #define DCGM_THROTTLE_MASK_LEN 50 @@ -2807,6 +2843,48 @@ extern "C" */ #define DCGM_RUN_FLAGS_FAIL_EARLY 0x0010 + /** + * @} + */ + + /* + * Run diagnostic structure v8 + */ + typedef struct + { + unsigned int version; //!< version of this message + unsigned int flags; //!< flags specifying binary options for running it. See DCGM_RUN_FLAGS_* + unsigned int debugLevel; //!< 0-5 for the debug level the GPU diagnostic will use for logging. + dcgmGpuGrp_t groupId; //!< group of GPUs to verify. Cannot be specified together with gpuList. + dcgmPolicyValidation_t validate; //!< 0-3 for which tests to run. Optional. 
+ char testNames[DCGM_MAX_TEST_NAMES][DCGM_MAX_TEST_NAMES_LEN]; //!< Specified list of test names. Optional. + char testParms[DCGM_MAX_TEST_PARMS] + [DCGM_MAX_TEST_PARMS_LEN_V2]; //!< Parameters to set for specified tests + //!< in the format: + //!< testName.parameterName=parameterValue. Optional. + char fakeGpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. + char gpuList[DCGM_GPU_LIST_LEN]; //!< Comma-separated list of GPUs. Cannot be specified with the groupId. + char debugLogFile[DCGM_PATH_LEN]; //!< Alternate name for the debug log file that should be used + char statsPath[DCGM_PATH_LEN]; //!< Path that the plugin's statistics files should be written to + char configFileContents[DCGM_MAX_CONFIG_FILE_LEN]; //!< Contents of nvvs config file (likely yaml) + char throttleMask[DCGM_THROTTLE_MASK_LEN]; //!< Throttle reasons to ignore as either integer mask or csv list + //!< of reasons + char pluginPath[DCGM_PATH_LEN]; //!< Custom path to the diagnostic plugins - No longer supported as of 2.2.9 + + unsigned int currentIteration; //!< The current iteration that will be executed + unsigned int totalIterations; //!< The total iterations that will be executed + unsigned int timeoutSeconds; //!< The timeout for the diagnostic in seconds + char _unusedBuf[DCGM_PATH_LEN]; //!< No longer used + unsigned int failCheckInterval; //!< How often the fail early checks should occur when enabled. + char + expectedNumEntities[DCGM_EXPECTED_ENTITIES_LEN]; //!< The expected number of entities the diag will run on. 
+ } dcgmRunDiag_v8; + +/** + * Version 8 for \ref dcgmRunDiag_t + */ +#define dcgmRunDiag_version8 MAKE_DCGM_VERSION(dcgmRunDiag_v8, 8) + /** * @} */ diff --git a/pkg/ixdcgm/include/ixdcgmApiExport.h b/pkg/ixdcgm/include/ixdcgmApiExport.h index 56053b3..fb4d059 100644 --- a/pkg/ixdcgm/include/ixdcgmApiExport.h +++ b/pkg/ixdcgm/include/ixdcgmApiExport.h @@ -10,9 +10,6 @@ #define __IXDCGM_API_EXPORT_H__ #include "ixdcgmStructs.h" - -#include "ixdcgmFields.h" - #include #ifdef __cplusplus @@ -27,20 +24,6 @@ extern "C" #endif #define IXDCGM_PRIVATE_API __attribute((visibility("hidden"))) - /** - * Get the IxLink link status for every IxLink in this system. - * - * @param pixdcgmHandle IN: IxDCGM Handle - * @param linkStatus OUT: Structure in which to store IxLink link statuses. - * - * @return - * - \ref IXDCGM_RET_OK if the call was successful. - * - \ref IXDCGM_RET_NOT_SUPPORTED if the given entityGroup does not support enumeration. - * - \ref IXDCGM_RET_BADPARAM if any parameter is invalid - * - \ref IXDCGM_RET_VER_MISMATCH if the version of linkStatus is not ixdcgmLinkStatus_v3 - */ - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLinkStatus(ixdcgmHandle_t pixdcgmHandle, ixdcgmLinkStatus_v3 *linkStatus); - /** * Gets the 2 GPUs are on the same board or not. * @param pixdcgmHandle IN: IxDCGM Handle diff --git a/pkg/ixdcgm/include/ixdcgmFields.h b/pkg/ixdcgm/include/ixdcgmFields.h deleted file mode 100644 index 64a5b84..0000000 --- a/pkg/ixdcgm/include/ixdcgmFields.h +++ /dev/null @@ -1,502 +0,0 @@ -/* Copyright © 2024 Iluvatar CoreX. All rights reserved. */ -/* Copyright Declaration: This software, including all of its code and documentation, except for the third-party */ -/* software it contains, is a copyrighted work of Shanghai Iluvatar CoreX Semiconductor Co., Ltd. and its */ -/* affiliates (“Iluvatar CoreX”) in accordance with the PRC Copyright Law and relevant international treaties, */ -/* and all rights contained therein are enjoyed by Iluvatar CoreX. 
No user of this software shall have any right, */ -/* ownership or interest in this software and any use of this software shall be in compliance with the terms */ -/* and conditions of the End User License Agreement. */ -#ifndef __IXDCGMFIELDS_H__ -#define __IXDCGMFIELDS_H__ - - -#include - -#ifdef __cplusplus -extern "C" -{ -#endif - -#define IXDCGM_FT_BINARY 'b' -#define IXDCGM_FT_DOUBLE 'd' -#define IXDCGM_FT_INT64 'i' -#define IXDCGM_FT_STRING 's' -#define IXDCGM_FT_TIMESTAMP 't' -#define IXDCGM_FS_GLOBAL 0 -#define IXDCGM_FS_ENTITY 1 -#define IXDCGM_FS_DEVICE IXDCGM_FS_ENTITY -#define IXDCGM_CUDA_COMPUTE_CAPABILITY_MAJOR(x) ((uint64_t)(x) & 0xFFFF0000) -#define IXDCGM_CUDA_COMPUTE_CAPABILITY_MINOR(x) ((uint64_t)(x) & 0x0000FFFF) -#define IXDCGM_CLOCKS_THROTTLE_REASON_GPU_IDLE 0x0000000000000001LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_CLOCKS_SETTING 0x0000000000000002LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_SW_POWER_CAP 0x0000000000000004LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_HW_SLOWDOWN 0x0000000000000008LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_SYNC_BOOST 0x0000000000000010LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_SW_THERMAL 0x0000000000000020LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_HW_THERMAL 0x0000000000000040LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_HW_POWER_BRAKE 0x0000000000000080LL -#define IXDCGM_CLOCKS_THROTTLE_REASON_DISPLAY_CLOCKS 0x0000000000000100LL -#define IXDCGM_FI_UNKNOWN 0 -#define IXDCGM_FI_DRIVER_VERSION 1 -#define IXDCGM_FI_NVML_VERSION 2 -#define IXDCGM_FI_PROCESS_NAME 3 -#define IXDCGM_FI_DEV_COUNT 4 -#define IXDCGM_FI_CUDA_DRIVER_VERSION 5 -#define IXDCGM_FI_DEV_NAME 50 -#define IXDCGM_FI_DEV_BRAND 51 -#define IXDCGM_FI_DEV_NVML_INDEX 52 -#define IXDCGM_FI_DEV_SERIAL 53 -#define IXDCGM_FI_DEV_UUID 54 -#define IXDCGM_FI_DEV_MINOR_NUMBER 55 -#define IXDCGM_FI_DEV_OEM_INFOROM_VER 56 -#define IXDCGM_FI_DEV_PCI_BUSID 57 -#define IXDCGM_FI_DEV_PCI_COMBINED_ID 58 -#define IXDCGM_FI_DEV_PCI_SUBSYS_ID 59 -#define IXDCGM_FI_GPU_TOPOLOGY_PCI 
60 -#define IXDCGM_FI_GPU_TOPOLOGY_NVLINK 61 -#define IXDCGM_FI_GPU_TOPOLOGY_AFFINITY 62 -#define IXDCGM_FI_DEV_CUDA_COMPUTE_CAPABILITY 63 -#define IXDCGM_FI_DEV_COMPUTE_MODE 65 -#define IXDCGM_FI_DEV_PERSISTENCE_MODE 66 -#define IXDCGM_FI_DEV_MIG_MODE 67 -#define IXDCGM_FI_DEV_CUDA_VISIBLE_DEVICES_STR 68 -#define IXDCGM_FI_DEV_MIG_MAX_SLICES 69 -#define IXDCGM_FI_DEV_CPU_AFFINITY_0 70 -#define IXDCGM_FI_DEV_CPU_AFFINITY_1 71 -#define IXDCGM_FI_DEV_CPU_AFFINITY_2 72 -#define IXDCGM_FI_DEV_CPU_AFFINITY_3 73 -#define IXDCGM_FI_DEV_CC_MODE 74 -#define IXDCGM_FI_DEV_MIG_ATTRIBUTES 75 -#define IXDCGM_FI_DEV_MIG_GI_INFO 76 -#define IXDCGM_FI_DEV_MIG_CI_INFO 77 -#define IXDCGM_FI_DEV_ECC_INFOROM_VER 80 -#define IXDCGM_FI_DEV_POWER_INFOROM_VER 81 -#define IXDCGM_FI_DEV_INFOROM_IMAGE_VER 82 -#define IXDCGM_FI_DEV_INFOROM_CONFIG_CHECK 83 -#define IXDCGM_FI_DEV_INFOROM_CONFIG_VALID 84 -#define IXDCGM_FI_DEV_VBIOS_VERSION 85 -#define IXDCGM_FI_DEV_MEM_AFFINITY_0 86 -#define IXDCGM_FI_DEV_MEM_AFFINITY_1 87 -#define IXDCGM_FI_DEV_MEM_AFFINITY_2 88 -#define IXDCGM_FI_DEV_MEM_AFFINITY_3 89 -#define IXDCGM_FI_DEV_BAR1_TOTAL 90 -#define IXDCGM_FI_SYNC_BOOST 91 -#define IXDCGM_FI_DEV_BAR1_USED 92 -#define IXDCGM_FI_DEV_BAR1_FREE 93 -#define IXDCGM_FI_DEV_SM_CLOCK 100 -#define IXDCGM_FI_DEV_MEM_CLOCK 101 -#define IXDCGM_FI_DEV_VIDEO_CLOCK 102 -#define IXDCGM_FI_DEV_APP_SM_CLOCK 110 -#define IXDCGM_FI_DEV_APP_MEM_CLOCK 111 -#define IXDCGM_FI_DEV_CLOCK_THROTTLE_REASONS 112 -#define IXDCGM_FI_DEV_MAX_SM_CLOCK 113 -#define IXDCGM_FI_DEV_MAX_MEM_CLOCK 114 -#define IXDCGM_FI_DEV_MAX_VIDEO_CLOCK 115 -#define IXDCGM_FI_DEV_AUTOBOOST 120 -#define IXDCGM_FI_DEV_SUPPORTED_CLOCKS 130 -#define IXDCGM_FI_DEV_MEMORY_TEMP 140 -#define IXDCGM_FI_DEV_GPU_TEMP 150 -#define IXDCGM_FI_DEV_MEM_MAX_OP_TEMP 151 -#define IXDCGM_FI_DEV_GPU_MAX_OP_TEMP 152 -#define IXDCGM_FI_DEV_POWER_USAGE 155 -#define IXDCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION 156 -#define IXDCGM_FI_DEV_POWER_USAGE_INSTANT 157 -#define 
IXDCGM_FI_DEV_SLOWDOWN_TEMP 158 -#define IXDCGM_FI_DEV_SHUTDOWN_TEMP 159 -#define IXDCGM_FI_DEV_POWER_MGMT_LIMIT 160 -#define IXDCGM_FI_DEV_POWER_MGMT_LIMIT_MIN 161 -#define IXDCGM_FI_DEV_POWER_MGMT_LIMIT_MAX 162 -#define IXDCGM_FI_DEV_POWER_MGMT_LIMIT_DEF 163 -#define IXDCGM_FI_DEV_ENFORCED_POWER_LIMIT 164 -#define IXDCGM_FI_DEV_PSTATE 190 -#define IXDCGM_FI_DEV_FAN_SPEED 191 -#define IXDCGM_FI_DEV_PCIE_TX_THROUGHPUT 200 -#define IXDCGM_FI_DEV_PCIE_RX_THROUGHPUT 201 -#define IXDCGM_FI_DEV_PCIE_REPLAY_COUNTER 202 -#define IXDCGM_FI_DEV_GPU_UTIL 203 -#define IXDCGM_FI_DEV_MEM_COPY_UTIL 204 -#define IXDCGM_FI_DEV_ACCOUNTING_DATA 205 -#define IXDCGM_FI_DEV_ENC_UTIL 206 -#define IXDCGM_FI_DEV_DEC_UTIL 207 -#define IXDCGM_FI_DEV_XID_ERRORS 230 -#define IXDCGM_FI_DEV_PCIE_MAX_LINK_GEN 235 -#define IXDCGM_FI_DEV_PCIE_MAX_LINK_WIDTH 236 -#define IXDCGM_FI_DEV_PCIE_LINK_GEN 237 -#define IXDCGM_FI_DEV_PCIE_LINK_WIDTH 238 -#define IXDCGM_FI_DEV_POWER_VIOLATION 240 -#define IXDCGM_FI_DEV_THERMAL_VIOLATION 241 -#define IXDCGM_FI_DEV_SYNC_BOOST_VIOLATION 242 -#define IXDCGM_FI_DEV_BOARD_LIMIT_VIOLATION 243 -#define IXDCGM_FI_DEV_LOW_UTIL_VIOLATION 244 -#define IXDCGM_FI_DEV_RELIABILITY_VIOLATION 245 -#define IXDCGM_FI_DEV_TOTAL_APP_CLOCKS_VIOLATION 246 -#define IXDCGM_FI_DEV_TOTAL_BASE_CLOCKS_VIOLATION 247 -#define IXDCGM_FI_DEV_FB_TOTAL 250 -#define IXDCGM_FI_DEV_FB_FREE 251 -#define IXDCGM_FI_DEV_FB_USED 252 -#define IXDCGM_FI_DEV_FB_RESERVED 253 -#define IXDCGM_FI_DEV_FB_USED_PERCENT 254 -#define IXDCGM_FI_DEV_ECC_CURRENT 300 -#define IXDCGM_FI_DEV_ECC_PENDING 301 -#define IXDCGM_FI_DEV_ECC_SBE_VOL_TOTAL 310 -#define IXDCGM_FI_DEV_ECC_DBE_VOL_TOTAL 311 -#define IXDCGM_FI_DEV_ECC_SBE_AGG_TOTAL 312 -#define IXDCGM_FI_DEV_ECC_DBE_AGG_TOTAL 313 -#define IXDCGM_FI_DEV_ECC_SBE_VOL_L1 314 -#define IXDCGM_FI_DEV_ECC_DBE_VOL_L1 315 -#define IXDCGM_FI_DEV_ECC_SBE_VOL_L2 316 -#define IXDCGM_FI_DEV_ECC_DBE_VOL_L2 317 -#define IXDCGM_FI_DEV_ECC_SBE_VOL_DEV 318 -#define 
IXDCGM_FI_DEV_ECC_DBE_VOL_DEV 319 -#define IXDCGM_FI_DEV_ECC_SBE_VOL_REG 320 -#define IXDCGM_FI_DEV_ECC_DBE_VOL_REG 321 -#define IXDCGM_FI_DEV_ECC_SBE_VOL_TEX 322 -#define IXDCGM_FI_DEV_ECC_DBE_VOL_TEX 323 -#define IXDCGM_FI_DEV_ECC_SBE_AGG_L1 324 -#define IXDCGM_FI_DEV_ECC_DBE_AGG_L1 325 -#define IXDCGM_FI_DEV_ECC_SBE_AGG_L2 326 -#define IXDCGM_FI_DEV_ECC_DBE_AGG_L2 327 -#define IXDCGM_FI_DEV_ECC_SBE_AGG_DEV 328 -#define IXDCGM_FI_DEV_ECC_DBE_AGG_DEV 329 -#define IXDCGM_FI_DEV_ECC_SBE_AGG_REG 330 -#define IXDCGM_FI_DEV_ECC_DBE_AGG_REG 331 -#define IXDCGM_FI_DEV_ECC_SBE_AGG_TEX 332 -#define IXDCGM_FI_DEV_ECC_DBE_AGG_TEX 333 -#define IXDCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_MAX 385 -#define IXDCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_HIGH 386 -#define IXDCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_PARTIAL 387 -#define IXDCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_LOW 388 -#define IXDCGM_FI_DEV_BANKS_REMAP_ROWS_AVAIL_NONE 389 -#define IXDCGM_FI_DEV_RETIRED_SBE 390 -#define IXDCGM_FI_DEV_RETIRED_DBE 391 -#define IXDCGM_FI_DEV_RETIRED_PENDING 392 -#define IXDCGM_FI_DEV_UNCORRECTABLE_REMAPPED_ROWS 393 -#define IXDCGM_FI_DEV_CORRECTABLE_REMAPPED_ROWS 394 -#define IXDCGM_FI_DEV_ROW_REMAP_FAILURE 395 -#define IXDCGM_FI_DEV_ROW_REMAP_PENDING 396 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L0 400 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L1 401 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L2 402 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L3 403 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L4 404 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L5 405 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_TOTAL 409 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L0 410 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L1 411 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L2 412 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L3 413 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L4 414 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L5 415 -#define 
IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_TOTAL 419 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L0 420 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L1 421 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L2 422 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L3 423 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L4 424 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L5 425 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_TOTAL 429 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L0 430 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L1 431 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L2 432 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L3 433 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L4 434 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L5 435 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_TOTAL 439 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L0 440 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L1 441 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L2 442 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L3 443 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L4 444 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L5 445 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL 449 -#define IXDCGM_FI_DEV_GPU_NVLINK_ERRORS 450 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L6 451 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L7 452 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L8 453 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L9 454 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L10 455 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L11 456 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L6 457 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L7 458 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L8 459 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L9 460 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L10 461 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L11 462 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L6 463 -#define 
IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L7 464 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L8 465 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L9 466 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L10 467 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L11 468 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L6 469 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L7 470 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L8 471 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L9 472 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L10 473 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L11 474 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L6 475 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L7 476 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L8 477 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L9 478 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L10 479 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L11 480 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L12 406 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L13 407 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L14 408 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L15 481 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L16 482 -#define IXDCGM_FI_DEV_NVLINK_CRC_FLIT_ERROR_COUNT_L17 483 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L12 416 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L13 417 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L14 418 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L15 484 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L16 485 -#define IXDCGM_FI_DEV_NVLINK_CRC_DATA_ERROR_COUNT_L17 486 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L12 426 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L13 427 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L14 428 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L15 487 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L16 488 -#define IXDCGM_FI_DEV_NVLINK_REPLAY_ERROR_COUNT_L17 489 -#define 
IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L12 436 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L13 437 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L14 438 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L15 491 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L16 492 -#define IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT_L17 493 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L12 446 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L13 447 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L14 448 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L15 494 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L16 495 -#define IXDCGM_FI_DEV_NVLINK_BANDWIDTH_L17 496 -#define IXDCGM_FI_DEV_VIRTUAL_MODE 500 -#define IXDCGM_FI_DEV_SUPPORTED_TYPE_INFO 501 -#define IXDCGM_FI_DEV_CREATABLE_VGPU_TYPE_IDS 502 -#define IXDCGM_FI_DEV_VGPU_INSTANCE_IDS 503 -#define IXDCGM_FI_DEV_VGPU_UTILIZATIONS 504 -#define IXDCGM_FI_DEV_VGPU_PER_PROCESS_UTILIZATION 505 -#define IXDCGM_FI_DEV_ENC_STATS 506 -#define IXDCGM_FI_DEV_FBC_STATS 507 -#define IXDCGM_FI_DEV_FBC_SESSIONS_INFO 508 -#define IXDCGM_FI_DEV_SUPPORTED_VGPU_TYPE_IDS 509 -#define IXDCGM_FI_DEV_VGPU_TYPE_INFO 510 -#define IXDCGM_FI_DEV_VGPU_TYPE_NAME 511 -#define IXDCGM_FI_DEV_VGPU_TYPE_CLASS 512 -#define IXDCGM_FI_DEV_VGPU_TYPE_LICENSE 513 -#define IXDCGM_FI_DEV_VGPU_VM_ID 520 -#define IXDCGM_FI_DEV_VGPU_VM_NAME 521 -#define IXDCGM_FI_DEV_VGPU_TYPE 522 -#define IXDCGM_FI_DEV_VGPU_UUID 523 -#define IXDCGM_FI_DEV_VGPU_DRIVER_VERSION 524 -#define IXDCGM_FI_DEV_VGPU_MEMORY_USAGE 525 -#define IXDCGM_FI_DEV_VGPU_LICENSE_STATUS 526 -#define IXDCGM_FI_DEV_VGPU_FRAME_RATE_LIMIT 527 -#define IXDCGM_FI_DEV_VGPU_ENC_STATS 528 -#define IXDCGM_FI_DEV_VGPU_ENC_SESSIONS_INFO 529 -#define IXDCGM_FI_DEV_VGPU_FBC_STATS 530 -#define IXDCGM_FI_DEV_VGPU_FBC_SESSIONS_INFO 531 -#define IXDCGM_FI_DEV_VGPU_INSTANCE_LICENSE_STATE 532 -#define IXDCGM_FI_DEV_VGPU_PCI_ID 533 -#define IXDCGM_FI_DEV_VGPU_VM_GPU_INSTANCE_ID 534 -#define IXDCGM_FI_FIRST_VGPU_FIELD_ID 520 -#define 
IXDCGM_FI_LAST_VGPU_FIELD_ID 570 -#define IXDCGM_FI_MAX_VGPU_FIELDS IXDCGM_FI_LAST_VGPU_FIELD_ID - IXDCGM_FI_FIRST_VGPU_FIELD_ID -#define IXDCGM_FI_INTERNAL_FIELDS_0_START 600 -#define IXDCGM_FI_INTERNAL_FIELDS_0_END 699 -#define IXDCGM_FI_FIRST_NVSWITCH_FIELD_ID 700 -#define IXDCGM_FI_DEV_NVSWITCH_VOLTAGE_MVOLT 701 -#define IXDCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ 702 -#define IXDCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_REV 703 -#define IXDCGM_FI_DEV_NVSWITCH_CURRENT_IDDQ_DVDD 704 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_TX 780 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_THROUGHPUT_RX 781 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_FATAL_ERRORS 782 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_NON_FATAL_ERRORS 783 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_REPLAY_ERRORS 784 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_RECOVERY_ERRORS 785 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_FLIT_ERRORS 786 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS 787 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS 788 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC0 789 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC1 790 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC2 791 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_LOW_VC3 792 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC0 793 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC1 794 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC2 795 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_MEDIUM_VC3 796 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC0 797 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC1 798 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC2 799 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_HIGH_VC3 800 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC0 801 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC1 802 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC2 803 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_PANIC_VC3 804 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC0 805 -#define 
IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC1 806 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC2 807 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_LATENCY_COUNT_VC3 808 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE0 809 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE1 810 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE2 811 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_CRC_ERRORS_LANE3 812 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE0 813 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE1 814 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE2 815 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_ECC_ERRORS_LANE3 816 -#define IXDCGM_FI_DEV_NVSWITCH_FATAL_ERRORS 856 -#define IXDCGM_FI_DEV_NVSWITCH_NON_FATAL_ERRORS 857 -#define IXDCGM_FI_DEV_NVSWITCH_TEMPERATURE_CURRENT 858 -#define IXDCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SLOWDOWN 859 -#define IXDCGM_FI_DEV_NVSWITCH_TEMPERATURE_LIMIT_SHUTDOWN 860 -#define IXDCGM_FI_DEV_NVSWITCH_THROUGHPUT_TX 861 -#define IXDCGM_FI_DEV_NVSWITCH_THROUGHPUT_RX 862 -#define IXDCGM_FI_DEV_NVSWITCH_PHYS_ID 863 -#define IXDCGM_FI_DEV_NVSWITCH_RESET_REQUIRED 864 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_ID 865 -#define IXDCGM_FI_DEV_NVSWITCH_PCIE_DOMAIN 866 -#define IXDCGM_FI_DEV_NVSWITCH_PCIE_BUS 867 -#define IXDCGM_FI_DEV_NVSWITCH_PCIE_DEVICE 868 -#define IXDCGM_FI_DEV_NVSWITCH_PCIE_FUNCTION 869 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_STATUS 870 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_TYPE 871 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DOMAIN 872 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_BUS 873 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_DEVICE 874 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_REMOTE_PCIE_FUNCTION 875 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_ID 876 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_DEVICE_LINK_SID 877 -#define IXDCGM_FI_DEV_NVSWITCH_LINK_DEVICE_UUID 878 -#define IXDCGM_FI_LAST_NVSWITCH_FIELD_ID 899 -#define IXDCGM_FI_MAX_NVSWITCH_FIELDS IXDCGM_FI_LAST_NVSWITCH_FIELD_ID - IXDCGM_FI_FIRST_NVSWITCH_FIELD_ID 
+ 1 -#define IXDCGM_FI_PROF_GR_ENGINE_ACTIVE 1001 -#define IXDCGM_FI_PROF_SM_ACTIVE 1002 -#define IXDCGM_FI_PROF_SM_OCCUPANCY 1003 -#define IXDCGM_FI_PROF_PIPE_TENSOR_ACTIVE 1004 -#define IXDCGM_FI_PROF_DRAM_ACTIVE 1005 -#define IXDCGM_FI_PROF_PIPE_FP64_ACTIVE 1006 -#define IXDCGM_FI_PROF_PIPE_FP32_ACTIVE 1007 -#define IXDCGM_FI_PROF_PIPE_FP16_ACTIVE 1008 -#define IXDCGM_FI_PROF_PCIE_TX_BYTES 1009 -#define IXDCGM_FI_PROF_PCIE_RX_BYTES 1010 -#define IXDCGM_FI_PROF_NVLINK_TX_BYTES 1011 -#define IXDCGM_FI_PROF_NVLINK_RX_BYTES 1012 -#define IXDCGM_FI_PROF_PIPE_TENSOR_IMMA_ACTIVE 1013 -#define IXDCGM_FI_PROF_PIPE_TENSOR_HMMA_ACTIVE 1014 -#define IXDCGM_FI_PROF_PIPE_TENSOR_DFMA_ACTIVE 1015 -#define IXDCGM_FI_PROF_PIPE_INT_ACTIVE 1016 -#define IXDCGM_FI_PROF_NVDEC0_ACTIVE 1017 -#define IXDCGM_FI_PROF_NVDEC1_ACTIVE 1018 -#define IXDCGM_FI_PROF_NVDEC2_ACTIVE 1019 -#define IXDCGM_FI_PROF_NVDEC3_ACTIVE 1020 -#define IXDCGM_FI_PROF_NVDEC4_ACTIVE 1021 -#define IXDCGM_FI_PROF_NVDEC5_ACTIVE 1022 -#define IXDCGM_FI_PROF_NVDEC6_ACTIVE 1023 -#define IXDCGM_FI_PROF_NVDEC7_ACTIVE 1024 -#define IXDCGM_FI_PROF_NVJPG0_ACTIVE 1025 -#define IXDCGM_FI_PROF_NVJPG1_ACTIVE 1026 -#define IXDCGM_FI_PROF_NVJPG2_ACTIVE 1027 -#define IXDCGM_FI_PROF_NVJPG3_ACTIVE 1028 -#define IXDCGM_FI_PROF_NVJPG4_ACTIVE 1029 -#define IXDCGM_FI_PROF_NVJPG5_ACTIVE 1030 -#define IXDCGM_FI_PROF_NVJPG6_ACTIVE 1031 -#define IXDCGM_FI_PROF_NVJPG7_ACTIVE 1032 -#define IXDCGM_FI_PROF_NVOFA0_ACTIVE 1033 -#define IXDCGM_FI_PROF_NVLINK_L0_TX_BYTES 1040 -#define IXDCGM_FI_PROF_NVLINK_L0_RX_BYTES 1041 -#define IXDCGM_FI_PROF_NVLINK_L1_TX_BYTES 1042 -#define IXDCGM_FI_PROF_NVLINK_L1_RX_BYTES 1043 -#define IXDCGM_FI_PROF_NVLINK_L2_TX_BYTES 1044 -#define IXDCGM_FI_PROF_NVLINK_L2_RX_BYTES 1045 -#define IXDCGM_FI_PROF_NVLINK_L3_TX_BYTES 1046 -#define IXDCGM_FI_PROF_NVLINK_L3_RX_BYTES 1047 -#define IXDCGM_FI_PROF_NVLINK_L4_TX_BYTES 1048 -#define IXDCGM_FI_PROF_NVLINK_L4_RX_BYTES 1049 -#define IXDCGM_FI_PROF_NVLINK_L5_TX_BYTES 1050 
-#define IXDCGM_FI_PROF_NVLINK_L5_RX_BYTES 1051 -#define IXDCGM_FI_PROF_NVLINK_L6_TX_BYTES 1052 -#define IXDCGM_FI_PROF_NVLINK_L6_RX_BYTES 1053 -#define IXDCGM_FI_PROF_NVLINK_L7_TX_BYTES 1054 -#define IXDCGM_FI_PROF_NVLINK_L7_RX_BYTES 1055 -#define IXDCGM_FI_PROF_NVLINK_L8_TX_BYTES 1056 -#define IXDCGM_FI_PROF_NVLINK_L8_RX_BYTES 1057 -#define IXDCGM_FI_PROF_NVLINK_L9_TX_BYTES 1058 -#define IXDCGM_FI_PROF_NVLINK_L9_RX_BYTES 1059 -#define IXDCGM_FI_PROF_NVLINK_L10_TX_BYTES 1060 -#define IXDCGM_FI_PROF_NVLINK_L10_RX_BYTES 1061 -#define IXDCGM_FI_PROF_NVLINK_L11_TX_BYTES 1062 -#define IXDCGM_FI_PROF_NVLINK_L11_RX_BYTES 1063 -#define IXDCGM_FI_PROF_NVLINK_L12_TX_BYTES 1064 -#define IXDCGM_FI_PROF_NVLINK_L12_RX_BYTES 1065 -#define IXDCGM_FI_PROF_NVLINK_L13_TX_BYTES 1066 -#define IXDCGM_FI_PROF_NVLINK_L13_RX_BYTES 1067 -#define IXDCGM_FI_PROF_NVLINK_L14_TX_BYTES 1068 -#define IXDCGM_FI_PROF_NVLINK_L14_RX_BYTES 1069 -#define IXDCGM_FI_PROF_NVLINK_L15_TX_BYTES 1070 -#define IXDCGM_FI_PROF_NVLINK_L15_RX_BYTES 1071 -#define IXDCGM_FI_PROF_NVLINK_L16_TX_BYTES 1072 -#define IXDCGM_FI_PROF_NVLINK_L16_RX_BYTES 1073 -#define IXDCGM_FI_PROF_NVLINK_L17_TX_BYTES 1074 -#define IXDCGM_FI_PROF_NVLINK_L17_RX_BYTES 1075 -#define IXDCGM_FI_PROF_NVLINK_THROUGHPUT_FIRST IXDCGM_FI_PROF_NVLINK_L0_TX_BYTES -#define IXDCGM_FI_PROF_NVLINK_THROUGHPUT_LAST IXDCGM_FI_PROF_NVLINK_L17_RX_BYTES -#define IXDCGM_FI_MAX_FIELDS 1076 - - typedef enum ixdcgm_field_entity_group_t - { - IXDCGM_FE_NONE = 0, /*!< Field is not associated with an entity. 
Field scope should be IXDCGM_FS_GLOBAL */ - IXDCGM_FE_GPU, /*!< Field is associated with a GPU entity */ - IXDCGM_FE_VGPU, /*!< Field is associated with a VGPU entity */ - IXDCGM_FE_SWITCH, /*!< Field is associated with a Switch entity */ - IXDCGM_FE_GPU_I, /*!< Field is associated with a GPU Instance entity */ - IXDCGM_FE_GPU_CI, /*!< Field is associated with a GPU Compute Instance entity */ - IXDCGM_FE_LINK, /*!< Field is associated with an NVLink */ - IXDCGM_FE_CPU, /*!< Field is associated with a CPU node */ - IXDCGM_FE_CPU_CORE, /*!< Field is associated with a CPU */ - - IXDCGM_FE_COUNT /*!< Number of elements in this enumeration. Keep this entry last */ - } ixdcgm_field_entity_group_t; - typedef unsigned int ixdcgm_field_eid_t; - - typedef struct - { - char shortName[10]; /*!< Short name corresponding to field. This short name is used to identify columns in dmon - output.*/ - char unit[4]; /*!< The unit of value. Eg: C(elsius), W(att), MB/s*/ - short width; /*!< Maximum width/number of digits that a value for field can have.*/ - } ixdcgm_field_output_format_t, *ixdcgm_field_output_format_p; - - typedef struct - { - unsigned short fieldId; /*!< Field identifier. IXDCGM_FI_? #define */ - char fieldType; /*!< Field type. IXDCGM_FT_? #define */ - unsigned char size; /*!< field size in bytes (raw value size). 0=variable (like IXDCGM_FT_STRING) */ - char tag[48]; /*!< Tag for this field for serialization like 'device_temperature' */ - int scope; /*!< Field scope. IXDCGM_FS_? #define of this field's association */ - int nvmlFieldId; /*!< Optional NVML field this DCGM field maps to. 0 = no mapping. - Otherwise, this should be a NVML_FI_? #define from nvml.h */ - ixdcgm_field_entity_group_t - entityLevel; /*!< Field entity level. IXDCGM_FE_? 
specifying at what level the field is queryable */ - - ixdcgm_field_output_format_p valueFormat; /*!< pointer to the structure that holds the formatting the - values for fields */ - } ixdcgm_field_meta_t; - - typedef const ixdcgm_field_meta_t* ixdcgm_field_meta_p; - - int __attribute((visibility("default"))) ixdcgmFieldsInit(void); - int __attribute((visibility("default"))) ixdcgmFieldsTerm(void); - - ixdcgm_field_meta_p __attribute((visibility("default"))) ixdcgmFieldGetById(unsigned short fieldId); - - bool ixdcgmIsFieldSupported(unsigned short fieldId); - bool ixdcgmIsNvlinkField(unsigned short fieldId); - const char __attribute((visibility("default"))) * - ixdcgmFieldsGetEntityGroupString(ixdcgm_field_entity_group_t entityGroupId); - -#ifdef __cplusplus -} -#endif - -#endif // end of __IXDCGMFIELDS_H__ \ No newline at end of file diff --git a/pkg/ixdcgm/include/ixdcgmStructs.h b/pkg/ixdcgm/include/ixdcgmStructs.h index 20848a1..3c5d66e 100644 --- a/pkg/ixdcgm/include/ixdcgmStructs.h +++ b/pkg/ixdcgm/include/ixdcgmStructs.h @@ -11,1073 +11,68 @@ #include -#include "ixdcgmFields.h" - -#define MAKE_IXDCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U)) - -#define IXDCGM_MAX_STR_LENGTH 256 -#define IXDCGM_MAX_BLOB_LENGTH 4096 - -#define IXDCGM_VGPU_NAME_BUFFER_SIZE 64 -#define IXDCGM_DEVICE_UUID_BUFFER_SIZE 80 - -#define IXDCGM_CONFIG_COMPUTEMODE_DEFAULT 0 -#define IXDCGM_CONFIG_COMPUTEMODE_PROHIBITED 1 -#define IXDCGM_CONFIG_COMPUTEMODE_EXCLUSIVE_PROCESS 2 - -#define IXDCGM_GROUP_MAX_ENTITIES 64 - -/*IXDCGM_FI_DEV_NVLINK_RECOVERY_ERROR_COUNT field not supported, - set IXDCGM_LINK_ERROR_COUNT and IXDCGM_HEALTH_WATCH_LINK_ERROR_NUM_FIELDS to 3. 
*/ -#define IXDCGM_LINK_ERROR_COUNT 3 -#define IXDCGM_HEALTH_WATCH_LINK_ERROR_NUM_FIELDS 3 - -#define IXDCGM_INT32_BLANK 0x7ffffff0 -#define IXDCGM_INT64_BLANK 0x7ffffffffffffff0ll -#define IXDCGM_FP64_BLANK 140737488355328.0 -#define IXDCGM_STR_BLANK "<<>>" - -#define IXDCGM_INT32_NOT_FOUND (IXDCGM_INT32_BLANK + 1) -#define IXDCGM_INT64_NOT_FOUND (IXDCGM_INT64_BLANK + 1) -#define IXDCGM_FP64_NOT_FOUND (IXDCGM_FP64_BLANK + 1.0) -#define IXDCGM_STR_NOT_FOUND "<<>>" - -#define IXDCGM_INT32_NOT_SUPPORTED (IXDCGM_INT32_BLANK + 2) -#define IXDCGM_INT64_NOT_SUPPORTED (IXDCGM_INT64_BLANK + 2) -#define IXDCGM_FP64_NOT_SUPPORTED (IXDCGM_FP64_BLANK + 2.0) -#define IXDCGM_STR_NOT_SUPPORTED "<<>>" - -/* Represents and error where fetching the value is not allowed with our current credentials */ -#define IXDCGM_INT32_NOT_PERMISSIONED (IXDCGM_INT32_BLANK + 3) -#define IXDCGM_INT64_NOT_PERMISSIONED (IXDCGM_INT64_BLANK + 3) -#define IXDCGM_FP64_NOT_PERMISSIONED (IXDCGM_FP64_BLANK + 3.0) -#define IXDCGM_STR_NOT_PERMISSIONED "<<>>" -#define IXDCGM_INT32_IS_BLANK(val) (((val) >= IXDCGM_INT32_BLANK) ? 1 : 0) -#define IXDCGM_INT64_IS_BLANK(val) (((val) >= IXDCGM_INT64_BLANK) ? 1 : 0) -#define IXDCGM_FP64_IS_BLANK(val) (((val) >= IXDCGM_FP64_BLANK ? 
1 : 0)) -#define IXDCGM_STR_IS_BLANK(val) (val == strstr(val, "<<<") && strstr(val, ">>>")) - -#define IXDCGM_AFFINITY_BITMASK_ARRAY_SIZE 8 - typedef enum ixdcgmReturn_enum { - IXDCGM_RET_OK = 0, //!< Success - IXDCGM_RET_BADPARAM = -1, //!< A bad parameter was passed to a function - IXDCGM_RET_GENERIC_ERROR = -3, //!< A generic, unspecified error - IXDCGM_RET_MEMORY = -4, //!< An out of memory error occurred - IXDCGM_RET_NOT_CONFIGURED = -5, //!< Setting not configured - IXDCGM_RET_NOT_SUPPORTED = -6, //!< Feature not supported - IXDCGM_RET_INIT_ERROR = -7, //!< DCGM Init error - IXDCGM_RET_NVML_ERROR = -8, //!< When NVML returns error - IXDCGM_RET_PENDING = -9, //!< Object is in pending state of something else - IXDCGM_RET_UNINITIALIZED = -10, //!< Object is in undefined state - IXDCGM_RET_TIMEOUT = -11, //!< Requested operation timed out - IXDCGM_RET_VER_MISMATCH = -12, //!< Version mismatch between received and understood API - IXDCGM_RET_UNKNOWN_FIELD = -13, //!< Unknown field id - IXDCGM_RET_NO_DATA = -14, //!< No data is available - IXDCGM_RET_STALE_DATA = -15, //!< Data is considered stale - IXDCGM_RET_NOT_WATCHED = -16, //!< The given field id is not being updated by the cache manager - IXDCGM_RET_NO_PERMISSION = -17, //!< Do not have permission to perform the desired action - IXDCGM_RET_GPU_IS_LOST = -18, //!< GPU is no longer reachable - IXDCGM_RET_RESET_REQUIRED = -19, //!< GPU requires a reset - IXDCGM_RET_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error) - IXDCGM_RET_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer - IXDCGM_RET_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM - IXDCGM_RET_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the - //!< requested operation - IXDCGM_RET_MAX_LIMIT = -24, //!< Max limit reached for the object - IXDCGM_RET_LIBRARY_NOT_FOUND = -25, //!< DCGM library could 
not be found - IXDCGM_RET_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function - IXDCGM_RET_GPU_IN_SYNC_BOOST_GROUP = -27, //!< GPU is already a part of a sync boost group - IXDCGM_RET_GPU_NOT_IN_SYNC_BOOST_GROUP = -28, //!< GPU is not a part of a sync boost group - IXDCGM_RET_REQUIRES_ROOT = -29, //!< This operation cannot be performed when the host engine is running as non-root - IXDCGM_RET_IXVS_ERROR = -30, //!< DCGM GPU Diagnostic was successfully executed, but reported an error. - IXDCGM_RET_INSUFFICIENT_SIZE = -31, //!< An input argument is not large enough - IXDCGM_RET_FIELD_UNSUPPORTED_BY_API = -32, //!< The given field ID is not supported by the API being called - IXDCGM_RET_MODULE_NOT_LOADED = -33, //!< This request is serviced by a module of DCGM that is not currently loaded - IXDCGM_RET_IN_USE = -34, //!< The requested operation could not be completed because the affected - //!< resource is in use - IXDCGM_RET_GROUP_IS_EMPTY = - -35, //!< This group is empty and the requested operation is not valid on an empty group - IXDCGM_RET_PROFILING_NOT_SUPPORTED = -36, //!< Profiling is not supported for this group of GPUs or GPU. - IXDCGM_RET_PROFILING_LIBRARY_ERROR = -37, //!< The third-party Profiling module returned an unrecoverable error. - IXDCGM_RET_PROFILING_MULTI_PASS = -38, //!< The requested profiling metrics cannot be collected in a single pass - IXDCGM_RET_DIAG_ALREADY_RUNNING = -39, //!< A diag instance is already running, cannot run a new diag until - //!< the current one finishes. - IXDCGM_RET_DIAG_BAD_JSON = -40, //!< The DCGM GPU Diagnostic returned JSON that cannot be parsed - IXDCGM_RET_DIAG_BAD_LAUNCH = -41, //!< Error while launching the DCGM GPU Diagnostic - IXDCGM_RET_DIAG_UNUSED = -42, //!< Unused - IXDCGM_RET_DIAG_THRESHOLD_EXCEEDED = -43, //!< A field value met or exceeded the error threshold. 
- IXDCGM_RET_INSUFFICIENT_DRIVER_VERSION = -44, //!< The installed driver version is insufficient for this API - IXDCGM_RET_INSTANCE_NOT_FOUND = -45, //!< The specified GPU instance does not exist - IXDCGM_RET_COMPUTE_INSTANCE_NOT_FOUND = -46, //!< The specified GPU compute instance does not exist - IXDCGM_RET_CHILD_NOT_KILLED = -47, //!< Couldn't kill a child process within the retries - IXDCGM_RET_3RD_PARTY_LIBRARY_ERROR = -48, //!< Detected an error in a 3rd-party library - IXDCGM_RET_INSUFFICIENT_RESOURCES = -49, //!< Not enough resources available - IXDCGM_RET_PLUGIN_EXCEPTION = -50, //!< Exception thrown from a diagnostic plugin - IXDCGM_RET_IXVS_ISOLATE_ERROR = -51, //!< The diagnostic returned an error that indicates the need for isolation - IXDCGM_RET_IXVS_BINARY_NOT_FOUND = -52, //!< The NVVS binary was not found in the specified location - IXDCGM_RET_IXVS_KILLED = -53, //!< The NVVS process was killed by a signal - IXDCGM_RET_PAUSED = -54, //!< The hostengine and all modules are paused - IXDCGM_RET_ALREADY_INITIALIZED = -55, //!< The object is already initialized + IXDCGM_RET_OK = 0, //!< Success + IXDCGM_RET_BADPARAM = -1, //!< A bad parameter was passed to a function + IXDCGM_RET_GENERIC_ERROR = -3, //!< A generic, unspecified error + IXDCGM_RET_MEMORY = -4, //!< An out of memory error occurred + IXDCGM_RET_NOT_CONFIGURED = -5, //!< Setting not configured + IXDCGM_RET_NOT_SUPPORTED = -6, //!< Feature not supported + IXDCGM_RET_INIT_ERROR = -7, //!< DCGM Init error + IXDCGM_RET_NVML_ERROR = -8, //!< When NVML returns error + IXDCGM_RET_PENDING = -9, //!< Object is in pending state of something else + IXDCGM_RET_UNINITIALIZED = -10, //!< Object is in undefined state + IXDCGM_RET_TIMEOUT = -11, //!< Requested operation timed out + IXDCGM_RET_VER_MISMATCH = -12, //!< Version mismatch between received and understood API + IXDCGM_RET_UNKNOWN_FIELD = -13, //!< Unknown field id + IXDCGM_RET_NO_DATA = -14, //!< No data is available + IXDCGM_RET_STALE_DATA = 
-15, //!< Data is considered stale + IXDCGM_RET_NOT_WATCHED = -16, //!< The given field id is not being updated by the cache manager + IXDCGM_RET_NO_PERMISSION = -17, //!< Do not have permission to perform the desired action + IXDCGM_RET_GPU_IS_LOST = -18, //!< GPU is no longer reachable + IXDCGM_RET_RESET_REQUIRED = -19, //!< GPU requires a reset + IXDCGM_RET_FUNCTION_NOT_FOUND = -20, //!< The function that was requested was not found (bindings only error) + IXDCGM_RET_CONNECTION_NOT_VALID = -21, //!< The connection to the host engine is not valid any longer + IXDCGM_RET_GPU_NOT_SUPPORTED = -22, //!< This GPU is not supported by DCGM + IXDCGM_RET_GROUP_INCOMPATIBLE = -23, //!< The GPUs of the provided group are not compatible with each other for the + //!< requested operation + IXDCGM_RET_MAX_LIMIT = -24, //!< Max limit reached for the object + IXDCGM_RET_LIBRARY_NOT_FOUND = -25, //!< DCGM library could not be found + IXDCGM_RET_DUPLICATE_KEY = -26, //!< Duplicate key passed to a function + IXDCGM_RET_GPU_IN_SYNC_BOOST_GROUP = -27, //!< GPU is already a part of a sync boost group + IXDCGM_RET_GPU_NOT_IN_SYNC_BOOST_GROUP = -28, //!< GPU is not a part of a sync boost group + IXDCGM_RET_REQUIRES_ROOT = -29, //!< This operation cannot be performed when the host engine is running as non-root + IXDCGM_RET_IXVS_ERROR = -30, //!< DCGM GPU Diagnostic was successfully executed, but reported an error. 
+ IXDCGM_RET_INSUFFICIENT_SIZE = -31, //!< An input argument is not large enough + IXDCGM_RET_FIELD_UNSUPPORTED_BY_API = -32, //!< The given field ID is not supported by the API being called + IXDCGM_RET_MODULE_NOT_LOADED = -33, //!< This request is serviced by a module of DCGM that is not currently loaded + IXDCGM_RET_IN_USE = -34, //!< The requested operation could not be completed because the affected + //!< resource is in use + IXDCGM_RET_GROUP_IS_EMPTY = -35, //!< This group is empty and the requested operation is not valid on an empty group + IXDCGM_RET_PROFILING_NOT_SUPPORTED = -36, //!< Profiling is not supported for this group of GPUs or GPU. + IXDCGM_RET_PROFILING_LIBRARY_ERROR = -37, //!< The third-party Profiling module returned an unrecoverable error. + IXDCGM_RET_PROFILING_MULTI_PASS = -38, //!< The requested profiling metrics cannot be collected in a single pass + IXDCGM_RET_DIAG_ALREADY_RUNNING = -39, //!< A diag instance is already running, cannot run a new diag until + //!< the current one finishes. + IXDCGM_RET_DIAG_BAD_JSON = -40, //!< The DCGM GPU Diagnostic returned JSON that cannot be parsed + IXDCGM_RET_DIAG_BAD_LAUNCH = -41, //!< Error while launching the DCGM GPU Diagnostic + IXDCGM_RET_DIAG_UNUSED = -42, //!< Unused + IXDCGM_RET_DIAG_THRESHOLD_EXCEEDED = -43, //!< A field value met or exceeded the error threshold. 
+ IXDCGM_RET_INSUFFICIENT_DRIVER_VERSION = -44, //!< The installed driver version is insufficient for this API + IXDCGM_RET_INSTANCE_NOT_FOUND = -45, //!< The specified GPU instance does not exist + IXDCGM_RET_COMPUTE_INSTANCE_NOT_FOUND = -46, //!< The specified GPU compute instance does not exist + IXDCGM_RET_CHILD_NOT_KILLED = -47, //!< Couldn't kill a child process within the retries + IXDCGM_RET_3RD_PARTY_LIBRARY_ERROR = -48, //!< Detected an error in a 3rd-party library + IXDCGM_RET_INSUFFICIENT_RESOURCES = -49, //!< Not enough resources available + IXDCGM_RET_PLUGIN_EXCEPTION = -50, //!< Exception thrown from a diagnostic plugin + IXDCGM_RET_IXVS_ISOLATE_ERROR = -51, //!< The diagnostic returned an error that indicates the need for isolation + IXDCGM_RET_IXVS_BINARY_NOT_FOUND = -52, //!< The NVVS binary was not found in the specified location + IXDCGM_RET_IXVS_KILLED = -53, //!< The NVVS process was killed by a signal + IXDCGM_RET_PAUSED = -54, //!< The hostengine and all modules are paused + IXDCGM_RET_ALREADY_INITIALIZED = -55, //!< The object is already initialized } ixdcgmReturn_t; -typedef enum -{ - ixdcgmLogLevelNone = 0, /*!< No logging */ - ixdcgmLogLevelFatal = 1, /*!< Fatal Errors */ - ixdcgmLogLevelError = 2, /*!< Errors */ - ixdcgmLogLevelWarning = 3, /*!< Warnings */ - ixdcgmLogLevelInfo = 4, /*!< Informative */ - ixdcgmLogLevelDebug = 5, /*!< Debug information */ - ixdcgmLogLevelVerbose = 6 /*!< Verbose debugging information */ -} ixdcgmLogLevel_t; - -typedef uintptr_t ixdcgmHandle_t; //!< Identifier for ixDCGM Handle -typedef uintptr_t ixdcgmGpuGrp_t; //!< Identifier for a group of GPUs. A group can have one or more GPUs -typedef uintptr_t ixdcgmFieldGrp_t; //!< Identifier for a group of fields. 
-typedef uintptr_t ixdcgmStatus_t; //!< Identifier for list of status codes - -typedef struct -{ - /* data */ - ixdcgmHandle_t ixdcgmHandler; - const char* logFileName; - ixdcgmLogLevel_t loglevelDefault; -} ixdcgmStartParams; - -typedef struct -{ - unsigned int version; /*!< Version number*/ - unsigned int persistAfterDisconnect; /*!< 1 = do not clean up after this connection. - 0 = clean up after this connection */ - unsigned int timeoutMs; /*!< wait in milliseconds before giving up */ - unsigned int addressIsUnixSocket; /*!< unix socket filename (1) or a TCP/IP address (0) */ -} ixdcgmConnectParams; - -#define ixdcgmConnectParams_version2 MAKE_IXDCGM_VERSION(ixdcgmConnectParams, 2) - -typedef enum ixdcgmOperationMode_enum -{ - IXDCGM_OPERATION_MODE_AUTO = 1, - IXDCGM_OPERATION_MODE_MANUAL = 2 -} ixdcgmOperationMode_t; - -typedef enum -{ - ixdcgmModuleIdCore = 0, //!< Core DCGM - always loaded - ixdcgmModuleIdNvSwitch = 1, //!< NvSwitch Module - ixdcgmModuleIdVGPU = 2, //!< VGPU Module - ixdcgmModuleIdIntrospect = 3, //!< Introspection Module - ixdcgmModuleIdHealth = 4, //!< Health Module - ixdcgmModuleIdPolicy = 5, //!< Policy Module - ixdcgmModuleIdConfig = 6, //!< Config Module - ixdcgmModuleIdDiag = 7, //!< GPU Diagnostic Module - ixdcgmModuleIdProfiling = 8, //!< Profiling Module - ixdcgmModuleIdSysmon = 9, //!< System Monitoring Module - - ixdcgmModuleIdCount //!< Always last. 
1 greater than largest value above -} ixdcgmModuleId_t; - -typedef enum ixdcgmOrder_enum -{ - IXDCGM_ORDER_ASCENDING = 1, //!< Data with earliest (lowest) timestamps returned first - IXDCGM_ORDER_DESCENDING = 2 //!< Data with latest (highest) timestamps returned first -} ixdcgmOrder_t; - -typedef enum -{ - ixdcgmModuleStatusNotLoaded = 0, //!< Module has not been loaded yet - ixdcgmModuleStatusDenylisted = 1, //!< Module is on the denylist; can't be loaded - ixdcgmModuleStatusFailed = 2, //!< Loading the module failed - ixdcgmModuleStatusLoaded = 3, //!< Module has been loaded - ixdcgmModuleStatusUnloaded = 4, //!< Module has been unloaded, happens during shutdown - ixdcgmModuleStatusPaused = 5, /*!< Module has been paused. This is a temporary state that will - move to ixdcgmModuleStatusLoaded once the module is resumed. - This status implies that the module is loaded. */ -} ixdcgmModuleStatus_t; - -typedef struct -{ - ixdcgmModuleId_t id; //!< ID of this module - ixdcgmModuleStatus_t status; //!< Status of this module -} ixdcgmModuleGetStatusesModule_t; - -/* This is larger than ixdcgmModuleIdCount so we can add modules without versioning this request */ -#define IXDCGM_MODULE_STATUSES_CAPACITY 16 - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be ixdcgmModuleGetStatuses_version1 - unsigned int numStatuses; //!< Number of entries in statuses[] that are populated - ixdcgmModuleGetStatusesModule_t statuses[IXDCGM_MODULE_STATUSES_CAPACITY]; //!< Per-module status information -} ixdcgmModuleGetStatuses_v1; - -/** - * Version 1 of dcgmModuleGetStatuses - */ -#define ixdcgmModuleGetStatuses_version1 MAKE_IXDCGM_VERSION(ixdcgmModuleGetStatuses_v1, 1) -#define ixdcgmModuleGetStatuses_version ixdcgmModuleGetStatuses_version1 -typedef ixdcgmModuleGetStatuses_v1 ixdcgmModuleGetStatuses_t; - -typedef struct -{ - unsigned int version; /*!< Version number. 
Use ixdcgmStartEmbeddedV2Params_version2 */ - ixdcgmOperationMode_t opMode; /*!< IN: Collect data automatically or manually when asked by the user. */ - ixdcgmHandle_t ixdcgmHandle; /*!< OUT: DCGM Handle to use for API calls */ - const char* logFile; /*!< IN: File that DCGM should log to. NULL = do not log. '-' = stdout */ - ixdcgmLogLevel_t logLevel; /*!< IN: Severity at which DCGM should log to logFile */ - unsigned int denyListCount; /*!< IN: Number of modules to be added to the denylist in denyList[] */ - const char* serviceAccount; /*!< IN: Service account for unprivileged processes */ - ixdcgmModuleId_t denyList[ixdcgmModuleIdCount]; /*!< IN: IDs of modules to be added to the denylist */ - char _padding[4]; /*!< IN: Unused. Aligns the struct to 8 bytes. */ -} ixdcgmStartEmbeddedParam; - -typedef unsigned int ixdcgm_connection_id_t; -#define IXDCGM_CONNECTION_ID_NONE ((ixdcgm_connection_id_t)0) - -#define IXDCGM_HOSTENGINE_DEFAULT_PORT 5777 -#define IXDCGM_HOSTENGINE_LOCAL_ADDR "0.0.0.0" // Default set to listen to ALL IP addrs -#define IXDCGM_HOSTENGINE_DEFAULT_SOCKET "/tmp/ix-hostengine" // Default set to listen to ALL IP addrs - -#define IXDCGM_EMBEDDED_HANDLE 0x7fffffff -#define IXDCGM_MAX_NUM_DEVICES 32 -#define IXDCGM_MAX_NUM_GROUPS 64 - -#define IXDCGM_CMI_F_WATCHED 0x00000001 /* Is this field being watched? 
*/ - -typedef struct -{ - unsigned int version; - char rawBuildInfoString[IXDCGM_MAX_STR_LENGTH * 2]; -} ixdcgmVersionInfo_v2; - -#define ixdcgmVersionInfo_version2 MAKE_IXDCGM_VERSION(ixdcgmVersionInfo_v2, 2) - -#define ixdcgmVersionInfo_version ixdcgmVersionInfo_version2 -typedef ixdcgmVersionInfo_v2 ixdcgmVersionInfo_t; - -/** - * Type of GPU groups - */ -typedef enum ixdcgmGroupType_enum -{ - IXDCGM_GROUP_DEFAULT = 0, //!< All the GPUs on the node are added to the group - IXDCGM_GROUP_EMPTY = 1, //!< Creates an empty group - IXDCGM_GROUP_DEFAULT_NVSWITCHES = 2, //!< All NvSwitches of the node are added to the group - IXDCGM_GROUP_DEFAULT_INSTANCES = 3, //!< All GPU instances of the node are added to the group - IXDCGM_GROUP_DEFAULT_COMPUTE_INSTANCES = 4, //!< All compute instances of the node are added to the group - IXDCGM_GROUP_DEFAULT_EVERYTHING = 5, //!< All entities are added to this default group -} ixdcgmGroupType_t; - -/** - * Identifies for special IXDCGM groups - */ -#define IXDCGM_GROUP_ALL_GPUS 0x7fffffff -#define IXDCGM_GROUP_ALL_NVSWITCHES 0x7ffffffe -#define IXDCGM_GROUP_ALL_INSTANCES 0x7ffffffd -#define IXDCGM_GROUP_ALL_COMPUTE_INSTANCES 0x7ffffffc -#define IXDCGM_GROUP_ALL_ENTITIES 0x7ffffffb - -#define IXDCGM_MAX_CLOCKS 256 -#define IXDCGM_GEGE_FLAG_ONLY_SUPPORTED 0x00000001 - -#define IXDCGM_GROUP_MAX_ENTITIES 64 -#define IXDCGM_MAX_FIELD_IDS_PER_FIELD_GROUP 128 -#define IXDCGM_MAX_NUM_FIELD_GROUPS 64 - -#define IXDCGM_FV_FLAG_LIVE_DATA 0x00000001 -/** - * Default maximum age of samples kept (usec) - */ -#define IXDCGM_MAX_AGE_USEC_DEFAULT 30000000 - -typedef struct -{ - int version; //!< Version Number (ixdcgmClockSet_version) - unsigned int memClock; //!< Memory Clock (Memory Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible - //!< value with smClk) - unsigned int smClock; //!< SM Clock (SM Clock value OR DCGM_INT32_BLANK to Ignore/Use compatible value with memClk) -} ixdcgmClockSet_v1; - -/** - * Typedef for \ref ixdcgmClockSet_v1 - 
*/ -typedef ixdcgmClockSet_v1 ixdcgmClockSet_t; - -/** - * Version 1 for \ref ixdcgmClockSet_v1 - */ -#define ixdcgmClockSet_version1 MAKE_IXDCGM_VERSION(ixdcgmClockSet_v1, 1) - -/** - * Latest version for \ref ixdcgmClockSet_t - */ -#define ixdcgmClockSet_version ixdcgmClockSet_version1 - -typedef struct -{ - unsigned int version; //!< Version Number (ixdcgmDeviceSupportedClockSets_version) - unsigned int count; //!< Number of supported clocks - ixdcgmClockSet_t - clockSet[IXDCGM_MAX_CLOCKS]; //!< Valid clock sets for the device. Upto \ref count entries are filled -} ixdcgmDeviceSupportedClockSets_v1; -/** - * Typedef for \ref ixdcgmDeviceSupportedClockSets_v1 - */ -typedef ixdcgmDeviceSupportedClockSets_v1 ixdcgmDeviceSupportedClockSets_t; - -/** - * Version 1 for \ref ixdcgmDeviceSupportedClockSets_v1 - */ -#define ixdcgmDeviceSupportedClockSets_version1 MAKE_IXDCGM_VERSION(ixdcgmDeviceSupportedClockSets_v1, 1) - -/** - * Latest version for \ref ixdcgmDeviceSupportedClockSets_t - */ -#define ixdcgmDeviceSupportedClockSets_version ixdcgmDeviceSupportedClockSets_version1 - -typedef struct -{ - ixdcgm_field_entity_group_t entityGroupId; //!< Entity Group ID entity belongs to - ixdcgm_field_eid_t entityId; //!< Entity ID of the entity -} ixdcgmGroupEntityPair_t; - -typedef struct -{ - unsigned int version; //!< Version Number (use ixdcgmGroupInfo_version2) - unsigned int count; //!< count of entityIds returned in \a entityList - char groupName[IXDCGM_MAX_STR_LENGTH]; //!< Group Name - ixdcgmGroupEntityPair_t entityList[IXDCGM_GROUP_MAX_ENTITIES]; //!< List of the entities that are in this group -} ixdcgmGroupInfo_v2; - -/** - * Typedef for \ref ixdcgmGroupInfo_v2 - */ -typedef ixdcgmGroupInfo_v2 ixdcgmGroupInfo_t; - -/** - * Version 2 for \ref ixdcgmGroupInfo_v2 - */ -#define ixdcgmGroupInfo_version2 MAKE_IXDCGM_VERSION(ixdcgmGroupInfo_v2, 2) - -/** - * Latest version for \ref ixdcgmGroupInfo_t - */ -#define ixdcgmGroupInfo_version ixdcgmGroupInfo_version2 - 
-typedef struct -{ - unsigned int version; //!< Version Number - unsigned int slowdownTemp; //!< Slowdown temperature - unsigned int shutdownTemp; //!< Shutdown temperature -} ixdcgmDeviceThermals; -typedef ixdcgmDeviceThermals ixdcgmDeviceThermals_t; - -typedef struct -{ - unsigned int version; //!< Version Number - unsigned int curPowerLimit; //!< Power management limit associated with this device (in W) - unsigned int defaultPowerLimit; //!< Power management limit effective at device boot (in W) - unsigned int enforcedPowerLimit; //!< Effective power limit that the driver enforces after taking into account - //!< all limiters (in W) - unsigned int minPowerLimit; //!< Minimum power management limit (in W) - unsigned int maxPowerLimit; //!< Maximum power management limit (in W) -} ixdcgmDevicePowerLimits; -typedef ixdcgmDevicePowerLimits ixdcgmDevicePowerLimits_t; - -typedef struct -{ - unsigned int version; //!< Version Number (ixdcgmDeviceIdentifiers_version) - char brandName[IXDCGM_MAX_STR_LENGTH]; //!< Brand Name - char deviceName[IXDCGM_MAX_STR_LENGTH]; //!< Name of the device - char pciBusId[IXDCGM_MAX_STR_LENGTH]; //!< PCI Bus ID - char serial[IXDCGM_MAX_STR_LENGTH]; //!< Serial for the device - char uuid[IXDCGM_MAX_STR_LENGTH]; //!< UUID for the device - char vbios[IXDCGM_MAX_STR_LENGTH]; //!< VBIOS version - char inforomImageVersion[IXDCGM_MAX_STR_LENGTH]; //!< Inforom Image version - unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id - unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID - char driverVersion[IXDCGM_MAX_STR_LENGTH]; //!< Driver Version - unsigned int virtualizationMode; //!< Virtualization Mode -} ixdcgmDeviceIdentifiers_v1; -typedef ixdcgmDeviceIdentifiers_v1 ixdcgmDeviceIdentifiers_t; - -typedef struct -{ - unsigned int version; //!< Version Number (ixdcgmDeviceMemoryUsage_version) - unsigned int bar1Total; //!< Total BAR1 size in megabytes - unsigned int fbTotal; //!< Total framebuffer memory 
in megabytes - unsigned int fbUsed; //!< Used framebuffer memory in megabytes - unsigned int fbFree; //!< Free framebuffer memory in megabytes -} ixdcgmDeviceMemoryUsage_v1; -typedef ixdcgmDeviceMemoryUsage_v1 ixdcgmDeviceMemoryUsage_t; - -typedef struct -{ - unsigned int version; - unsigned int persistenceModeEnabled; - unsigned int migModeEnabled; - unsigned int confidentialComputeMode; -} ixdcgmDeviceSettings; - -typedef ixdcgmDeviceSettings ixdcgmDeviceSettings_t; - -typedef struct -{ - unsigned int version; //!< Version number (ixdcgmDeviceAttributes_version) - ixdcgmDeviceSupportedClockSets_t clockSets; //!< Supported clocks for the device - ixdcgmDeviceThermals_t thermalSettings; //!< Thermal settings for the device - ixdcgmDevicePowerLimits_t powerLimits; //!< Various power limits for the device - ixdcgmDeviceIdentifiers_t identifiers; //!< Identifiers for the device - ixdcgmDeviceMemoryUsage_t memoryUsage; //!< Memory usage info for the device - ixdcgmDeviceSettings_t settings; //!< Basic device settings -} ixdcgmDeviceAttributes; - -typedef ixdcgmDeviceAttributes ixdcgmDeviceAttributes_t; -#define ixdcgmDeviceAttributes_version3 MAKE_IXDCGM_VERSION(ixdcgmDeviceAttributes, 3) -#define ixdcgmDeviceAttributes_version ixdcgmDeviceAttributes_version3 - -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (ixdcgmFieldValue_version1) - - unsigned short fieldId; //!< One of IXDCGM_FI_? - unsigned short fieldType; //!< One of IXDCGM_FT_? - int status; //!< Status for the querying the field. IXDCGM_RET_OK or one of IXDCGM_RET_? 
- int64_t ts; //!< Timestamp in usec since 1970 - union { - int64_t i64; //!< Int64 value - double dbl; //!< Double value - char str[IXDCGM_MAX_STR_LENGTH]; //!< NULL terminated string - char blob[IXDCGM_MAX_BLOB_LENGTH]; //!< Binary blob - } value; //!< Value -} ixdcgmFieldValue_v1; -#define ixdcgmFieldValue_version1 MAKE_IXDCGM_VERSION(ixdcgmFieldValue_v1, 1) - -typedef struct -{ - // version must always be first - unsigned int version; //!< version number (ixdcgmFieldValue_version2) - ixdcgm_field_entity_group_t entityGroupId; //!< Entity group this field value's entity belongs to - ixdcgm_field_eid_t entityId; //!< Entity this field value belongs to - unsigned short fieldId; //!< One of IXDCGM_FI_? - unsigned short fieldType; //!< One of IXDCGM_FT_? - int status; //!< Status for the querying the field. IXDCGM_RET_OK or one of IXDCGM_RET_? - unsigned int unused; //!< Unused for now to align ts to an 8-byte boundary. - int64_t ts; //!< Timestamp in usec since 1970 - union { - int64_t i64; //!< Int64 value - double dbl; //!< Double value - char str[IXDCGM_MAX_STR_LENGTH]; //!< NULL terminated string - char blob[IXDCGM_MAX_BLOB_LENGTH]; //!< Binary blob - } value; //!< Value -} ixdcgmFieldValue_v2; -#define ixdcgmFieldValue_version2 MAKE_IXDCGM_VERSION(ixdcgmFieldValue_v2, 2) - -/** - * User callback function for processing one or more field updates. This callback will - * be invoked one or more times per field until all of the expected field values have been - * enumerated. It is up to the callee to detect when the field id changes - * - * @param gpuId IN: GPU ID of the GPU this field value set belongs to - * @param values IN: Field values. These values must be copied as they will be destroyed as soon as this - * call returns. - * @param numValues IN: Number of entries that are valid in values[] - * @param userData IN: User data pointer passed to the update function that generated this callback - * - * @returns - * 0 if OK - * <0 if enumeration should stop. 
This allows to callee to abort field value enumeration. - * - */ -typedef int (*ixdcgmFieldValueEnumeration_f)(unsigned int gpuId, - ixdcgmFieldValue_v1* values, - int numValues, - void* userData); - -/* Bitmask values for ixdcgmGetFieldIdSummary - Sync with DcgmcmSummaryType_t */ -#define IXDCGM_SUMMARY_MIN 0x00000001 -#define IXDCGM_SUMMARY_MAX 0x00000002 -#define IXDCGM_SUMMARY_AVG 0x00000004 -#define IXDCGM_SUMMARY_SUM 0x00000008 -#define IXDCGM_SUMMARY_COUNT 0x00000010 -#define IXDCGM_SUMMARY_INTEGRAL 0x00000020 -#define IXDCGM_SUMMARY_DIFF 0x00000040 -#define IXDCGM_SUMMARY_SIZE 7 - -/* ixdcgmSummaryResponse_t is part of ixdcgmFieldSummaryRequest, so it uses ixdcgmFieldSummaryRequest's version. */ - -typedef struct -{ - unsigned int fieldType; //!< type of field that is summarized (int64 or fp64) - unsigned int summaryCount; //!< the number of populated summaries in \ref values - union { - int64_t i64; - double fp64; - } values[IXDCGM_SUMMARY_SIZE]; //!< array for storing the values of each summary. The summaries are stored - //!< in order. For example, if MIN AND MAX are requested, then 0 will be MIN - //!< and 1 will be MAX. If AVG and DIFF were requested, then AVG would be 0 - //!< and 1 would be DIFF -} ixdcgmSummaryResponse_t; - -typedef struct -{ - unsigned int version; //!< version of this message - ixdcgmFieldSummaryRequest_v1 - unsigned short fieldId; //!< field id to be summarized - ixdcgm_field_entity_group_t entityGroupId; //!< the type of entity whose field we're getting - ixdcgm_field_eid_t entityId; //!< ordinal id for this entity - uint32_t summaryTypeMask; //!< bit-mask of IXDCGM_SUMMARY_*, the requested summaries - uint64_t startTime; //!< start time for the interval being summarized. 0 means to use - //!< any data before. - uint64_t endTime; //!< end time for the interval being summarized. 0 means to use - //!< any data after. 
- ixdcgmSummaryResponse_t response; //!< response data for this request -} ixdcgmFieldSummaryRequest; - -typedef ixdcgmFieldSummaryRequest ixdcgmFieldSummaryRequest_t; - -#define ixdcgmFieldSummaryRequest_version1 MAKE_IXDCGM_VERSION(ixdcgmFieldSummaryRequest, 1) - -typedef struct -{ - ixdcgmFieldSummaryRequest_t fsr; //!< IN/OUT: field summary populated on success - unsigned int cmdRet; //!< OUT: Error code generated -} ixdcgmGetFieldSummary_v1; - -typedef int (*ixdcgmFieldValueEntityEnumeration_f)(ixdcgm_field_entity_group_t entityGroupId, - ixdcgm_field_eid_t entityId, - ixdcgmFieldValue_v1* values, - int numValues, - void* userData); -typedef enum ixdcgmPerGpuTestIndices_enum -{ - IXDCGM_MEMORY_INDEX = 0, //!< Memory test index - IXDCGM_DIAGNOSTIC_INDEX = 1, //!< Diagnostic test index - IXDCGM_PCIE_INDEX = 2, //!< PCIe test index - IXDCGM_SM_STRESS_INDEX = 3, //!< SM Stress test index - IXDCGM_TARGETED_STRESS_INDEX = 4, //!< Targeted Stress test index - IXDCGM_TARGETED_POWER_INDEX = 5, //!< Targeted Power test index - IXDCGM_MEMORY_BANDWIDTH_INDEX = 6, //!< Memory bandwidth test index - IXDCGM_MEMTEST_INDEX = 7, //!< Memtest test index - IXDCGM_PULSE_TEST_INDEX = 8, //!< Pulse test index - IXDCGM_EUD_TEST_INDEX = 9, //!< EUD test index - // Remaining tests are included for convenience but have different execution rules - // See IXDCGM_PER_GPU_TEST_COUNT - IXDCGM_UNUSED2_TEST_INDEX = 10, - IXDCGM_UNUSED3_TEST_INDEX = 11, - IXDCGM_UNUSED4_TEST_INDEX = 12, - IXDCGM_UNUSED5_TEST_INDEX = 13, - IXDCGM_SOFTWARE_INDEX = 14, //!< Software test index - IXDCGM_CONTEXT_CREATE_INDEX = 15, //!< Context create test index - IXDCGM_UNKNOWN_INDEX = 16 //!< Unknown test -} ixdcgmPerGpuTestIndices_t; - -typedef enum ixdcgmChipArchitecture_enum -{ - IXDCGM_CHIP_ARCH_OLDER = 1, //!< All GPUs older than Kepler - IXDCGM_CHIP_ARCH_NVKEPLER = 2, //!< All Kepler-architecture parts - IXDCGM_CHIP_ARCH_NVMAXWELL = 3, //!< All Maxwell-architecture parts - IXDCGM_CHIP_ARCH_NVPASCAL = 4, //!< 
All Pascal-architecture parts - IXDCGM_CHIP_ARCH_NVVOLTA = 5, //!< All Volta-architecture parts - IXDCGM_CHIP_ARCH_NVTURING = 6, //!< All Turing-architecture parts - IXDCGM_CHIP_ARCH_NVAMPERE = 7, //!< All Ampere-architecture parts - IXDCGM_CHIP_ARCH_NVADA = 8, //!< All Ada-architecture parts - IXDCGM_CHIP_ARCH_NVHOPPER = 9, //!< All Hopper-architecture parts - - IXDCGM_CHIP_ARCH_IX_BI = 100, - IXDCGM_CHIP_ARCH_IX_MR = 101, - IXDCGM_CHIP_ARCH_COUNT, //!< Keep this second to last, exclude unknown - IXDCGM_CHIP_ARCH_UNKNOWN = 0xffffffff //!< Anything else, presumably something newer -} ixdcgmChipArchitecture_t; - -typedef enum -{ - IXDCGM_GPU_VIRTUALIZATION_MODE_NONE = 0, //!< Represents Bare Metal GPU - IXDCGM_GPU_VIRTUALIZATION_MODE_PASSTHROUGH = 1, //!< Device is associated with GPU-Passthrough - IXDCGM_GPU_VIRTUALIZATION_MODE_VGPU = 2, //!< Device is associated with vGPU inside virtual machine. - IXDCGM_GPU_VIRTUALIZATION_MODE_HOST_VGPU = 3, //!< Device is associated with VGX hypervisor in vGPU mode - IXDCGM_GPU_VIRTUALIZATION_MODE_HOST_VSGA = 4, //!< Device is associated with VGX hypervisor in vSGA mode -} ixdcgmGpuVirtualizationMode_t; - -typedef struct -{ - unsigned int syncBoost; //!< Sync Boost Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored). Note that - //!< using this setting may result in lower clocks than targetClocks - ixdcgmClockSet_t targetClocks; //!< Target clocks. Set smClock and memClock to DCGM_INT32_BLANK to ignore/use - //!< compatible values. 
For GPUs > Maxwell, setting this implies autoBoost=0 -} ixdcgmConfigPerfStateSettings_t; - -typedef enum ixdcgmConfigPowerLimitType_enum -{ - IXDCGM_CONFIG_POWER_CAP_INDIVIDUAL = 0, //!< Represents the power cap to be applied for each member of the group - IXDCGM_CONFIG_POWER_BUDGET_GROUP = 1, //!< Represents the power budget for the entire group -} ixdcgmConfigPowerLimitType_t; - -typedef struct -{ - ixdcgmConfigPowerLimitType_t - type; //!< Flag to represent power cap for each GPU or power budget for the group of GPUs - unsigned int val; //!< Power Limit in Watts (Set a value OR DCGM_INT32_BLANK to Ignore) -} ixdcgmConfigPowerLimit_t; - -typedef struct -{ - unsigned int version; //!< Version number (ixdcgmConfig_version) - unsigned int gpuId; //!< GPU ID - unsigned int eccMode; //!< ECC Mode (0: Disabled, 1 : Enabled, DCGM_INT32_BLANK : Ignored) - unsigned int computeMode; //!< Compute Mode (One of DCGM_CONFIG_COMPUTEMODE_? OR DCGM_INT32_BLANK to Ignore) - ixdcgmConfigPerfStateSettings_t perfState; //!< Performance State Settings (clocks / boost mode) - ixdcgmConfigPowerLimit_t powerLimit; //!< Power Limits -} ixdcgmConfig_v1; - -typedef ixdcgmConfig_v1 ixdcgmConfig_t; - -#define ixdcgmConfig_version1 MAKE_IXDCGM_VERSION(ixdcgmConfig_v1, 1) - -#define ixdcgmConfig_version ixdcgmConfig_version1 - -typedef enum ixdcgmConfigType_enum -{ - IXDCGM_CONFIG_TARGET_STATE = 0, //!< The target configuration values to be applied - IXDCGM_CONFIG_CURRENT_STATE = 1, //!< The current configuration state -} ixdcgmConfigType_t; - -typedef enum ixdcgmLinkState_enum -{ - ixdcgmLinkStateNotSupported = 0, //!< Link is unsupported by this GPU (Default for GPUs) - ixdcgmLinkStateDisabled = 1, //!< Link is supported for this link but this link is disabled - ixdcgmLinkStateDown = 2, //!< This Link link is down (inactive) - ixdcgmLinkStateUp = 3 //!< This Link link is up (active) -} ixdcgmLinkState_t; - -#define IXDCGM_MAX_LINKS_PER_GPU 18 -#define IXDCGM_MAX_NUM_SWITCHES 12 -#define 
IXDCGM_MAX_LINKS_PER_SWITCH 64 - -typedef struct -{ - ixdcgm_field_eid_t entityId; //!< Entity ID of the GPU (gpuId) - ixdcgmLinkState_t linkState[IXDCGM_MAX_LINKS_PER_GPU]; //!< Per-GPU link states -} ixdcgmLinkGpuLinkStatus_v3; - -typedef struct -{ - ixdcgm_field_eid_t entityId; //!< Entity ID of the NvSwitch (physicalId) - ixdcgmLinkState_t linkState[IXDCGM_MAX_LINKS_PER_SWITCH]; //!< Per-NvSwitch link states -} ixdcgmSwitchLinkStatus_t; - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmNvLinkStatus_version1 - unsigned int numGpus; //!< Number of entries in gpus[] that are populated - ixdcgmLinkGpuLinkStatus_v3 gpus[IXDCGM_MAX_NUM_DEVICES]; //!< Per-GPU NvLink link statuses - unsigned int numNvSwitches; //!< Number of entries in nvSwitches[] that are populated - ixdcgmSwitchLinkStatus_t nvSwitches[IXDCGM_MAX_NUM_SWITCHES]; //!< Per-NvSwitch link statuses -} ixdcgmLinkStatus_v3; - -typedef ixdcgmLinkStatus_v3 ixdcgmLinkStatus_t; - -#define ixdcgmLinkStatus_version3 MAKE_IXDCGM_VERSION(ixdcgmLinkStatus_v3, 3) - -typedef struct -{ - unsigned int version; //!< Version Number. Should match ixdcgmDevicePidAccountingStats_version - unsigned int pid; //!< Process id of the process these stats are for - unsigned int gpuUtilization; //!< Percent of time over the process's lifetime during which one or more kernels - //!< was executing on the GPU. - //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported - unsigned int memoryUtilization; //!< Percent of time over the process's lifetime during which global (device) - //!< memory was being read or written. - //!< Set to DCGM_INT32_NOT_SUPPORTED if is not supported - unsigned long long maxMemoryUsage; //!< Maximum total memory in bytes that was ever allocated by the process. 
- //!< Set to DCGM_INT64_NOT_SUPPORTED if is not supported - unsigned long long startTimestamp; //!< CPU Timestamp in usec representing start time for the process - unsigned long long activeTimeUsec; //!< Amount of time in usec during which the compute context was active. - //!< Note that this does not mean the context was being used. endTimestamp - //!< can be computed as startTimestamp + activeTime -} ixdcgmDevicePidAccountingStats_v1; - -/** - * Typedef for \ref ixdcgmDevicePidAccountingStats_v1 - */ -typedef ixdcgmDevicePidAccountingStats_v1 ixdcgmDevicePidAccountingStats_t; -#define ixdcgmDevicePidAccountingStats_version1 MAKE_IXDCGM_VERSION(ixdcgmDevicePidAccountingStats_v1, 1) -#define ixdcgmDevicePidAccountingStats_version ixdcgmDevicePidAccountingStats_version1 - -typedef struct -{ - unsigned int pid; - double smUtil; - double memUtil; -} ixdcgmProcessUtilInfo_t; - -typedef struct -{ - double util; - unsigned int pid; -} ixdcgmProcessUtilSample_t; - -typedef struct -{ - unsigned int version; //!< Version Number (ixdcgmDeviceVgpuProcessUtilInfo_version) - union { - unsigned int vgpuId; //!< vGPU instance ID - unsigned int vgpuProcessSamplesCount; //!< Count of processes running in the vGPU VM,for which utilization - //!< rates are being reported in this cycle. - } vgpuProcessUtilInfo; - unsigned int pid; //!< Process ID of the process running in the vGPU VM. - char processName[IXDCGM_VGPU_NAME_BUFFER_SIZE]; //!< Process Name of process running in the vGPU VM. - unsigned int smUtil; //!< GPU utilization of process running in the vGPU VM. - unsigned int memUtil; //!< Memory utilization of process running in the vGPU VM. - unsigned int encUtil; //!< Encoder utilization of process running in the vGPU VM. - unsigned int decUtil; //!< Decoder utilization of process running in the vGPU VM. 
-} ixdcgmDeviceVgpuProcessUtilInfo_v1; - -/** - * Typedef for \ref ixdcgmDeviceVgpuProcessUtilInfo_v1 - */ -typedef ixdcgmDeviceVgpuProcessUtilInfo_v1 ixdcgmDeviceVgpuProcessUtilInfo_t; - -/** - * Version 1 for \ref ixdcgmDeviceVgpuProcessUtilInfo_v1 - */ -#define ixdcgmDeviceVgpuProcessUtilInfo_version1 MAKE_IXDCGM_VERSION(ixdcgmDeviceVgpuProcessUtilInfo_v1, 1) - -typedef enum ixdcgmGpuLevel_enum -{ - IXDCGM_TOPOLOGY_UNINITIALIZED = 0x0, - - /** \name PCI connectivity states */ - /**@{*/ - IXDCGM_TOPOLOGY_BOARD = 0x1, //!< multi-GPU board - IXDCGM_TOPOLOGY_SINGLE = 0x2, //!< all devices that only need traverse a single PCIe switch - IXDCGM_TOPOLOGY_MULTIPLE = 0x4, //!< all devices that need not traverse a host bridge - IXDCGM_TOPOLOGY_HOSTBRIDGE = 0x8, //!< all devices that are connected to the same host bridge - IXDCGM_TOPOLOGY_CPU = 0x10, //!< all devices that are connected to the same CPU but possibly multiple host bridges - IXDCGM_TOPOLOGY_SYSTEM = 0x20, //!< all devices in the system - /**@}*/ - - /** \name LINK connectivity states */ - /**@{*/ - IXDCGM_TOPOLOGY_LINK1 = 0x0100, //!< GPUs connected via a single LINK link - IXDCGM_TOPOLOGY_LINK2 = 0x0200, //!< GPUs connected via two LINK links - IXDCGM_TOPOLOGY_LINK3 = 0x0400, //!< GPUs connected via three LINK links - IXDCGM_TOPOLOGY_LINK4 = 0x0800, //!< GPUs connected via four LINK links - IXDCGM_TOPOLOGY_LINK5 = 0x1000, //!< GPUs connected via five LINK links - IXDCGM_TOPOLOGY_LINK6 = 0x2000, //!< GPUs connected via six LINK links - IXDCGM_TOPOLOGY_LINK7 = 0x4000, //!< GPUs connected via seven LINK links - IXDCGM_TOPOLOGY_LINK8 = 0x8000, //!< GPUs connected via eight LINK links - IXDCGM_TOPOLOGY_LINK9 = 0x10000, //!< GPUs connected via nine LINK links - IXDCGM_TOPOLOGY_LINK10 = 0x20000, //!< GPUs connected via ten LINK links - IXDCGM_TOPOLOGY_LINK11 = 0x40000, //!< GPUs connected via 11 LINK links - IXDCGM_TOPOLOGY_LINK12 = 0x80000, //!< GPUs connected via 12 LINK links - IXDCGM_TOPOLOGY_LINK13 = 0x100000, 
//!< GPUs connected via 13 LINK links - IXDCGM_TOPOLOGY_LINK14 = 0x200000, //!< GPUs connected via 14 LINK links - IXDCGM_TOPOLOGY_LINK15 = 0x400000, //!< GPUs connected via 15 LINK links - IXDCGM_TOPOLOGY_LINK16 = 0x800000, //!< GPUs connected via 16 LINK links - IXDCGM_TOPOLOGY_LINK17 = 0x1000000, //!< GPUs connected via 17 LINK links - IXDCGM_TOPOLOGY_LINK18 = 0x2000000, //!< GPUs connected via 18 LINK links - /**@}*/ -} ixdcgmGpuTopologyLevel_t; - -// the PCI paths are the lower 8 bits of the path information -#define IXDCGM_TOPOLOGY_PATH_PCI(x) (ixdcgmGpuTopologyLevel_t)((unsigned int)(x) & 0xFF) - -// the LINK paths are the upper 24 bits of the path information -#define IXDCGM_TOPOLOGY_PATH_LINK(x) (ixdcgmGpuTopologyLevel_t)((unsigned int)(x) & 0xFFFFFF00) - -#define IXDCGM_AFFINITY_BITMASK_ARRAY_SIZE 8 - -/** No hints specified */ -#define IXDCGM_TOPO_HINT_F_NONE 0x00000000 - -/** Ignore the health of the GPUs when picking GPUs for job - * execution. By default, only healthy GPUs are considered. - */ -#define IXDCGM_TOPO_HINT_F_IGNOREHEALTH 0x00000001 - -/** - * Device topology information - */ -typedef struct -{ - unsigned int version; //!< version number (ixdcgmDeviceTopology_version) - - unsigned long cpuAffinityMask[IXDCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< affinity mask for the specified GPU - //!< a 1 represents affinity to the CPU in that - //!< bit position supports up to 256 cores - unsigned int numGpus; //!< number of valid entries in gpuPaths - - struct - { - unsigned int gpuId; //!< gpuId to which the path represents - ixdcgmGpuTopologyLevel_t path; //!< path to the gpuId from this GPU. Note that this is a bit-mask - //!< of IXDCGM_TOPOLOGY_* values and can contain both PCIe topology - //!< and NvLink topology where applicable. For instance: - //!< 0x210 = IXDCGM_TOPOLOGY_CPU | IXDCGM_TOPOLOGY_LINK2 - //!< Use the macros IXDCGM_TOPOLOGY_PATH_LINK and - //!< IXDCGM_TOPOLOGY_PATH_PCI to mask the NvLink and PCI paths, respectively. 
- unsigned int localLinkIds; //!< bits representing the local links connected to gpuId - //!< e.g. if this field == 3, links 0 and 1 are connected, - //!< field is only valid if LINKS actually exist between GPUs - } gpuPaths[IXDCGM_MAX_NUM_DEVICES - 1]; -} ixdcgmDeviceTopology_v1; - -/** - * Typedef for \ref ixdcgmDeviceTopology_v1 - */ -typedef ixdcgmDeviceTopology_v1 ixdcgmDeviceTopology_t; - -/** - * Version 1 for \ref ixdcgmDeviceTopology_v1 - */ -#define ixdcgmDeviceTopology_version1 MAKE_IXDCGM_VERSION(ixdcgmDeviceTopology_v1, 1) - -/** - * Latest version for \ref ixdcgmDeviceTopology_t - */ -#define ixdcgmDeviceTopology_version ixdcgmDeviceTopology_version1 - -/** - * Group topology information - */ -typedef struct -{ - unsigned int version; //!< version number (ixdcgmGroupTopology_version) - - unsigned long - groupCpuAffinityMask[IXDCGM_AFFINITY_BITMASK_ARRAY_SIZE]; //!< the CPU affinity mask for all GPUs in the group - //!< a 1 represents affinity to the CPU in that bit - //!< position supports up to 256 cores - unsigned int numaOptimalFlag; //!< a zero value indicates that 1 or more GPUs - //!< in the group have a different CPU affinity and thus - //!< may not be optimal for certain algorithms - ixdcgmGpuTopologyLevel_t slowestPath; //!< the slowest path amongst GPUs in the group -} ixdcgmGroupTopology_v1; - -/** - * Typedef for \ref ixdcgmGroupTopology_v1 - */ -typedef ixdcgmGroupTopology_v1 ixdcgmGroupTopology_t; - -/** - * Version 1 for \ref ixdcgmGroupTopology_v1 - */ -#define ixdcgmGroupTopology_version1 MAKE_IXDCGM_VERSION(ixdcgmGroupTopology_v1, 1) - -/** - * Latest version for \ref ixdcgmGroupTopology_t - */ -#define ixdcgmGroupTopology_version ixdcgmGroupTopology_version1 - -/** - * Running process information for a compute or graphics process - */ -typedef struct -{ - unsigned int version; //!< Version of this message (ixdcgmRunningProcess_version) - unsigned int pid; //!< PID of the process - unsigned long long memoryUsed; //!< GPU memory used 
by this process in bytes. -} ixdcgmRunningProcess_v1; - -/** - * Typedef for \ref ixdcgmRunningProcess_v1 - */ -typedef ixdcgmRunningProcess_v1 ixdcgmRunningProcess_t; - -/** - * Version 1 for \ref ixdcgmRunningProcess_v1 - */ -#define ixdcgmRunningProcess_version1 MAKE_IXDCGM_VERSION(ixdcgmRunningProcess_v1, 1) - -/** - * Latest version for \ref ixdcgmRunningProcess_t - */ -#define ixdcgmRunningProcess_version ixdcgmRunningProcess_version1 - -typedef struct -{ - unsigned int version; //!< Version number (ixdcgmFieldGroupInfo_version) - unsigned int numFieldIds; //!< Number of entries in fieldIds[] that are valid - ixdcgmFieldGrp_t fieldGroupId; //!< ID of this field group - char fieldGroupName[IXDCGM_MAX_STR_LENGTH]; //!< Field Group Name - unsigned short fieldIds[IXDCGM_MAX_FIELD_IDS_PER_FIELD_GROUP]; //!< Field ids that belong to this group -} ixdcgmFieldGroupInfo_v1; - -typedef ixdcgmFieldGroupInfo_v1 ixdcgmFieldGroupInfo_t; - -/** - * Version 1 for ixdcgmFieldGroupInfo_v1 - */ -#define ixdcgmFieldGroupInfo_version1 MAKE_IXDCGM_VERSION(ixdcgmFieldGroupInfo_v1, 1) - -/** - * Latest version for ixdcgmFieldGroupInfo_t - */ -#define ixdcgmFieldGroupInfo_version ixdcgmFieldGroupInfo_version1 - -typedef struct -{ - unsigned int version; //!< Version number (ixdcgmAllFieldGroupInfo_version) - unsigned int numFieldGroups; //!< Number of entries in fieldGroups[] that are populated - ixdcgmFieldGroupInfo_t fieldGroups[IXDCGM_MAX_NUM_FIELD_GROUPS]; //!< Info about each field group -} ixdcgmAllFieldGroup_v1; - -typedef ixdcgmAllFieldGroup_v1 ixdcgmAllFieldGroup_t; - -/** - * Version 1 for ixdcgmAllFieldGroup_v1 - */ -#define ixdcgmAllFieldGroup_version1 MAKE_IXDCGM_VERSION(ixdcgmAllFieldGroup_v1, 1) - -/** - * Latest version for ixdcgmAllFieldGroup_t - */ -#define ixdcgmAllFieldGroup_version ixdcgmAllFieldGroup_version1 - -/** - * Structure to represent error attributes - */ -typedef struct -{ - unsigned int gpuId; //!< Represents GPU ID - short fieldId; //!< One of 
DCGM_FI_? - int status; //!< One of DCGM_ST_? -} ixdcgmErrorInfo_t; - -typedef struct -{ - int targetLogger; - ixdcgmLogLevel_t targetLogLevel; -} ixdcgmSettingsSetLoggingSeverity_v1; - -#define ixdcgmSettingsSetLoggingSeverity_version1 MAKE_IXDCGM_VERSION(ixdcgmSettingsSetLoggingSeverity_v1, 1) -#define ixdcgmSettingsSetLoggingSeverity_version ixdcgmSettingsSetLoggingSeverity_version1 -typedef ixdcgmSettingsSetLoggingSeverity_v1 ixdcgmSettingsSetLoggingSeverity_t; - -/** - * Systems structure used to enable or disable health watch systems - */ -typedef enum ixdcgmHealthSystems_enum -{ - IXDCGM_HEALTH_WATCH_PCIE = 0x1, //!< PCIe system watches (must have 1m of data before query) - IXDCGM_HEALTH_WATCH_NVLINK = 0x2, //!< NVLINK system watches - IXDCGM_HEALTH_WATCH_PMU = 0x4, //!< Power management unit watches - IXDCGM_HEALTH_WATCH_MCU = 0x8, //!< Micro-controller unit watches - IXDCGM_HEALTH_WATCH_MEM = 0x10, //!< Memory watches - IXDCGM_HEALTH_WATCH_SM = 0x20, //!< Streaming multiprocessor watches - IXDCGM_HEALTH_WATCH_INFOROM = 0x40, //!< Inforom watches - IXDCGM_HEALTH_WATCH_THERMAL = 0x80, //!< Temperature watches (must have 1m of data before query) - IXDCGM_HEALTH_WATCH_POWER = 0x100, //!< Power watches (must have 1m of data before query) - IXDCGM_HEALTH_WATCH_DRIVER = 0x200, //!< Driver-related watches - IXDCGM_HEALTH_WATCH_NVSWITCH_NONFATAL = 0x400, //!< Non-fatal errors in NvSwitch - IXDCGM_HEALTH_WATCH_NVSWITCH_FATAL = 0x800, //!< Fatal errors in NvSwitch - - // ... 
- IXDCGM_HEALTH_WATCH_ALL = 0xFFFFFFFF //!< All watches enabled -} ixdcgmHealthSystems_t; - -#define IXDCGM_HEALTH_WATCH_COUNT_V1 10 /*!< For iterating through the dcgmHealthSystems_v1 enum */ -#define IXDCGM_HEALTH_WATCH_COUNT_V2 12 /*!< For iterating through the dcgmHealthSystems_v2 enum */ - -/** - * Health Watch test results - */ -typedef enum ixdcgmHealthWatchResult_enum -{ - IXDCGM_HEALTH_RESULT_PASS = 0, //!< All results within this system are reporting normal - IXDCGM_HEALTH_RESULT_WARN = 10, //!< A warning has been issued, refer to the response for more information - IXDCGM_HEALTH_RESULT_FAIL = 20, //!< A failure has been issued, refer to the response for more information -} ixdcgmHealthWatchResults_t; - -typedef struct -{ - char msg[1024]; - unsigned int code; -} ixdcgmDiagErrorDetail_t; - -#define IXDCGM_ERR_MSG_LENGTH 512 -#define IXDCGM_HEALTH_WATCH_MAX_INCIDENTS IXDCGM_GROUP_MAX_ENTITIES - -typedef struct -{ - ixdcgmHealthSystems_t system; //!< system to which this information belongs - ixdcgmHealthWatchResults_t health; //!< health diagnosis of this incident - ixdcgmDiagErrorDetail_t error; //!< Information about the error(s) and their error codes - ixdcgmGroupEntityPair_t entityInfo; //!< identify which entity has this error -} ixdcgmIncidentInfo_t; - -/** - * Structure used to set health watches via the dcgmHealthSet_v2 API - */ -typedef struct -{ - unsigned int version; /*!< Version of this struct. Should be ixdcgmHealthSet_version2 */ - ixdcgmGpuGrp_t groupId; /*!< Group ID representing collection of one or more entities. Look - at \ref dcgmGroupCreate for details on creating the group. - Alternatively, pass in the group id as \a IXDCGM_GROUP_ALL_GPUS - to perform operation on all the GPUs or \a IXDCGM_GROUP_ALL_NVSWITCHES - to perform operation on all the NvSwitches. */ - ixdcgmHealthSystems_t systems; /*!< An enum representing systems that should be enabled for health - checks logically OR'd together. 
Refer to \ref ixdcgmHealthSystems_t - for details. */ - long long updateInterval; /*!< How often to query the underlying health information from the - driver in usec. This should be the same as how often you call - ixdcgmHealthCheck */ - double maxKeepAge; /*!< How long to keep data cached for this field in seconds. This should - be at least your maximum time between calling ixdcgmHealthCheck */ -} ixdcgmHealthSetParams_v2; - -/** - * Version 2 for \ref ixdcgmHealthSet_v2 - */ -#define ixdcgmHealthSetParams_version2 MAKE_IXDCGM_VERSION(ixdcgmHealthSetParams_v2, 2) - -typedef struct -{ - unsigned int version; //!< The version number of this struct - ixdcgmHealthWatchResults_t overallHealth; //!< The overall health of this entire host - unsigned int incidentCount; //!< The number of health incidents reported in this struct - ixdcgmIncidentInfo_t incidents[IXDCGM_HEALTH_WATCH_MAX_INCIDENTS]; //!< Report of the errors detected -} ixdcgmHealthResponse_v4; - -#define ixdcgmHealthResponse_version4 MAKE_IXDCGM_VERSION(ixdcgmHealthResponse_v4, 4) -#define ixdcgmHealthResponse_version ixdcgmHealthResponse_version4 -typedef ixdcgmHealthResponse_v4 ixdcgmHealthResponse_t; - -#define IXDCGM_PROF_MAX_NUM_GROUPS_V2 10 - -#define IXDCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2 64 - -typedef struct -{ - unsigned short majorId; //!< Major ID of this metric group. Metric groups with the same majorId cannot be - //!< watched concurrently with other metric groups with the same majorId - unsigned short minorId; //!< Minor ID of this metric group. This distinguishes metric groups within the same - //!< major metric group from each other - unsigned int numFieldIds; //!< Number of field IDs that are populated in fieldIds[] - unsigned short fieldIds[IXDCGM_PROF_MAX_FIELD_IDS_PER_GROUP_V2]; //!< DCGM Field IDs that are part of this - //!< profiling group. See DCGM_FI_PROF_* - //!< definitions in dcgm_fields.h for details. 
-} ixdcgmProfMetricGroupInfo_v2; - -typedef struct -{ - unsigned int version; //!< Version of this request. Should be dcgmProfGetMetricGroups_version - unsigned int unused; //!< Not used for now. Set to 0 - unsigned int gpuId; //!< GPU ID we should get the metric groups for. - - unsigned int numMetricGroups; //!< Number of entries in metricGroups[] that are populated - ixdcgmProfMetricGroupInfo_v2 metricGroups[IXDCGM_PROF_MAX_NUM_GROUPS_V2]; //!< Info for each metric group -} ixdcgmProfGetMetricGroups_v3; - -/** - * Version 3 of dcgmProfGetMetricGroups_t. See dcgm_structs_24.h for v2 - */ -#define ixdcgmProfGetMetricGroups_version3 MAKE_IXDCGM_VERSION(ixdcgmProfGetMetricGroups_v3, 3) -#define ixdcgmProfGetMetricGroups_version ixdcgmProfGetMetricGroups_version3 -typedef ixdcgmProfGetMetricGroups_v3 ixdcgmProfGetMetricGroups_t; +typedef uintptr_t ixdcgmHandle_t; //!< Identifier for ixDCGM Handle -#endif // end of __IXDCGM_STRUCTS_H__ \ No newline at end of file +#endif // end of __IXDCGM_STRUCTS_H__ \ No newline at end of file diff --git a/pkg/ixdcgm/process_info.go b/pkg/ixdcgm/process_info.go index 8fe9f23..3c996e4 100644 --- a/pkg/ixdcgm/process_info.go +++ b/pkg/ixdcgm/process_info.go @@ -19,7 +19,6 @@ package ixdcgm /* #cgo LDFLAGS: -ldl -#include "include/ixdcgmFields.h" #include "include/ixdcgmStructs.h" #include "include/ixdcgmApiExport.h" */ diff --git a/pkg/ixdcgm/types.go b/pkg/ixdcgm/types.go deleted file mode 100644 index d1bb90f..0000000 --- a/pkg/ixdcgm/types.go +++ /dev/null @@ -1,40 +0,0 @@ -/* -Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); you may -not use this file except in compliance with the License. 
You may obtain -a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package ixdcgm - -import "C" -import "unsafe" - -type Short C.ushort - -type FieldValue_v1 struct { - Version uint - FieldId uint - FieldType uint - Status int - Ts int64 - Value [4096]byte -} - -func (fv FieldValue_v1) Int64() int64 { - return *(*int64)(unsafe.Pointer(&fv.Value[0])) -} - -func (fv FieldValue_v1) Float64() float64 { - return *(*float64)(unsafe.Pointer(&fv.Value[0])) -} diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go index 2a19c44..eefe3e8 100644 --- a/pkg/ixdcgm/utils.go +++ b/pkg/ixdcgm/utils.go @@ -35,11 +35,6 @@ import ( "unsafe" ) -const ( - dcgmInt32Blank = 0x7ffffff0 // 2147483632 - dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 -) - func uintPtr(c C.uint) *uint { i := uint(c) return &i @@ -77,8 +72,8 @@ func makeVersion4(struct_type uintptr) C.uint { return version } -func makeVersion5(struct_type uintptr) C.uint { - version := C.uint(struct_type | 5<<24) +func makeVersion10(struct_type uintptr) C.uint { + version := C.uint(struct_type | 10<<24) return version } diff --git a/samples/diag/main.go b/samples/diag/main.go new file mode 100644 index 0000000..f855e7f --- /dev/null +++ b/samples/diag/main.go @@ -0,0 +1,46 @@ +package main + +import ( + "log" + "os" + "text/template" // plain-text report for a terminal; html/template would HTML-escape the values + + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" +) + +const diagOutput = `Software: + {{range $t := .Software}} + {{printf "%-50s" $t.TestName}} {{$t.Status}} {{$t.TestOutput}} + {{- end}} +{{range $g := .PerGpu}} + +GPU: {{$g.GPU}} + {{range $t := $g.DiagResults}} + {{printf "%-20s" $t.TestName}} {{$t.Status}} {{$t.TestOutput}} + {{- end}} +{{- end}} 
+` + +func main() { + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) + if err != nil { + log.Panicln(err) + } + defer cleanup() + + // Choose ixdcgm diag type and input group handle + // - ixdcgm.DiagQuick -> run a very basic health check on the system + // - ixdcgm.DiagMedium -> run a medium-length diagnostic (a few minutes) + // - ixdcgm.DiagLong -> run an extensive diagnostic (several minutes) + // - ixdcgm.DiagExtended -> run a very extensive diagnostic (many minutes) + // Tip: to run diag within a time limit, please use ixdcgm.RunDiagWithTimeout + result, err := ixdcgm.RunDiag(ixdcgm.DiagQuick, ixdcgm.GroupAllGPUs()) + if err != nil { + log.Panicln(err) + } + + t := template.Must(template.New("Diag").Parse(diagOutput)) + if err = t.Execute(os.Stdout, result); err != nil { + log.Panicln("Template error:", err) + } +} -- Gitee