diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..53160138a755ed704e81ecb3b820b0b6aa5ce00a --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +*.out +*.log + +# MacOS +.DS_Store + +# IDE +.idea/ +.vscode/ +*.code-workspace \ No newline at end of file diff --git a/README.md b/README.md index 76bfe268726c56161fbefb7de8e8c765d1c6a8f0..bb8924bedadaf16ff1a682e59e2c4ebd983830be 100644 --- a/README.md +++ b/README.md @@ -2,25 +2,25 @@ ## Introduction -IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. Go-IXDCGM is a wrapper library for IXDCGM written in Go language, providing a simple set of functions that facilitate the easy invocation of IXDCGM's APIs. +IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. +Go-ixdcgm is a wrapper library for ixdcgm written in Go language, providing a simple set of functions that facilitate the easy invocation of ixdcgm's APIs. 
## Install -The installation of Go-IXDCGM is very simple, just execute the following command in the command line: +Installing go-ixdcgm is simple: run the following command on the command line: ```bash go get gitee.com/deep-spark/go-ixdcgm ``` -## Samples +## Sample -An example of go-ixdcgm for device-info is under: +A simple example that uses go-ixdcgm to get device info is shown below: ```go package main import ( - "flag" "fmt" "html/template" "log" @@ -31,7 +31,7 @@ import ( const ( deviceInfo = `Driver Version : {{.Identifiers.DriverVersion}} -GPUId : {{.GPUId}} +GPUId : {{.GPUId}} IxDCGMSupported : {{.IxDCGMSupported}} Uuid : {{.Uuid}} Product Name : {{.Identifiers.ProductName}} @@ -47,18 +47,12 @@ PowerLimit (W) : {{or .PowerLimit "N/A"}} ` ) -var ( - connectAddr = flag.String("connectAddr", "0.0.0.0:5777", "DCGM connect address") - isSocket = flag.String("socket", "0", "Connect to Unix socket") -) - func main() { - // choose ixdcgm hostengine running mode + // Choose ixdcgm hostengine running mode // 1. ixdcgm.Embedded // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" // 3. ixdcgm.StartHostengine - flag.Parse() - cleanup, err := ixdcgm.Init(ixdcgm.Standalone, *connectAddr, *isSocket) + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) if err != nil { panic(err) } @@ -84,6 +78,136 @@ func main() { } ``` +## More Samples + +The `samples` folder contains more simple examples of how to use go-ixdcgm to call the ixdcgm API. 
+ +To get device information, run the following command: +``` +$ go run samples/deviceinfo/main.go + +# sample output + +Driver Version : 4.2.0 +GPUId : 1 +IxDCGMSupported : Y +Uuid : GPU-6d2ec5fa-f293-57a3-9f2c-335f78120578 +Product Name : Iluvatar BI-V150S +Serial Number : 24120026944896 +Bus ID : 00000000:8A:00.0 +BAR1 (MB) : N/A +Total Memory (MB): : 32768 +Used Memory (MB): : 25500 +Free Memory (MB): : 7268 +Bandwidth (MB/s) : 31504 +PowerLimit (W) : 205 +CPUAffinity : 20-39,60-79 +NUMAAffinity : 1 +P2P Available : + GPU0 - (BusID)00000000:8A:00.0 - SYS + GPU2 - (BusID)00000000:8A:00.0 - INTE +-------------------------------------------------- +``` + +To get device status, run the following command: +``` +$ go run samples/devicestatus/main.go + +# sample output + +GPUId : 1 +Power Usage (W) : 150.000 +Temperature (°C) : 68 +FanSpeed (%) : N/A +Utilization.GPU (%) : 85 +Utilization.Mem (%) : 78 +Clocks.Cores (MHz) : 1750 +Clocks.Mem (MHz) : 1600 +EccSdbVolDev : 0 +EccDdbVolDev : 0 +PCI.Tx (MB/s) : 107 +PCI.Rx (MB/s) : 92544 +PCI.ReplayCounter : 0 +Total Memory (MB) : 32768 +Used Memory (MB) : 25500 +Free Memory (MB) : 7268 +SmActive : 0.792 +SmOccupancy : 0.222 +DramActive : 0.622 +------------------------------------------- +``` + +To get running process information of device, run the following command: +``` +$ go run samples/deviceprocessinfo/main.go + +# sample output + +Get the running process infos of gpu 1 +> Pid: 4009629 + Name: ./gemm_perf --i 1 --d 0 --m 1024 --l 2000 + UsedGpuMemory(MiB): 128 +``` + +To monitor device health iteratively, run the following command: +``` +$ go run samples/health/main.go + +# sample output + +GPU : 0 +Status : Healthy + +GPU : 1 +Status : Healthy + +GPU : 2 +Status : Healthy + +... +``` +`Note`: Press Ctrl+C to stop the iteration output. 
+ +To find the topology of GPUs on the system, run the following command: +``` +$ go run samples/topology/main.go + +# sample output + + GPU0 GPU1 GPU2 CPU Affinity NUMA Affinity +GPU0 X SYS SYS 0-19,40-59 0 +GPU1 SYS X INTE 20-39,60-79 1 +GPU2 SYS INTE X 20-39,60-79 1 + +Legend: + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + INTE = Connection traversing at most a single on-board PCIe bridge + IX# = Connection traversing a bonded set of # IXLinks +``` + +To set violation policy and monitor policy violations iteratively, run the following command: +``` +$ go run samples/policy/main.go + +# sample output + +2025/02/25 17:05:22 Policy successfully set. +2025/02/25 17:05:22 Listening for violations... +PolicyViolation : Thermal Limit +Timestamp : 2025-02-25 17:05:42 +0800 CST +Data : {61} +PolicyViolation : Thermal Limit +Timestamp : 2025-02-25 17:05:42 +0800 CST +Data : {61} +... +``` +`Note`: Press Ctrl+C to stop the iteration output. + ## License Copyright (c) 2024 Iluvatar CoreX. All rights reserved. 
This project has an Apache-2.0 license, as diff --git a/go.mod b/go.mod index 6c0344ffb7787a2683ccd7d65aaca0f7c9217508..6522fb6262166d835d4612d64ccdee68863648b5 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,7 @@ module gitee.com/deep-spark/go-ixdcgm go 1.21.6 -require github.com/bits-and-blooms/bitset v1.13.0 +require ( + github.com/bits-and-blooms/bitset v1.13.0 + github.com/creasty/defaults v1.8.0 +) diff --git a/go.sum b/go.sum index b389503f487a31a26b40b2d7d90890d7ba43f961..31326c97cdd36829955a8ea6306ea34a28e23d0d 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE= github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8= +github.com/creasty/defaults v1.8.0 h1:z27FJxCAa0JKt3utc0sCImAEb+spPucmKoOdLHvHYKk= +github.com/creasty/defaults v1.8.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM= diff --git a/pkg/ixdcgm/api.go b/pkg/ixdcgm/api.go index 86d09e2fdb77bc40792b811e85a5e2207628fc40..0a1e2aa85892b6dc06f5566e6f29f043a02e5e03 100644 --- a/pkg/ixdcgm/api.go +++ b/pkg/ixdcgm/api.go @@ -27,6 +27,7 @@ package ixdcgm */ import "C" import ( + "context" "fmt" "sync" "unsafe" @@ -38,16 +39,16 @@ var ( ixdcgmLibHandler unsafe.Pointer ixdcgmInitCounter int mux sync.Mutex - connectionsMode Interface + connection Interface handle DcgmHandle ) -// to do path +// dynamic library path const ( ixdcgmLib = "libixdcgm.so" ) -func initIxDcgm(m int, args ...string) (err error) { +func initIxDcgm(m int) (err error) { lib := string2Char(ixdcgmLib) defer freeCString(lib) @@ -56,7 +57,7 @@ func initIxDcgm(m int, args ...string) (err error) { return fmt.Errorf("failed to load %s", ixdcgmLib) } - connectionsMode, err = New(m) + connection, err = New(m) if err != nil { return err } @@ -70,12 +71,12 @@ func Init(m int, args ...string) (cleanup func(), err error) { return nil, fmt.Errorf("ixdcgm already initialized %d", ixdcgmInitCounter) } if ixdcgmInitCounter 
== 0 { - err = initIxDcgm(m, args...) + err = initIxDcgm(m) if err != nil { return nil, err } - handle, err = connectionsMode.Start(args...) + handle, err = connection.Start(args...) if err != nil { return nil, err } @@ -98,7 +99,7 @@ func shutdown() (err error) { } if ixdcgmInitCounter == 1 { - err = connectionsMode.Shutdown() + err = connection.Shutdown() if err != nil { return err } @@ -113,13 +114,13 @@ func GetAllDeviceCount() (uint, error) { return getAllDeviceCount() } -func GetDeviceInfo(gpuId uint) (DeviceInfo, error) { - return getDeviceInfo(gpuId) +func GetSupportedDevices() ([]uint, error) { + return getSupportedDevices() } // GetDeviceInfo describes the given device -func GetSupportedDevices() ([]uint, error) { - return getSupportedDevices() +func GetDeviceInfo(gpuId uint) (DeviceInfo, error) { + return getDeviceInfo(gpuId) } // GetDeviceStatus monitors GPU status including its power, memory and GPU utilization @@ -137,6 +138,28 @@ func GetDeviceRunningProcesses(gpuId uint) ([]DeviceProcessInfo, error) { return getDeviceRunningProcesses(gpuId) } +// GetDeviceOnSameBoard checks whether the two GPUs are on the same board func GetDeviceOnSameBoard(gpuId1, gpuId2 uint) (bool, error) { return getDeviceOnSameBoard(gpuId1, gpuId2) } + +// HealthCheckByGpuId monitors GPU health for any errors/failures/warnings +func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) { + return healthCheckByGpuId(gpuId) +} + +// GetDeviceTopology returns device topology corresponding to the gpuId +func GetDeviceTopology(gpuId uint) ([]P2PLink, error) { + return getDeviceTopology(gpuId) +} + +// ListenForPolicyViolationsForAllGPUs sets GPU usage and error policies and notifies in case of any violations on all GPUs +func ListenForPolicyViolationsForAllGPUs(ctx context.Context, params *PolicyConditionParams) (<-chan PolicyViolation, error) { + groupId := GroupAllGPUs() + return registerPolicy(ctx, groupId, params) +} + +// ListenForPolicyViolationsForGPUs sets GPU usage and error 
policies and notifies in case of any violations on special GPUs +func ListenForPolicyViolationsForGPUs(ctx context.Context, params *PolicyConditionParams, gpuIds ...uint) (<-chan PolicyViolation, error) { + return registerPolicyForGpus(ctx, params, gpuIds...) +} diff --git a/pkg/ixdcgm/callback.c b/pkg/ixdcgm/callback.c new file mode 100644 index 0000000000000000000000000000000000000000..596c1a0d653cc6601206b105973480a009c6d80d --- /dev/null +++ b/pkg/ixdcgm/callback.c @@ -0,0 +1,11 @@ +int violationNotify(void *p) +{ + int ViolationRegistration(void *); + return ViolationRegistration(p); +} + +int voidCallback(void *p) +{ + int VoidCallback(void *); + return VoidCallback(p); +} \ No newline at end of file diff --git a/pkg/ixdcgm/const.go b/pkg/ixdcgm/const.go index a6eea2e7513c7294c4ae8c71b78994e87a6d66d3..5eafe6b86c7b3cc0afddea5f368505ef511f3761 100644 --- a/pkg/ixdcgm/const.go +++ b/pkg/ixdcgm/const.go @@ -130,14 +130,26 @@ const ( DCGM_FI_DEV_ECC_DBE_VOL_DEV Short = C.DCGM_FI_DEV_ECC_DBE_VOL_DEV DCGM_FI_INTERNAL_FIELDS_0_START Short = C.DCGM_FI_INTERNAL_FIELDS_0_START DCGM_FI_INTERNAL_FIELDS_0_END Short = C.DCGM_FI_INTERNAL_FIELDS_0_END - - DCGM_FI_PROF_GR_ENGINE_ACTIVE Short = C.DCGM_FI_PROF_GR_ENGINE_ACTIVE - DCGM_FI_PROF_SM_ACTIVE Short = C.DCGM_FI_PROF_SM_ACTIVE - DCGM_FI_PROF_SM_OCCUPANCY Short = C.DCGM_FI_PROF_SM_OCCUPANCY - DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = C.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE - DCGM_FI_PROF_DRAM_ACTIVE Short = C.DCGM_FI_PROF_DRAM_ACTIVE - DCGM_FI_PROF_PCIE_TX_BYTES Short = C.DCGM_FI_PROF_PCIE_TX_BYTES - DCGM_FI_PROF_PCIE_RX_BYTES Short = C.DCGM_FI_PROF_PCIE_RX_BYTES + DCGM_FI_PROF_SM_ACTIVE Short = C.DCGM_FI_PROF_SM_ACTIVE + DCGM_FI_PROF_SM_OCCUPANCY Short = C.DCGM_FI_PROF_SM_OCCUPANCY + DCGM_FI_PROF_DRAM_ACTIVE Short = C.DCGM_FI_PROF_DRAM_ACTIVE DCGM_FI_MAX_FIELDS Short = C.DCGM_FI_MAX_FIELDS ) + +type HealthSystem uint + +const ( + DCGM_HEALTH_WATCH_PCIE HealthSystem = 0x1 + DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80 + 
DCGM_HEALTH_WATCH_POWER HealthSystem = 0x100 + DCGM_HEALTH_WATCH_ALL HealthSystem = 0xFFFFFFFF // All watches only support PCIE watch now +) + +type HealthResult uint + +const ( + DCGM_HEALTH_RESULT_PASS HealthResult = 0 // All results within this system are reporting normal + DCGM_HEALTH_RESULT_WARN HealthResult = 10 // A warning has been issued, refer to the response for more information + DCGM_HEALTH_RESULT_FAIL HealthResult = 20 // A failure has been issued, refer to the response for more information +) diff --git a/pkg/ixdcgm/device_common.go b/pkg/ixdcgm/device_common.go index 66861a3223d6b81ef68732120116d3d55a9a12cd..cf9d561441a59cf7033408c29252d774417c3158 100644 --- a/pkg/ixdcgm/device_common.go +++ b/pkg/ixdcgm/device_common.go @@ -24,13 +24,11 @@ package ixdcgm #include "include/ixdcgmApiExport.h" */ import "C" -import "fmt" func getDeviceOnSameBoard(gpuId1, gpuId2 uint) (isOnSameBoard bool, err error) { var onSameBoard C.int - r := C.ixdcgmDeviceOnSameBoard(C.ulong(handle.handle), C.uint(gpuId1), C.uint(gpuId2), &onSameBoard) - fmt.Println(r) - if err = ixdcgmErrorString(r); err != nil { + ret := C.ixdcgmDeviceOnSameBoard(C.ulong(handle.handle), C.uint(gpuId1), C.uint(gpuId2), &onSameBoard) + if err = ixdcgmErrorString(ret); err != nil { return false, err } if onSameBoard == 0 { diff --git a/pkg/ixdcgm/device_info.go b/pkg/ixdcgm/device_info.go index 4bde0ac34d7abc92755b42d39e179dd5d393b1bc..c8e16d42c9b1a81b73a31ebfa92b09df6fc83fa2 100644 --- a/pkg/ixdcgm/device_info.go +++ b/pkg/ixdcgm/device_info.go @@ -54,10 +54,13 @@ type DeviceInfo struct { GPUId uint IxDCGMSupported string Uuid string - PowerLimit uint + PowerLimit uint // W PCI PciInfo MemoryUsage MemoryUsageInfo Identifiers DeviceIdentifier + Topology []P2PLink + CPUAffinity string + NUMAAffinity string } func getAllDeviceCount() (gpuCount uint, err error) { @@ -131,7 +134,7 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) { return DeviceInfo{}, err } - // check if the given GPU is DCGM 
supported + // check if the given GPU is IxDCGM supported gpus, err := getSupportedDevices() if err != nil { return DeviceInfo{}, err @@ -144,17 +147,27 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) { break } } + + cpuAffinity, err := getCPUAffinity(gpuId) + if err != nil { + return DeviceInfo{}, err + } + numaAffinity, err := getNUMAAffinity(gpuId) + if err != nil { + return DeviceInfo{}, err + } + + var topology []P2PLink var bandwidth int64 if supported == "Y" { + topology, err = getDeviceTopology(gpuId) + if err != nil { + return DeviceInfo{}, err + } bandwidth, err = getPciBandwidth(gpuId) if err != nil { return DeviceInfo{}, err } - - // err = getDeviceTopology(gpuId) - // if err != nil { - // return DeviceInfo{}, err - // } } uuid := cChar2String(&dcgmAttr.identifiers.uuid[0]) @@ -186,6 +199,9 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) { PCI: pci, MemoryUsage: memInfo, Identifiers: id, + Topology: topology, + CPUAffinity: cpuAffinity, + NUMAAffinity: numaAffinity, }, nil } @@ -204,9 +220,15 @@ func getSupportedDevices() (gpus []uint, err error) { } return } - func getCPUAffinity(gpuId uint) (string, error) { + return getAffinity(gpuId, "CPU") +} +func getNUMAAffinity(gpuId uint) (string, error) { + return getAffinity(gpuId, "MEM") +} + +func getAffinity(gpuId uint, typ string) (string, error) { const ( affinity0 int = iota affinity1 @@ -216,19 +238,29 @@ func getCPUAffinity(gpuId uint) (string, error) { ) affFields := make([]Short, fieldsCount) - affFields[affinity0] = DCGM_FI_DEV_CPU_AFFINITY_0 - affFields[affinity1] = DCGM_FI_DEV_CPU_AFFINITY_1 - affFields[affinity2] = DCGM_FI_DEV_CPU_AFFINITY_2 - affFields[affinity3] = DCGM_FI_DEV_CPU_AFFINITY_3 + switch typ { + case "CPU": + affFields[affinity0] = DCGM_FI_DEV_CPU_AFFINITY_0 + affFields[affinity1] = DCGM_FI_DEV_CPU_AFFINITY_1 + affFields[affinity2] = DCGM_FI_DEV_CPU_AFFINITY_2 + affFields[affinity3] = DCGM_FI_DEV_CPU_AFFINITY_3 + case "MEM": + affFields[affinity0] = 
DCGM_FI_DEV_MEM_AFFINITY_0 + affFields[affinity1] = DCGM_FI_DEV_MEM_AFFINITY_1 + affFields[affinity2] = DCGM_FI_DEV_MEM_AFFINITY_2 + affFields[affinity3] = DCGM_FI_DEV_MEM_AFFINITY_3 + default: + return "N/A", fmt.Errorf("not supported affinity type: %s", typ) + } - fieldGrpName := fmt.Sprintf("cpuAffFields%d", gpuId) + fieldGrpName := fmt.Sprintf("affFields%d", gpuId) fieldGrpHdl, err := FieldGroupCreate(fieldGrpName, affFields) if err != nil { return "N/A", err } defer FieldGroupDestroy(fieldGrpHdl) - gpuGrpName := fmt.Sprintf("cpuAff%d", gpuId) + gpuGrpName := fmt.Sprintf("aff%d", gpuId) gpuGrpHdl, err := WatchFields([]uint{gpuId}, fieldGrpHdl, gpuGrpName) if err != nil { return "N/A", err @@ -247,6 +279,6 @@ func getCPUAffinity(gpuId uint) (string, error) { bits[3] = uint64(values[affinity3].Int64()) b := bitset.From(bits) - - return b.String(), nil + str := convertBitsetStr(b.String()) + return str, nil } diff --git a/pkg/ixdcgm/embedded.go b/pkg/ixdcgm/embedded.go index 9829373874be7130a5e74e0ebdb7e1b68b75dc19..09c2a29139ca997feaaaf1ca87797491d8c769c3 100644 --- a/pkg/ixdcgm/embedded.go +++ b/pkg/ixdcgm/embedded.go @@ -46,6 +46,8 @@ func (e *embedded) Shutdown() error { } func (e *embedded) Start(args ...string) (DcgmHandle, error) { + fmt.Println("Start ixdcgm based on Embedded mode.") + result := C.dcgmInit() if err := errorString(result); err != nil { return DcgmHandle{}, fmt.Errorf("failed to initialize dcgm: %v", err) diff --git a/pkg/ixdcgm/fields.go b/pkg/ixdcgm/fields.go index dec961823d87b7a29e3028a82a03c8b5dac2da3e..e52404e0af2acb1ad84ea9b98f40a0588dec1e24 100644 --- a/pkg/ixdcgm/fields.go +++ b/pkg/ixdcgm/fields.go @@ -72,7 +72,7 @@ func WatchFields(gpuIds []uint, fieldGrp FieldGrpHandle, groupName string) (Grou return GroupHandle{}, err } for _, gpuId := range gpuIds { - err = AddDevice(group, gpuId) + err = AddToGroup(group, gpuId) if err != nil { return GroupHandle{}, err } @@ -151,3 +151,45 @@ func GetFieldValueStr(fv FieldValue_v1, typ string) 
string { return "N/A" } } + +type Field_Entity_Group uint + +const ( + FE_NONE Field_Entity_Group = iota + FE_GPU + FE_VGPU + FE_SWITCH + FE_GPU_I + FE_GPU_CI + FE_LINK + FE_CPU + FE_CPU_CORE + FE_COUNT +) + +type GroupEntityPair struct { + EntityGroupId Field_Entity_Group + EntityId uint +} + +func (e Field_Entity_Group) String() string { + switch e { + case FE_GPU: + return "GPU" + case FE_VGPU: + return "vGPU" + case FE_SWITCH: + return "Switch" + case FE_GPU_I: + return "GPU Instance" + case FE_GPU_CI: + return "GPU Compute Instance" + case FE_LINK: + return "Link" + case FE_CPU: + return "CPU" + case FE_CPU_CORE: + return "CPU Core" + } + return "unknown" +} diff --git a/pkg/ixdcgm/gpu_group.go b/pkg/ixdcgm/gpu_group.go index 3b00ed1dd01f78f9050165f89e4eae6f92871569..7035594a856118d2e66cbd1a113828a30b539a89 100644 --- a/pkg/ixdcgm/gpu_group.go +++ b/pkg/ixdcgm/gpu_group.go @@ -27,6 +27,18 @@ type GroupHandle struct { handle C.dcgmGpuGrp_t } +func (g *GroupHandle) SetHandle(val uintptr) { + g.handle = C.dcgmGpuGrp_t(val) +} + +func (g *GroupHandle) GetHandle() uintptr { + return uintptr(g.handle) +} + +func GroupAllGPUs() GroupHandle { + return GroupHandle{C.DCGM_GROUP_ALL_GPUS} +} + func CreateGroup(groupName string) (GroupHandle, error) { var cGroupId C.dcgmGpuGrp_t cgn := string2Char(groupName) @@ -40,7 +52,7 @@ func CreateGroup(groupName string) (GroupHandle, error) { return GroupHandle{cGroupId}, nil } -func AddDevice(groupId GroupHandle, gpuId uint) error { +func AddToGroup(groupId GroupHandle, gpuId uint) error { res := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId)) if err := errorString(res); err != nil { return err @@ -55,3 +67,34 @@ func DestroyGroup(groupId GroupHandle) error { } return nil } + +type GroupInfo struct { + Version uint32 + GroupName string + EntityList []GroupEntityPair +} + +func GetGroupInfo(groupId GroupHandle) (*GroupInfo, error) { + response := C.dcgmGroupInfo_v2{ + version: C.dcgmGroupInfo_version2, + } + + 
result := C.dcgmGroupGetInfo(handle.handle, groupId.handle, &response) + if err := errorString(result); err != nil { + return nil, err + } + + ret := &GroupInfo{ + Version: uint32(response.version), + GroupName: C.GoString(&response.groupName[0]), + } + + for i := 0; i < int(response.count); i++ { + ret.EntityList = append(ret.EntityList, GroupEntityPair{ + EntityId: uint(response.entityList[i].entityId), + EntityGroupId: Field_Entity_Group(response.entityList[i].entityGroupId), + }) + } + + return ret, nil +} diff --git a/pkg/ixdcgm/health.go b/pkg/ixdcgm/health.go new file mode 100644 index 0000000000000000000000000000000000000000..573d6b10d531982e5931d1248621919b7c7731e2 --- /dev/null +++ b/pkg/ixdcgm/health.go @@ -0,0 +1,217 @@ +/* +Copyright (c) 2024, NVIDIA CORPORATION. +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package ixdcgm + +/* +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" + +#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U)) +#define dcgmHealthSetParams_version2 MAKE_DCGM_VERSION(dcgmHealthSetParams_v2, 2) +*/ +import "C" + +import ( + "fmt" + "math/rand" + "unsafe" +) + +type SystemWatch struct { + Type string + Status string + Error string +} + +type DeviceHealth struct { + GPU uint + Status string + Watches []SystemWatch +} + +// HealthSet enables the DCGM health check system for the given systems +func HealthSet(groupId GroupHandle, systems HealthSystem) (err error) { + params_v2 := C.dcgmHealthSetParams_v2{ + version: C.dcgmHealthSetParams_version2, + groupId: groupId.handle, + systems: C.dcgmHealthSystems_t(systems), + updateInterval: C.longlong(int64(100000)), // How often to query the underlying health information from the driver in usecs. + maxKeepAge: C.double(float64(1)), // How long to keep data cached for this field in seconds. 
+ } + + result := C.dcgmHealthSet_v2(handle.handle, &params_v2) + if err = errorString(result); err != nil { + return fmt.Errorf("error setting health watches: %w", err) + } + return +} + +// HealthGet retrieves the current state of the DCGM health check system +func HealthGet(groupId GroupHandle) (HealthSystem, error) { + var systems C.dcgmHealthSystems_t + + result := C.dcgmHealthGet(handle.handle, groupId.handle, (*C.dcgmHealthSystems_t)(unsafe.Pointer(&systems))) + if err := errorString(result); err != nil { + return HealthSystem(0), err + } + return HealthSystem(systems), nil +} + +type DiagErrorDetail struct { + Message string + Code uint // Error code, see include/dcgm_errors.h for more info +} + +type Incident struct { + System HealthSystem + Health HealthResult + Error DiagErrorDetail + EntityInfo GroupEntityPair +} + +type HealthResponse struct { + OverallHealth HealthResult + Incidents []Incident +} + +// HealthCheck checks the configured watches for any errors/failures/warnings that have occurred +// since the last time this check was invoked. On the first call, stateful information +// about all of the enabled watches within a group is created but no error results are +// provided. On subsequent calls, any error information will be returned. 
+func HealthCheck(groupId GroupHandle) (HealthResponse, error) { + var healthResults C.dcgmHealthResponse_v4 + healthResults.version = makeVersion4(unsafe.Sizeof(healthResults)) + + result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults))) + + if err := errorString(result); err != nil { + return HealthResponse{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + + response := HealthResponse{ + OverallHealth: HealthResult(healthResults.overallHealth), + } + + // number of watches that encountered an error/warning + incidents := uint(healthResults.incidentCount) + + response.Incidents = make([]Incident, incidents) + + for i := uint(0); i < incidents; i++ { + response.Incidents[i] = Incident{ + System: HealthSystem(healthResults.incidents[i].system), + Health: HealthResult(healthResults.incidents[i].health), + Error: DiagErrorDetail{ + Message: *stringPtr(&healthResults.incidents[i].error.msg[0]), + Code: uint(healthResults.incidents[i].error.code), + }, + EntityInfo: GroupEntityPair{ + EntityGroupId: Field_Entity_Group(healthResults.incidents[i].entityInfo.entityGroupId), + EntityId: uint(healthResults.incidents[i].entityInfo.entityId), + }, + } + } + + return response, nil +} + +func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) { + name := fmt.Sprintf("health%d", rand.Uint64()) + groupId, err := CreateGroup(name) + if err != nil { + return + } + + err = AddToGroup(groupId, gpuId) + if err != nil { + return + } + + err = HealthSet(groupId, DCGM_HEALTH_WATCH_ALL) + if err != nil { + return + } + + result, err := HealthCheck(groupId) + if err != nil { + return + } + + status := healthStatus(result.OverallHealth) + watches := []SystemWatch{} + + // number of watches that encountered an error/warning + incidents := len(result.Incidents) + + for j := 0; j < incidents; j++ { + watch := SystemWatch{ + Type: systemWatch(result.Incidents[j].System), + Status: 
healthStatus(result.Incidents[j].Health), + + Error: result.Incidents[j].Error.Message, + } + watches = append(watches, watch) + } + + deviceHealth = DeviceHealth{ + GPU: gpuId, + Status: status, + Watches: watches, + } + _ = DestroyGroup(groupId) + return +} + +func healthStatus(status HealthResult) string { + switch status { + case 0: + return "Healthy" + case 10: + return "Warning" + case 20: + return "Failure" + } + return "N/A" +} + +func systemWatch(watch HealthSystem) string { + switch watch { + case 1: + return "PCIe watches" + case 2: + return "LINK watches" + case 4: + return "Power Managemnt unit watches" + case 8: + return "Microcontroller unit watches" + case 16: + return "Memory watches" + case 32: + return "Streaming Multiprocessor watches" + case 64: + return "Inforom watches" + case 128: + return "Temperature watches" + case 256: + return "Power watches" + case 512: + return "Driver-related watches" + } + return "N/A" +} diff --git a/pkg/ixdcgm/include/dcgm_errors.h b/pkg/ixdcgm/include/dcgm_errors.h new file mode 100644 index 0000000000000000000000000000000000000000..efd80473348c07e18491fae4af6fea424c0a320b --- /dev/null +++ b/pkg/ixdcgm/include/dcgm_errors.h @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef DCGM_ERRORS_H +#define DCGM_ERRORS_H + +#include "dcgm_api_export.h" +#include "dcgm_structs.h" + +/***************************************************************************************************/ +/** @defgroup dcgmErrorEnums Error Codes + * @{ + */ +/***************************************************************************************************/ +/* + * Error codes for passive and active health checks. + * New error codes must be added to end of enum to maintain backwards compatibility. + */ +typedef enum dcgmError_enum +{ + DCGM_FR_OK = 0, //!< 0 No error + DCGM_FR_UNKNOWN = 1, //!< 1 Unknown error code + DCGM_FR_UNRECOGNIZED = 2, //!< 2 Unrecognized error code + DCGM_FR_PCI_REPLAY_RATE = 3, //!< 3 Unacceptable rate of PCI errors + DCGM_FR_VOLATILE_DBE_DETECTED = 4, //!< 4 Uncorrectable volatile double bit error + DCGM_FR_VOLATILE_SBE_DETECTED = 5, //!< 5 Unacceptable rate of volatile single bit errors + DCGM_FR_PENDING_PAGE_RETIREMENTS = 6, //!< 6 Pending page retirements detected + DCGM_FR_RETIRED_PAGES_LIMIT = 7, //!< 7 Unacceptable total page retirements detected + DCGM_FR_RETIRED_PAGES_DBE_LIMIT = 8, //!< 8 Unacceptable total page retirements due to uncorrectable errors + DCGM_FR_CORRUPT_INFOROM = 9, //!< 9 Corrupt inforom found + DCGM_FR_CLOCK_THROTTLE_THERMAL = 10, //!< 10 Clocks being throttled due to overheating + DCGM_FR_POWER_UNREADABLE = 11, //!< 11 Cannot get a reading for power from NVML + DCGM_FR_CLOCK_THROTTLE_POWER = 12, //!< 12 Clock being throttled due to power restrictions + DCGM_FR_NVLINK_ERROR_THRESHOLD = 13, //!< 13 Unacceptable rate of NVLink errors + DCGM_FR_NVLINK_DOWN = 14, //!< 14 NVLink is down + DCGM_FR_NVSWITCH_FATAL_ERROR = 15, //!< 15 Fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_NON_FATAL_ERROR = 16, //!< 16 Non-fatal errors on the NVSwitch + DCGM_FR_NVSWITCH_DOWN = 17, //!< 17 NVSwitch is down - NOT USED: DEPRECATED + DCGM_FR_NO_ACCESS_TO_FILE = 18, //!< 18 Cannot access a file + DCGM_FR_NVML_API = 19, 
//!< 19 Error occurred on an NVML API - NOT USED: DEPRECATED + DCGM_FR_DEVICE_COUNT_MISMATCH = 20, //!< 20 Disagreement in GPU count between /dev and NVML + DCGM_FR_BAD_PARAMETER = 21, //!< 21 Bad parameter passed to API + DCGM_FR_CANNOT_OPEN_LIB = 22, //!< 22 Cannot open a library that must be accessed + DCGM_FR_DENYLISTED_DRIVER = 23, //!< 23 A driver on the denylist (nouveau) is active + DCGM_FR_NVML_LIB_BAD = 24, //!< 24 NVML library is missing expected functions - NOT USED: DEPRECATED + DCGM_FR_GRAPHICS_PROCESSES = 25, //!< 25 Graphics processes are active on this GPU + DCGM_FR_HOSTENGINE_CONN = 26, //!< 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED + DCGM_FR_FIELD_QUERY = 27, //!< 27 Error querying a field from DCGM + DCGM_FR_BAD_CUDA_ENV = 28, //!< 28 The environment has variables that hurt CUDA + DCGM_FR_PERSISTENCE_MODE = 29, //!< 29 Persistence mode is disabled + DCGM_FR_LOW_BANDWIDTH = 30, //!< 30 The bandwidth is unacceptably low + DCGM_FR_HIGH_LATENCY = 31, //!< 31 Latency is too high + DCGM_FR_CANNOT_GET_FIELD_TAG = 32, //!< 32 Cannot find a tag for a field + DCGM_FR_FIELD_VIOLATION = 33, //!< 33 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD = 34, //!< 34 The value for the specified field is above the threshold + DCGM_FR_FIELD_VIOLATION_DBL = 35, //!< 35 The value for the specified error field is above 0 + DCGM_FR_FIELD_THRESHOLD_DBL = 36, //!< 36 The value for the specified field is above the threshold + DCGM_FR_UNSUPPORTED_FIELD_TYPE = 37, //!< 37 Field type cannot be supported + DCGM_FR_FIELD_THRESHOLD_TS = 38, //!< 38 The value for the specified field is above the threshold + DCGM_FR_FIELD_THRESHOLD_TS_DBL = 39, //!< 39 The value for the specified field is above the threshold + DCGM_FR_THERMAL_VIOLATIONS = 40, //!< 40 Thermal violations detected + DCGM_FR_THERMAL_VIOLATIONS_TS = 41, //!< 41 Thermal violations detected with a timestamp + DCGM_FR_TEMP_VIOLATION = 42, //!< 42 Temperature is too high + 
DCGM_FR_THROTTLING_VIOLATION = 43, //!< 43 Non-benign clock throttling is occurring + DCGM_FR_INTERNAL = 44, //!< 44 An internal error was detected + DCGM_FR_PCIE_GENERATION = 45, //!< 45 PCIe generation is too low + DCGM_FR_PCIE_WIDTH = 46, //!< 46 PCIe width is too low + DCGM_FR_ABORTED = 47, //!< 47 Test was aborted by a user signal + DCGM_FR_TEST_DISABLED = 48, //!< 48 This test is disabled for this GPU + DCGM_FR_CANNOT_GET_STAT = 49, //!< 49 Cannot get telemetry for a needed value + DCGM_FR_STRESS_LEVEL = 50, //!< 50 Stress level is too low (bad performance) + DCGM_FR_CUDA_API = 51, //!< 51 Error calling the specified CUDA API + DCGM_FR_FAULTY_MEMORY = 52, //!< 52 Faulty memory detected on this GPU + DCGM_FR_CANNOT_SET_WATCHES = 53, //!< 53 Unable to set field watches in DCGM - NOT USED: DEPRECATED + DCGM_FR_CUDA_UNBOUND = 54, //!< 54 CUDA context is no longer bound + DCGM_FR_ECC_DISABLED = 55, //!< 55 ECC memory is disabled right now + DCGM_FR_MEMORY_ALLOC = 56, //!< 56 Cannot allocate memory on the GPU + DCGM_FR_CUDA_DBE = 57, //!< 57 CUDA detected unrecovable double-bit error + DCGM_FR_MEMORY_MISMATCH = 58, //!< 58 Memory error detected + DCGM_FR_CUDA_DEVICE = 59, //!< 59 No CUDA device discoverable for existing GPU + DCGM_FR_ECC_UNSUPPORTED = 60, //!< 60 ECC memory is unsupported by this SKU + DCGM_FR_ECC_PENDING = 61, //!< 61 ECC memory is in a pending state - NOT USED: DEPRECATED + DCGM_FR_MEMORY_BANDWIDTH = 62, //!< 62 Memory bandwidth is too low + DCGM_FR_TARGET_POWER = 63, //!< 63 Cannot hit the target power draw + DCGM_FR_API_FAIL = 64, //!< 64 The specified API call failed + DCGM_FR_API_FAIL_GPU = 65, //!< 65 The specified API call failed for the specified GPU + DCGM_FR_CUDA_CONTEXT = 66, //!< 66 Cannot create a CUDA context on this GPU + DCGM_FR_DCGM_API = 67, //!< 67 DCGM API failure + DCGM_FR_CONCURRENT_GPUS = 68, //!< 68 Need multiple GPUs to run this test + DCGM_FR_TOO_MANY_ERRORS = 69, //!< 69 More errors than fit in the return struct - NOT 
USED: DEPRECATED + DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70, //!< 70 More than 100 CRC errors are happening per second + DCGM_FR_NVLINK_ERROR_CRITICAL = 71, //!< 71 NVLink error for a field that should always be 0 + DCGM_FR_ENFORCED_POWER_LIMIT = 72, //!< 72 The enforced power limit is too low to hit the target + DCGM_FR_MEMORY_ALLOC_HOST = 73, //!< 73 Cannot allocate memory on the host + DCGM_FR_GPU_OP_MODE = 74, //!< 74 Bad GPU operating mode for running plugin - NOT USED: DEPRECATED + DCGM_FR_NO_MEMORY_CLOCKS = 75, //!< 75 No memory clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_NO_GRAPHICS_CLOCKS = 76, //!< 76 No graphics clocks with the needed MHz found - NOT USED: DEPRECATED + DCGM_FR_HAD_TO_RESTORE_STATE = 77, //!< 77 Note that we had to restore a GPU's state + DCGM_FR_L1TAG_UNSUPPORTED = 78, //!< 78 L1TAG test is unsupported by this SKU + DCGM_FR_L1TAG_MISCOMPARE = 79, //!< 79 L1TAG test failed on a miscompare + DCGM_FR_ROW_REMAP_FAILURE = 80, //!< 80 Row remapping failed (Ampere or newer GPUs) + DCGM_FR_UNCONTAINED_ERROR = 81, //!< 81 Uncontained error - XID 95 + DCGM_FR_EMPTY_GPU_LIST = 82, //!< 82 No GPU information given to plugin + DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83, //!< 83 Pending page retirements due to a DBE + DCGM_FR_UNCORRECTABLE_ROW_REMAP = 84, //!< 84 Uncorrectable row remapping + DCGM_FR_PENDING_ROW_REMAP = 85, //!< 85 Row remapping is pending + DCGM_FR_BROKEN_P2P_MEMORY_DEVICE = 86, //!< 86 P2P copy test detected an error writing to this GPU + DCGM_FR_BROKEN_P2P_WRITER_DEVICE = 87, //!< 87 P2P copy test detected an error writing from this GPU + DCGM_FR_NVSWITCH_NVLINK_DOWN = 88, //!< 88 An NvLink is down for the specified NVSwitch - NOT USED: DEPRECATED + DCGM_FR_EUD_BINARY_PERMISSIONS = 89, //!< 89 EUD binary permissions are incorrect + DCGM_FR_EUD_NON_ROOT_USER = 90, //!< 90 EUD plugin is not running as root + DCGM_FR_EUD_SPAWN_FAILURE = 91, //!< 91 EUD plugin failed to spawn the EUD binary + DCGM_FR_EUD_TIMEOUT = 
92, //!< 92 EUD plugin timed out + DCGM_FR_EUD_ZOMBIE = 93, //!< 93 EUD process remains running after the plugin considers it finished + DCGM_FR_EUD_NON_ZERO_EXIT_CODE = 94, //!< 94 EUD process exited with a non-zero exit code + DCGM_FR_EUD_TEST_FAILED = 95, //!< 95 EUD test failed + DCGM_FR_FILE_CREATE_PERMISSIONS = 96, //!< 96 We cannot create a file in this directory. + DCGM_FR_PAUSE_RESUME_FAILED = 97, //!< 97 Pause/Resume failed + DCGM_FR_PCIE_H_REPLAY_VIOLATION = 98, //!< 98 PCIe test caught correctable errors + DCGM_FR_GPU_EXPECTED_NVLINKS_UP = 99, //!< 99 Expected nvlinks up per gpu + DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP = 100, //!< 100 Expected nvlinks up per nvswitch + DCGM_FR_XID_ERROR = 101, //!< 101 XID error detected + DCGM_FR_SBE_VIOLATION = 102, //!< 102 Single bit error detected + DCGM_FR_DBE_VIOLATION = 103, //!< 103 Double bit error detected + DCGM_FR_PCIE_REPLAY_VIOLATION = 104, //!< 104 PCIe replay errors detected + DCGM_FR_SBE_THRESHOLD_VIOLATION = 105, //!< 105 SBE threshold violated + DCGM_FR_DBE_THRESHOLD_VIOLATION = 106, //!< 106 DBE threshold violated + DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION = 107, //!< 107 PCIE replay count violated + DCGM_FR_CUDA_FM_NOT_INITIALIZED = 108, //!< 108 The fabricmanager is not initialized + DCGM_FR_SXID_ERROR = 109, //!< 109 NvSwitch fatal error detected + DCGM_FR_ERROR_SENTINEL = 110, //!< 110 MUST BE THE LAST ERROR CODE +} dcgmError_t; + +typedef enum dcgmErrorSeverity_enum +{ + DCGM_ERROR_NONE = 0, //!< 0 NONE + DCGM_ERROR_MONITOR = 1, //!< 1 Can perform workload, but needs to be monitored. + DCGM_ERROR_ISOLATE = 2, //!< 2 Cannot perform workload. GPU should be isolated. 
+ DCGM_ERROR_UNKNOWN = 3, //!< 3 This error code is not recognized + DCGM_ERROR_TRIAGE = 4, //!< 4 This error should be triaged + DCGM_ERROR_CONFIG = 5, //!< 5 This error can be configured + DCGM_ERROR_RESET = 6, //!< 6 Drain and reset GPU +} dcgmErrorSeverity_t; + +typedef enum dcgmErrorCategory_enum +{ + DCGM_FR_EC_NONE = 0, //!< 0 NONE + DCGM_FR_EC_PERF_THRESHOLD = 1, //!< 1 Performance Threshold + DCGM_FR_EC_PERF_VIOLATION = 2, //!< 2 Performance Violation + DCGM_FR_EC_SOFTWARE_CONFIG = 3, //!< 3 Software Configuration + DCGM_FR_EC_SOFTWARE_LIBRARY = 4, //!< 4 Software Library + DCGM_FR_EC_SOFTWARE_XID = 5, //!< 5 Software XID + DCGM_FR_EC_SOFTWARE_CUDA = 6, //!< 6 Software Cuda + DCGM_FR_EC_SOFTWARE_EUD = 7, //!< 7 Software EUD + DCGM_FR_EC_SOFTWARE_OTHER = 8, //!< 8 Software Other + DCGM_FR_EC_HARDWARE_THERMAL = 9, //!< 9 Hardware Thermal + DCGM_FR_EC_HARDWARE_MEMORY = 10, //!< 10 Hardware Memory + DCGM_FR_EC_HARDWARE_NVLINK = 11, //!< 11 Hardware NvLink + DCGM_FR_EC_HARDWARE_NVSWITCH = 12, //!< 12 Hardware NvSwitch + DCGM_FR_EC_HARDWARE_PCIE = 13, //!< 13 Hardware PCIe + DCGM_FR_EC_HARDWARE_POWER = 14, //!< 14 Hardware Power + DCGM_FR_EC_HARDWARE_OTHER = 15, //!< 15 Hardware Other + DCGM_FR_EC_INTERNAL_OTHER = 16, //!< 16 Internal Other +} dcgmErrorCategory_t; + +typedef struct +{ + dcgmError_t errorId; + const char* msgFormat; + const char* suggestion; + int severity; + int category; +} dcgm_error_meta_t; + +extern dcgm_error_meta_t dcgmErrorMeta[]; + +/* Standard message for running a field diagnostic */ +#define TRIAGE_RUN_FIELD_DIAG_MSG "Run a field diagnostic on the GPU." +#define DEBUG_COOLING_MSG \ + "Verify that the cooling on this machine is functional, including external, " \ + "thermal material interface, fans, and any other components." +#define BUG_REPORT_MSG "Please capture an nvidia-bug-report and send it to NVIDIA." +#define SYSTEM_TRIAGE_MSG "Check DCGM and system logs for errors. Reset GPU. Restart DCGM. Rerun diagnostics." 
+#define CONFIG_MSG "Check DCGM and system configuration. This error may be eliminated with an updated configuration." + +/* + * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG format + * where is the actual message. + */ +#define DCGM_FR_OK_MSG "The operation completed successfully." +#define DCGM_FR_UNKNOWN_MSG "Unknown error." +#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code." +// replay limit, gpu id, replay errors detected +#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d" +// dbes deteced, gpu id +#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u." +// sbe limit, gpu id, sbes detected +#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld" +// gpu id +#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u." +// retired pages detected, gpud id +#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. " +// retired pages due to dbes detected, gpu id +#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG \ + "An excess of %u retired pages due to DBEs have been detected and" \ + " more than one page has been retired due to DBEs in the past" \ + " week in GPU %u." +// gpu id +#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u." +// gpu id +#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u." +// gpu id +#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u." +// gpu id +#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u." 
+// nvlink errors detected, nvlink id, error threshold +#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG \ + "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds " \ + "threshold of %u" +// gpu id, nvlink id +#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down" +// nvlinks up, expected nvlinks up +#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_MSG "Only %u NvLinks are up out of the expected %u" +// switch id, nvlinks up, expected nvlinks up +#define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_MSG "NvSwitch %u - Only %u NvLinks are up out of the expected %u" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u" +// nvswitch id, nvlink port +#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down." +// file path, error detail +#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s" +// purpose for communicating with NVML, NVML error as string, NVML error +#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s" +#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG \ + "The number of devices NVML returns is different than the number " \ + "of devices in /dev." +// function name +#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed" +// library name, error returned from dlopen +#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'" +// the name of the denylisted driver +#define DCGM_FR_DENYLISTED_DRIVER_MSG "Found driver on the denylist: %s" +// the name of the function that wasn't found +#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so" +#define DCGM_FR_GRAPHICS_PROCESSES_MSG \ + "NVVS has detected processes with graphics contexts open running on at least one " \ + "GPU. This may cause some tests to fail." 
+// error message from the API call +#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'" +// field name, gpu id +#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u" +// environment variable name +#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'." +// gpu id +#define DCGM_FR_PERSISTENCE_MODE_MSG "Persistence mode for GPU %u is disabled." +// gpu id, direction (d2h, e.g.), measured bandwidth, expected bandwidth +#define DCGM_FR_LOW_BANDWIDTH_MSG \ + "Bandwidth of GPU %u in direction %s of %.2f did not exceed " \ + "minimum required bandwidth of %.2f." +// gpu id, direction (d2h, e.g.), measured latency, expected latency +#define DCGM_FR_HIGH_LATENCY_MSG \ + "Latency type %s of GPU %u value %.2f exceeded maximum allowed " \ + "latency of %.2f." +// field id +#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu" +// field value, field name, gpu id (this message is for fields that should always have a 0 value) +#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u" +// field value, field name, gpu id, allowable threshold +#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +// field value, field name, gpu id (same as DCGM_FR_FIELD_VIOLATION, but it's a double) +#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u" +// field value, field name, gpu id, allowable threshold (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) +#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f" +// field name +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG \ + "Field %s is not supported by this API because it is neither an " \ + "int64 nor a double type." 
+// field name, allowable threshold, observed value, seconds +#define DCGM_FR_FIELD_THRESHOLD_TS_MSG \ + "%s met or exceeded the threshold of %lu per second: %lu at " \ + "%.1f seconds into the test." +// field name, allowable threshold, observed value, seconds (same as DCGM_FR_FIELD_THRESHOLD, but it's a double) +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG \ + "%s met or exceeded the threshold of %.1f per second: %.1f at " \ + "%.1f seconds into the test." +// total seconds of violation, gpu id +#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %.1f seconds for GPU %u" +// total seconds of violations, first instance, gpu id +#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG \ + "Thermal violations totaling %.1f seconds started at %.1f seconds " \ + "into the test for GPU %u" +// observed temperature, gpu id, max allowed temperature +#define DCGM_FR_TEMP_VIOLATION_MSG \ + "Temperature %lld of GPU %u exceeded user-specified maximum " \ + "allowed temperature %lld" +// gpu id, seconds into test, details about throttling +#define DCGM_FR_THROTTLING_VIOLATION_MSG \ + "Clocks are being throttled for GPU %u because of clock " \ + "throttling starting %.1f seconds into the test. %s" +// details about error +#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'" +// gpu id, PCIe generation, minimum allowed, parameter to control +#define DCGM_FR_PCIE_GENERATION_MSG \ + "GPU %u is running at PCI link generation %d, which is below " \ + "the minimum allowed link generation of %d (parameter '%s')" +// gpu id, PCIe width, minimum allowed, parameter to control +#define DCGM_FR_PCIE_WIDTH_MSG \ + "GPU %u is running at PCI link width %dX, which is below the " \ + "minimum allowed link generation of %d (parameter '%s')" +#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal" +// Test name +#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU." 
+// stat name, gpu id +#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u" +// observed value, minimum allowed, gpu id +#define DCGM_FR_STRESS_LEVEL_MSG \ + "Max stress level of %.1f did not reach desired stress level of " \ + "%.1f for GPU %u" +// CUDA API name +#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s" +// count, gpu id +#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u" +// error detail +#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s" +// gpu id +#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting" +// Test name, gpu id +#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u" +// percentage of memory we tried to allocate, gpu id +#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u" +// gpu id +#define DCGM_FR_CUDA_DBE_MSG \ + "CUDA APIs have indicated that a double-bit ECC error has " \ + "occured on GPU %u." +// gpu id +#define DCGM_FR_MEMORY_MISMATCH_MSG \ + "A memory mismatch was detected on GPU %u, but no error was " \ + "reported by CUDA or NVML." +// gpu id, error detail +#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'" +#define DCGM_FR_ECC_UNSUPPORTED_MSG "ECC Memory is not turned on or is unsupported. Skipping test." +// gpu id +#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state." 
+// gpu id, observed bandwidth, required, test name +#define DCGM_FR_MEMORY_BANDWIDTH_MSG \ + "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing " \ + "to meet %.2f GB/s for test %d" +// power draw observed, field tag, minimum power draw required, gpu id +#define DCGM_FR_TARGET_POWER_MSG \ + "Max power of %.1f did not reach desired power minimum %s of " \ + "%.1f for GPU %u" +// API name, error detail +#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'" +// API name, gpu id, error detail +#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'" +// gpu id, error detail +#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s" +// DCGM API name +#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s" +#define DCGM_FR_CONCURRENT_GPUS_MSG \ + "Unable to run concurrent pair bandwidth test without 2 or more " \ + "gpus. Skipping" +#define DCGM_FR_TOO_MANY_ERRORS_MSG \ + "This API can only return up to four errors per system. " \ + "Additional errors were found for this system that couldn't be " \ + "communicated." +// error count, gpu id +#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG \ + "%.1f %s NvLink errors found occuring per second on GPU %u, " \ + "exceeding the limit of 100 per second." +// error count, field name, gpu id +#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)" +// gpu id, power limit, power reached +#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG \ + "Enforced power limit on GPU %u set to %.1f, which is too low to " \ + "attempt to achieve target power %.1f" +// memory +#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host" +#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP." +// clock, count +#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks." 
+// clock, count, clock +#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG \ + "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ." +// error detail +#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s" +#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test." +#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache." +// gpu id +#define DCGM_FR_ROW_REMAP_FAILURE_MSG "GPU %u had uncorrectable memory errors and row remapping failed." +#define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)" +#define DCGM_FR_EMPTY_GPU_LIST_MSG "No valid GPUs passed to plugin" +#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG "Pending page retirements together with a DBE were detected on GPU %u." +// gpu id, rows remapped +#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG "GPU %u had uncorrectable memory errors and %u rows were remapped" +// gpu id +#define DCGM_FR_PENDING_ROW_REMAP_MSG "GPU %u had memory errors and row remappings are pending" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to in a peer-to-peer test: %s" +// gpu id, test name +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s" +// nvswitch id, nvlink id +#define DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG "NVSwitch %u's NvLink %u is down." 
+#define DCGM_FR_EUD_BINARY_PERMISSIONS_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ROOT_USER_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_SPAWN_FAILURE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_TIMEOUT_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_ZOMBIE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_MSG "" /* See message inplace */ +#define DCGM_FR_EUD_TEST_FAILED_MSG "" /* See message inplace */ +#define DCGM_FR_FILE_CREATE_PERMISSIONS_MSG \ + "The DCGM Diagnostic does not have permissions to create a file in directory '%s'" +#define DCGM_FR_PAUSE_RESUME_FAILED_MSG "" /* See message inplace */ +// gpu id +#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_MSG "GPU %u host-side PCIe replay violation, see dmesg for more information" +// xid error, gpu id +#define DCGM_FR_XID_ERROR_MSG "Detected XID %u for GPU %u" +// count, field, gpu id +#define DCGM_FR_SBE_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id +#define DCGM_FR_DBE_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id +#define DCGM_FR_PCIE_REPLAY_VIOLATION_MSG "Detected %ld %s for GPU %u" +// count, field, gpu id, threshold +#define DCGM_FR_SBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_DBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld" +#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_MSG "" +#define DCGM_FR_SXID_ERROR_MSG "Detected fatal NvSwitch SXID %u" +#define DCGM_FR_ERROR_SENTINEL_MSG "" /* See message inplace */ + +/* + * Suggestions for next steps for the corresponding error message + */ +#define DCGM_FR_OK_NEXT "N/A" +#define DCGM_FR_UNKNOWN_NEXT "" +#define DCGM_FR_UNRECOGNIZED_NEXT "" +#define DCGM_FR_PCI_REPLAY_RATE_NEXT \ + "Reconnect PCIe card. 
Run system side PCIE diagnostic utilities " \ + "to verify hops off the GPU board. If issue is on the board, run " \ + "the field diagnostic." +#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node." +#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload." +#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT "Monitor - this GPU can still perform workload" +#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption." +#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_POWER_UNREADABLE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload." +#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload." +#define DCGM_FR_NVLINK_DOWN_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload." +#define DCGM_FR_NVSWITCH_DOWN_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file." +#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_NEXT \ + "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info." + +#define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_NEXT \ + "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info." + +#define DCGM_FR_NVML_API_NEXT \ + "Check the error condition and ensure that appropriate libraries " \ + "are present and accessible." 
+#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT \ + "Check for the presence of cgroups, operating system blocks, and " \ + "or unsupported / older cards" +#define DCGM_FR_BAD_PARAMETER_NEXT BUG_REPORT_MSG +#define DCGM_FR_CANNOT_OPEN_LIB_NEXT \ + "Check for the existence of the library and set LD_LIBRARY_PATH " \ + "if needed." +#define DCGM_FR_DENYLISTED_DRIVER_NEXT "Please load the appropriate driver." +#define DCGM_FR_NVML_LIB_BAD_NEXT \ + "Make sure that the required version of libnvidia-ml.so " \ + "is present and accessible on the system." +#define DCGM_FR_GRAPHICS_PROCESSES_NEXT \ + "Stop the graphics processes or run this diagnostic on a server " \ + "that is not being used for display purposes." +#define DCGM_FR_HOSTENGINE_CONN_NEXT \ + "If hostengine is run separately, please ensure that it is up " \ + "and responsive." +#define DCGM_FR_FIELD_QUERY_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures." +#define DCGM_FR_PERSISTENCE_MODE_NEXT \ + "Enable persistence mode by running \"nvidia-smi -i -pm " \ + "1 \" as root." +#define DCGM_FR_LOW_BANDWIDTH_NEXT \ + "Verify that your minimum bandwidth setting is appropriate for " \ + "the topology of each GPU. If so, and errors are consistent, " \ + "please run a field diagnostic." +#define DCGM_FR_HIGH_LATENCY_NEXT \ + "Verify that your maximum latency setting is appropriate for " \ + "the topology of each GPU. If so, and errors are consistent, " \ + "please run a field diagnostic." 
+#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT "" +#define DCGM_FR_FIELD_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG +#define DCGM_FR_TEMP_VIOLATION_NEXT \ + "Verify that the user-specified temperature maximum is set " \ + "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG +#define DCGM_FR_THROTTLING_VIOLATION_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_INTERNAL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_PCIE_GENERATION_NEXT CONFIG_MSG +#define DCGM_FR_PCIE_WIDTH_NEXT CONFIG_MSG +#define DCGM_FR_ABORTED_NEXT "" +#define DCGM_FR_TEST_DISABLED_NEXT CONFIG_MSG +#define DCGM_FR_CANNOT_GET_STAT_NEXT \ + "If running a standalone nv-hostengine, verify that it is up " \ + "and responsive." +#define DCGM_FR_STRESS_LEVEL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_API_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CANNOT_SET_WATCHES_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_UNBOUND_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_ECC_DISABLED_NEXT \ + "Enable ECC memory by running \"nvidia-smi -i -e 1\" " \ + "to enable. This may require a GPU reset or reboot to take effect." +#define DCGM_FR_MEMORY_ALLOC_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CUDA_DEVICE_NEXT \ + "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of " \ + "this GPU. Also check if CUDA libraries are compatible and " \ + "correctly installed." 
+#define DCGM_FR_ECC_UNSUPPORTED_NEXT CONFIG_MSG +#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory." +#define DCGM_FR_MEMORY_BANDWIDTH_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high." +#define DCGM_FR_API_FAIL_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_API_FAIL_GPU_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CUDA_CONTEXT_NEXT \ + "Please make sure the correct driver version is installed and " \ + "verify that no conflicting libraries are present." +#define DCGM_FR_DCGM_API_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_CONCURRENT_GPUS_NEXT CONFIG_MSG +#define DCGM_FR_TOO_MANY_ERRORS_NEXT "" +#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT \ + "If this enforced power limit is necessary, then this test " \ + "cannot be run. If it is unnecessary, then raise the enforced " \ + "power limit setting to be able to run this test." +#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine." +#define DCGM_FR_GPU_OP_MODE_NEXT \ + "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i " \ + "" +#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT "" +#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT "" +#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT CONFIG_MSG +#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_ROW_REMAP_FAILURE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT +#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue." 
+#define DCGM_FR_EMPTY_GPU_LIST_NEXT CONFIG_MSG +#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT "" +#define DCGM_FR_PENDING_ROW_REMAP_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT BUG_REPORT_MSG +#define DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT \ + "Please check fabric manager and initialization logs to figure out why the link is down. " \ + "You may also need to run a field diagnostic." +#define DCGM_FR_EUD_BINARY_PERMISSIONS_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ROOT_USER_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_SPAWN_FAILURE_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_TIMEOUT_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_ZOMBIE_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_NEXT "" /* See message inplace */ +#define DCGM_FR_EUD_TEST_FAILED_NEXT "" /* See message inplace */ +#define DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT \ + "Please restart the hostengine with parameter --home-dir to specify a different home directory for the " \ + "diagnostic or change permissions in the current directory to allow the user to write files there." +#define DCGM_FR_PAUSE_RESUME_FAILED_NEXT "" /* See message inplace */ +#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_NEXT "" /* See message inplace */ +#define DCGM_FR_XID_ERROR_NEXT "Please consult the documentation for details of this XID." +#define DCGM_FR_SBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_DBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_PCIE_REPLAY_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_SBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_DBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG +#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_NEXT "Ensure that the FabricManager is running without errors." 
+#define DCGM_FR_SXID_ERROR_NEXT SYSTEM_TRIAGE_MSG +#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */ + +#ifdef __cplusplus +extern "C" +{ +#endif + + DCGM_PUBLIC_API dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code); + DCGM_PUBLIC_API dcgmErrorCategory_t dcgmErrorGetCategoryByCode(unsigned int code); + DCGM_PUBLIC_API const char* dcgmErrorGetFormatMsgByCode(unsigned int code); + + DCGM_PUBLIC_API const dcgm_error_meta_t* dcgmGetErrorMeta(dcgmError_t error); + DCGM_PUBLIC_API const char* errorString(dcgmReturn_t result); + + /** @} */ + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // DCGM_ERRORS_H diff --git a/pkg/ixdcgm/include/ixdcgmApiExport.h b/pkg/ixdcgm/include/ixdcgmApiExport.h index 789cd6fb8982c36a1021a5dab3abf5ee77e00c49..56053b3bfca80ebaef84583dce1a9b6f1b638a13 100644 --- a/pkg/ixdcgm/include/ixdcgmApiExport.h +++ b/pkg/ixdcgm/include/ixdcgmApiExport.h @@ -27,208 +27,39 @@ extern "C" #endif #define IXDCGM_PRIVATE_API __attribute((visibility("hidden"))) + /** + * Get the IxLink link status for every IxLink in this system. + * + * @param pixdcgmHandle IN: IxDCGM Handle + * @param linkStatus OUT: Structure in which to store IxLink link statuses. + * + * @return + * - \ref IXDCGM_RET_OK if the call was successful. + * - \ref IXDCGM_RET_NOT_SUPPORTED if the given entityGroup does not support enumeration. 
+ * - \ref IXDCGM_RET_BADPARAM if any parameter is invalid + * - \ref IXDCGM_RET_VER_MISMATCH if the version of linkStatus is not ixdcgmLinkStatus_v3 + */ + ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLinkStatus(ixdcgmHandle_t pixdcgmHandle, ixdcgmLinkStatus_v3 *linkStatus); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmInit(void); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStartEmbedded(ixdcgmStartEmbeddedParam *params); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineStart(unsigned short portNum, char const *socketPath, bool overTCP); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineRun(unsigned short portNumber, - char const *socketPath, - unsigned int isConnectionTCP); - IXDCGM_PUBLIC_API const char *ixdcgmErrorString(ixdcgmReturn_t result); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmDisconnect(ixdcgmHandle_t pixdcgmHandle); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConnect(const char *ipAddress, - ixdcgmConnectParams *connectParams, - ixdcgmHandle_t *pixdcgmHandle); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetEntityGroupEntities(ixdcgmHandle_t pixdcgmHandle, - ixdcgm_field_entity_group_t entityGroup, - ixdcgm_field_eid_t *entities, - int *numEntities, - unsigned int flags); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceAttributes(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuId, - ixdcgmDeviceAttributes_t *pixdcgmAttr); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetAllDevices(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES], - int *count); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetAllSupportedDevices(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES], - int *count); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEntitiesGetLatestValues(ixdcgmHandle_t pDcgmHandle, - ixdcgmGroupEntityPair_t entities[], - unsigned int entityCount, - unsigned short fields[], - unsigned int fieldCount, - unsigned int flags, - ixdcgmFieldValue_v2 values[]); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineVersionInfo(ixdcgmHandle_t 
pixdcgmHandle, - ixdcgmVersionInfo_t *pVersionInfo); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmVersionInfo(ixdcgmVersionInfo_t *pVersionInfo); - - /*Grouping APIs*/ - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupCreate(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGroupType_t type, - const char *groupName, - ixdcgmGpuGrp_t *groupId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupDestroy(ixdcgmHandle_t pixdcgmHandle, ixdcgmGpuGrp_t groupId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddEntity(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgm_field_entity_group_t entityGroupId, - ixdcgm_field_eid_t entityId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddDevice(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - unsigned int gpuId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveDevice(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - unsigned int gpuId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveEntity(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgm_field_entity_group_t entityGroupId, - ixdcgm_field_eid_t entityId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetInfo(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmGroupInfo_t *pDcgmGroupInfo); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetAllIds(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupIdList[], - unsigned int *count); - - /* Field Grouping APIs*/ - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupCreate(ixdcgmHandle_t pixdcgmHandle, - int numFieldIds, - unsigned short *fieldIds, - const char *fieldGroupName, - ixdcgmFieldGrp_t *fieldGroupId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupDestroy(ixdcgmHandle_t pixdcgmHandle, - ixdcgmFieldGrp_t fieldGroupId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetInfo(ixdcgmHandle_t pixdcgmHandle, - ixdcgmFieldGroupInfo_t *fieldGroupInfo); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetAll(ixdcgmHandle_t pixdcgmHandle, - ixdcgmAllFieldGroup_t 
*allGroupInfo); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmWatchFields(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmFieldGrp_t fieldGroupId, - long long updateFreq, - double maxKeepAge, - int maxKeepSamples); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUnwatchFields(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmFieldGrp_t fieldGroupId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusCreate(ixdcgmStatus_t *statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusDestroy(ixdcgmStatus_t statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusGetCount(ixdcgmStatus_t statusHandle, unsigned int *count); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusPopError(ixdcgmStatus_t statusHandle, - ixdcgmErrorInfo_t *pixdcgmErrorInfo); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusClear(ixdcgmStatus_t statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigGet(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmConfigType_t type, - int count, - ixdcgmConfig_t deviceConfigList[], - ixdcgmStatus_t statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigSet(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmConfig_t *pDeviceConfig, - ixdcgmStatus_t statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigEnforce(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmStatus_t statusHandle); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetValuesSince_v2(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmFieldGrp_t fieldGroupId, - long long sinceTimestamp, - long long *nextSinceTimestamp, - ixdcgmFieldValueEntityEnumeration_f enumCB, - void *userData); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValues_v2(ixdcgmHandle_t pDcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmFieldGrp_t fieldGroupId, - ixdcgmFieldValueEntityEnumeration_f enumCB, - void *userData); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStopEmbedded(ixdcgmHandle_t pixdcgmHandle); - - 
ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetFieldSummary(ixdcgmHandle_t pixdcgmHandle, - ixdcgmFieldSummaryRequest_t *request); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmShutdown(void); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleIdToName(ixdcgmModuleId_t id, char const **name); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValuesForFields(ixdcgmHandle_t pixdcgmHandle, - int gpuId, - unsigned short fields[], - unsigned int count, - ixdcgmFieldValue_v1 values[]); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUpdateAllFields(ixdcgmHandle_t pixdcgmHandle, int waitForUpdate); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineSetLoggingSeverity(ixdcgmHandle_t pixdcgmHandle, - ixdcgmSettingsSetLoggingSeverity_t *logging); - + /** + * Gets the 2 GPUs are on the same board or not. + * @param pixdcgmHandle IN: IxDCGM Handle + * @param gpuId1 IN: GPU1 Id + * @param gpuId2 IN: GPU2 Id + * @param onSameBoard IN/OUT: On same board info of the GPU pair. 0= not on the same board; 1= on the same board + * + * @return + * - \ref IXDCGM_RET_OK if the call was successful. + * - \ref IXDCGM_RET_BADPARAM if gpuId1, gpuId2 or onSameBoard were not valid. 
+ */ ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmDeviceOnSameBoard(ixdcgmHandle_t pixdcgmHandle, unsigned int gpuId1, unsigned int gpuId2, int *onSameBoard); - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleGetStatuses(ixdcgmHandle_t pixdcgmHandle, - ixdcgmModuleGetStatuses_t *moduleStatuses); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleDenylist(ixdcgmHandle_t pixdcgmHandle, ixdcgmModuleId_t moduleId); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceTopology(ixdcgmHandle_t pixdcgmHandle, - unsigned int gpuId, - ixdcgmDeviceTopology_t *deviceTopology); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetGroupTopology(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmGroupTopology_t *pixdcgmGroupTopology); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmSelectGpusByTopology(ixdcgmHandle_t pixdcgmHandle, - uint64_t inputGpuIds, - uint32_t numGpus, - uint64_t *outputGpuIds, - uint64_t hintFlags); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthGet(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmHealthSystems_t *systems); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthSet_v2(ixdcgmHandle_t pixdcgmHandle, ixdcgmHealthSetParams_v2 *params); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthCheck_v4(ixdcgmHandle_t pixdcgmHandle, - ixdcgmGpuGrp_t groupId, - ixdcgmHealthResponse_v4 *response); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLinkStatus(ixdcgmHandle_t pixdcgmHandle, ixdcgmLinkStatus_v3 *linkStatus); - - ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmProfGetSupportedMetricGroups(ixdcgmHandle_t pixdcgmHandle, - ixdcgmProfGetMetricGroups_t *metricGroups); - /** * Gets all the running process info corresponding to the gpuId . 
- * @param pixdcgmHandle IN: ixDCGM Handle + * @param pixdcgmHandle IN: IxDCGM Handle * @param gpuId IN: GPU Id corresponding to which the processes info should be fetched * @param infoCount IN/OUT: * IN - max number of the info could be stored in to the pids and usedMemoryBytes buffer @@ -238,9 +69,9 @@ extern "C" * @param usedMemoryBytes OUT: Buffer to store returned processes used memory in byte * * @return - * - \ref DCGM_ST_OK if the call was successful. + * - \ref IXDCGM_RET_OK if the call was successful. * - \ref IXDCGM_RET_INSUFFICIENT_SIZE if the infoCount input is smaller than the buffer needed. - * - \ref DCGM_ST_BADPARAM if gpuId, infoCunt, pids or usedMemoryBytes not valid. + * - \ref IXDCGM_RET_BADPARAM if gpuId, infoCunt, pids or usedMemoryBytes not valid. **/ ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceRunningProcesses(ixdcgmHandle_t pixdcgmHandle, unsigned int gpuId, @@ -248,6 +79,8 @@ extern "C" uint64_t *pids, uint64_t *usedMemoryBytes); + IXDCGM_PUBLIC_API const char *ixdcgmErrorString(ixdcgmReturn_t result); + #ifdef __cplusplus } #endif diff --git a/pkg/ixdcgm/policy.go b/pkg/ixdcgm/policy.go new file mode 100644 index 0000000000000000000000000000000000000000..ada6c23fe7daba7cead8b4a230c4c1b3ea5be188 --- /dev/null +++ b/pkg/ixdcgm/policy.go @@ -0,0 +1,464 @@ +/* +Copyright (c) 2024, NVIDIA CORPORATION. +Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package ixdcgm + +/* +#include "include/dcgm_agent.h" +#include "include/dcgm_structs.h" + +// wrapper for go callback function +extern int violationNotify(void* p); +extern int voidCallback(void* p); +*/ +import "C" +import ( + "context" + "encoding/binary" + "fmt" + "log" + "math/rand" + "sync" + "time" + "unsafe" + + "github.com/creasty/defaults" +) + +// At least one policy must be enabled. +type PolicyConditionParams struct { + // DbePolicyEnabled indicates whether the DbePolicy is enabled. Default is false (disabled). + DbePolicyEnabled bool `default:"false"` + + // PCIePolicyEnabled indicates whether the PCIePolicy is enabled. Default is false (disabled). + PCIePolicyEnabled bool `default:"false"` + + // MaxRtPgPolicyEnabled indicates whether the MaxRtPgPolicy is enabled. Default is false (disabled). + MaxRtPgPolicyEnabled bool `default:"false"` + + // MaxRtPgPolicyThreshold specifies the maximum number of retired pages that will trigger a violation. + // Note that the MaxRtPgPolicyThreshold will be ignored if MaxRtPgPolicy is disabled. + // Default value is 10. + MaxRtPgPolicyThreshold uint32 `default:"10"` + + // ThermalPolicyEnabled indicates whether the ThermalPolicy is enabled. Default is false (disabled). + ThermalPolicyEnabled bool `default:"false"` + + // ThermalPolicyThreshold specifies the maximum temperature a group's GPUs can reach before triggering a violation. + // Note that the ThermalPolicyThreshold will be ignored if ThermalPolicy is disabled. + // Default value is 100 and the unit is in degrees Celsius (°C). + ThermalPolicyThreshold uint32 `default:"100"` + + // PowerPolicyEnabled indicates whether the PowerPolicy is enabled. Default is false (disabled). + PowerPolicyEnabled bool `default:"false"` + + // PowerPolicyThreshold specifies the maximum power a group's GPUs can reach before triggering a violation. + // Note that the PowerPolicyThreshold will be ignored if PowerPolicy is disabled.
+ // Default value is 250 and the unit is in watts (W). + PowerPolicyThreshold uint32 `default:"250"` +} + +type policyCondition string + +const ( + DbePolicy = policyCondition("Double-bit ECC Error") + PCIePolicy = policyCondition("PCI Error") + MaxRtPgPolicy = policyCondition("Max Retired Pages Limit") + ThermalPolicy = policyCondition("Thermal Limit") + PowerPolicy = policyCondition("Power Limit") +) + +type PolicyViolation struct { + Condition policyCondition + Timestamp time.Time + Data interface{} +} + +type policyIndex int + +const ( + dbePolicyIndex policyIndex = iota + pciePolicyIndex + maxRtPgPolicyIndex + thermalPolicyIndex + powerPolicyIndex +) + +type policyConditionParam struct { + typ uint32 + value uint32 +} + +type DbePolicyCondition struct { + Location string + NumErrors uint +} + +type PciPolicyCondition struct { + ReplayCounter uint +} + +type RetiredPagesPolicyCondition struct { + SbePages uint + DbePages uint +} + +type ThermalPolicyCondition struct { + ThermalViolation uint +} + +type PowerPolicyCondition struct { + PowerViolation uint +} + +var ( + policyChanOnce sync.Once + policyMapOnce sync.Once + + // callbacks maps PolicyViolation channels with policy + // captures C callback() value for each violation condition + callbacks map[string]chan PolicyViolation + + // paramMap maps C.dcgmPolicy_t.parms index and limits + // to be used in setPolicy() for setting user selected policies + paramMap map[policyIndex]policyConditionParam + + registerCh = make(chan struct{}) +) + +func makePolicyChannels() { + policyChanOnce.Do(func() { + callbacks = make(map[string]chan PolicyViolation) + callbacks["dbe"] = make(chan PolicyViolation, 1) + callbacks["pcie"] = make(chan PolicyViolation, 1) + callbacks["maxrtpg"] = make(chan PolicyViolation, 1) + callbacks["thermal"] = make(chan PolicyViolation, 1) + callbacks["power"] = make(chan PolicyViolation, 1) + }) +} + +func makePolicyParamsMap(params *PolicyConditionParams) { + const ( + policyFieldTypeBool = 
0 + policyFieldTypeLong = 1 + policyBoolValue = 1 + ) + + policyMapOnce.Do(func() { + paramMap = make(map[policyIndex]policyConditionParam) + + paramMap[dbePolicyIndex] = policyConditionParam{ + typ: policyFieldTypeBool, + value: policyBoolValue, + } + + paramMap[pciePolicyIndex] = policyConditionParam{ + typ: policyFieldTypeBool, + value: policyBoolValue, + } + + paramMap[maxRtPgPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeLong, + value: params.MaxRtPgPolicyThreshold, + } + + paramMap[thermalPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeLong, + value: params.ThermalPolicyThreshold, + } + + paramMap[powerPolicyIndex] = policyConditionParam{ + typ: policyFieldTypeLong, + value: params.PowerPolicyThreshold, + } + + }) +} + +func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) { + var policy C.dcgmPolicy_t + policy.version = makeVersion1(unsafe.Sizeof(policy)) + policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO) + policy.action = C.DCGM_POLICY_ACTION_NONE + policy.isolation = C.DCGM_POLICY_ISOLATION_NONE + policy.validation = C.DCGM_POLICY_VALID_NONE + policy.condition = condition + + // iterate on paramMap for given policy conditions + for _, key := range paramList { + conditionParam, exists := paramMap[policyIndex(key)] + if !exists { + return fmt.Errorf("Error: Invalid Policy condition, %v does not exist", key) + } + // set policy condition parameters + // set condition type (bool or longlong) + policy.parms[key].tag = conditionParam.typ + + // set condition val (violation threshold) + // policy.parms.val is a C union type + // cgo docs: Go doesn't have support for C's union type + // C union types are represented as a Go byte array + binary.LittleEndian.PutUint32(policy.parms[key].val[:], conditionParam.value) + } + + var statusHandle C.dcgmStatus_t + + result := C.dcgmPolicySet(handle.handle, groupId.handle, &policy, statusHandle) + if err = errorString(result); err != nil 
{ + return fmt.Errorf("Error setting policies: %s", err) + } + + log.Println("Policy successfully set.") + + return +} + +func validatePolicy(p *PolicyConditionParams) error { + if err := defaults.Set(p); err != nil { + return err + } + if !(p.DbePolicyEnabled || p.PCIePolicyEnabled || p.MaxRtPgPolicyEnabled || p.ThermalPolicyEnabled || p.PowerPolicyEnabled) { + return fmt.Errorf("bad parameters: at least one policy must be enabled") + } + return nil +} + +func registerPolicyForGpus(ctx context.Context, params *PolicyConditionParams, gpuIds ...uint) (<-chan PolicyViolation, error) { + groupId, err := CreateGroup(fmt.Sprintf("PolicyGroup_%d", rand.Uint64())) + if err != nil { + return nil, fmt.Errorf("failed to create policy group, err: %v", err) + } + + go func() { + <-ctx.Done() + select { + case <-registerCh: // Wait the policy is unregistered + _ = DestroyGroup(groupId) + case <-time.After(500 * time.Millisecond): + _ = DestroyGroup(groupId) + } + }() + + for _, gpuId := range gpuIds { + err = AddToGroup(groupId, gpuId) + if err != nil { + return nil, fmt.Errorf("failed to add gpu %d to policy group, err: %v", gpuId, err) + } + } + + return registerPolicy(ctx, groupId, params) +} + +// registerPolicy sets GPU usage and error policies and notifies in case of any violations on GPUs within a specific group +func registerPolicy(ctx context.Context, groupId GroupHandle, params *PolicyConditionParams) (<-chan PolicyViolation, error) { + if params == nil { + return nil, fmt.Errorf("PolicyConditionParams is required") + } + if err := validatePolicy(params); err != nil { + return nil, err + } + + // init policy globals for internal API + makePolicyChannels() + makePolicyParamsMap(params) + + // make a list of policy conditions for setting their parameters + var paramKeys []policyIndex + // get all conditions to be set in setPolicy() + var condition C.dcgmPolicyCondition_t = 0 + // get length of enabled condition types + var conTypes int = 0 + if params.DbePolicyEnabled { 
+ conTypes++ + paramKeys = append(paramKeys, dbePolicyIndex) + condition |= C.DCGM_POLICY_COND_DBE + } + if params.PCIePolicyEnabled { + conTypes++ + paramKeys = append(paramKeys, pciePolicyIndex) + condition |= C.DCGM_POLICY_COND_PCI + } + if params.MaxRtPgPolicyEnabled { + conTypes++ + paramKeys = append(paramKeys, maxRtPgPolicyIndex) + condition |= C.DCGM_POLICY_COND_MAX_PAGES_RETIRED + } + if params.ThermalPolicyEnabled { + conTypes++ + paramKeys = append(paramKeys, thermalPolicyIndex) + condition |= C.DCGM_POLICY_COND_THERMAL + } + if params.PowerPolicyEnabled { + conTypes++ + paramKeys = append(paramKeys, powerPolicyIndex) + condition |= C.DCGM_POLICY_COND_POWER + } + + var err error + if err = setPolicy(groupId, condition, paramKeys); err != nil { + return nil, err + } + + result := C.dcgmPolicyRegister(handle.handle, groupId.handle, + C.dcgmPolicyCondition_t(condition), + C.fpRecvUpdates(C.violationNotify), + C.fpRecvUpdates(C.voidCallback), + ) + + if err = errorString(result); err != nil { + return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} + } + log.Println("Listening for violations...") + + violation := make(chan PolicyViolation, conTypes) + + go func() { + defer func() { + log.Println("unregister policy violation...") + unregisterPolicy(groupId, condition) + close(violation) + close(registerCh) + }() + for { + select { + case dbe := <-callbacks["dbe"]: + violation <- dbe + case pcie := <-callbacks["pcie"]: + violation <- pcie + case maxrtpg := <-callbacks["maxrtpg"]: + violation <- maxrtpg + case thermal := <-callbacks["thermal"]: + violation <- thermal + case power := <-callbacks["power"]: + violation <- power + case <-ctx.Done(): + return + } + } + }() + + return violation, err +} + +func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) { + result := C.dcgmPolicyUnregister(handle.handle, groupId.handle, condition) + + if err := errorString(result); err != nil { + log.Println(fmt.Errorf("error 
unregistering policy: %s", err)) + } +} + +func createTimeStamp(t C.longlong) time.Time { + tm := int64(t) / 1000000 + ts := time.Unix(tm, 0) + return ts +} + +func dbeLocation(location int) string { + switch location { + case 0: + return "L1" + case 1: + return "L2" + case 2: + return "Device" + case 3: + return "Register" + case 4: + return "Texture" + } + return "N/A" +} + +// VoidCallback is a go callback function for dcgmPolicyRegister() wrapped in C.voidCallback() +// +//export VoidCallback +func VoidCallback(data unsafe.Pointer) int { + return 0 +} + +// ViolationRegistration is a go callback function for dcgmPolicyRegister() wrapped in C.violationNotify() +// +//export ViolationRegistration +func ViolationRegistration(data unsafe.Pointer) int { + var con policyCondition + var timestamp time.Time + var val interface{} + + response := *(*C.dcgmPolicyCallbackResponse_t)(unsafe.Pointer(data)) + + switch response.condition { + case C.DCGM_POLICY_COND_DBE: + dbe := (*C.dcgmPolicyConditionDbe_t)(unsafe.Pointer(&response.val)) + con = DbePolicy + timestamp = createTimeStamp(dbe.timestamp) + val = DbePolicyCondition{ + Location: dbeLocation(int(dbe.location)), + NumErrors: *uintPtr(dbe.numerrors), + } + case C.DCGM_POLICY_COND_PCI: + pci := (*C.dcgmPolicyConditionPci_t)(unsafe.Pointer(&response.val)) + con = PCIePolicy + timestamp = createTimeStamp(pci.timestamp) + val = PciPolicyCondition{ + ReplayCounter: *uintPtr(pci.counter), + } + case C.DCGM_POLICY_COND_MAX_PAGES_RETIRED: + mpr := (*C.dcgmPolicyConditionMpr_t)(unsafe.Pointer(&response.val)) + con = MaxRtPgPolicy + timestamp = createTimeStamp(mpr.timestamp) + val = RetiredPagesPolicyCondition{ + SbePages: *uintPtr(mpr.sbepages), + DbePages: *uintPtr(mpr.dbepages), + } + case C.DCGM_POLICY_COND_THERMAL: + thermal := (*C.dcgmPolicyConditionThermal_t)(unsafe.Pointer(&response.val)) + con = ThermalPolicy + timestamp = createTimeStamp(thermal.timestamp) + val = ThermalPolicyCondition{ + ThermalViolation: 
*uintPtr(thermal.thermalViolation), + } + case C.DCGM_POLICY_COND_POWER: + pwr := (*C.dcgmPolicyConditionPower_t)(unsafe.Pointer(&response.val)) + con = PowerPolicy + timestamp = createTimeStamp(pwr.timestamp) + val = PowerPolicyCondition{ + PowerViolation: *uintPtr(pwr.powerViolation), + } + } + + err := PolicyViolation{ + Condition: con, + Timestamp: timestamp, + Data: val, + } + + switch con { + case DbePolicy: + callbacks["dbe"] <- err + case PCIePolicy: + callbacks["pcie"] <- err + case MaxRtPgPolicy: + callbacks["maxrtpg"] <- err + case ThermalPolicy: + callbacks["thermal"] <- err + case PowerPolicy: + callbacks["power"] <- err + } + return 0 +} diff --git a/pkg/ixdcgm/process_info.go b/pkg/ixdcgm/process_info.go index 74955a883a8dc20725f493914fb92f4d31525b8a..8fe9f231baaae01344c29310bbeef26c43a73e31 100644 --- a/pkg/ixdcgm/process_info.go +++ b/pkg/ixdcgm/process_info.go @@ -59,14 +59,13 @@ func ixdcgmGetDeviceRunningProcesses(gpuId uint) (cnt C.uint32_t, pids []C.uint6 usedMemoryBytes = make([]C.uint64_t, cnt) ret := C.ixdcgmGetDeviceRunningProcesses(C.ulong(handle.handle), C.uint(gpuId), &cnt, &pids[0], &usedMemoryBytes[0]) if ret == C.IXDCGM_RET_OK { - // fmt.Printf("the number of valid pids/usedMemoryBytes info is %d\n", uint32(cnt)) err = nil return } else if ret == C.IXDCGM_RET_INSUFFICIENT_SIZE { - // fmt.Printf("INSUFFICIENT_SIZE Warnnig: the needed buffer size is %d\n", uint32(cnt)) + fmt.Printf("INSUFFICIENT_SIZE Warnnig: the needed buffer size is %d\n", uint32(cnt)) continue - } else if ret == C.IXDCGM_RET_BADPARAM { - err = fmt.Errorf("bad parameter") + } else { + err = ixdcgmErrorString(ret) return } } diff --git a/pkg/ixdcgm/standalone.go b/pkg/ixdcgm/standalone.go index 7f1163d55bb6ef01b256aa51aed799a48553375e..76f30856449ee8c9b3de92b72a115cab4f38a173 100644 --- a/pkg/ixdcgm/standalone.go +++ b/pkg/ixdcgm/standalone.go @@ -49,7 +49,7 @@ func (s *standalone) Shutdown() error { } func (s *standalone) Start(args ...string) (DcgmHandle, error) { - 
fmt.Printf("standalone mode: %v\n", args) + fmt.Println("Start ixdcgm based on Standalone mode.") if len(args) < 2 { return DcgmHandle{}, fmt.Errorf("missing dcgm address or port") diff --git a/pkg/ixdcgm/startHostengine.go b/pkg/ixdcgm/startHostengine.go index 9c3dd923100617f1c7b9c4b877bb34ae05bfad0a..0a46990c30e413b4f401156ce3f4fc96b1af941f 100644 --- a/pkg/ixdcgm/startHostengine.go +++ b/pkg/ixdcgm/startHostengine.go @@ -28,6 +28,6 @@ func (s *startHostengine) Shutdown() (err error) { } func (s startHostengine) Start(args ...string) (DcgmHandle, error) { - fmt.Println("hello startHostengine mode") + fmt.Println("Start ixdcgm based on StartHostengine mode.") return DcgmHandle{}, nil } diff --git a/pkg/ixdcgm/topology.go b/pkg/ixdcgm/topology.go index 47ed56fdb44ce88c660136260441a8a5bee994c9..85d5b19120b40636887299ab7c84dc3ab75c5cdb 100644 --- a/pkg/ixdcgm/topology.go +++ b/pkg/ixdcgm/topology.go @@ -1,12 +1,13 @@ /* +Copyright (c) 2024, NVIDIA CORPORATION. Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); you may -not use this file except in compliance with the License. You may obtain -a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -27,18 +28,128 @@ import ( "unsafe" ) -func getDeviceTopology(deviceId uint) (err error) { +type P2PLinkType uint + +const ( + P2PLinkUnknown P2PLinkType = iota // N/A + + P2PLinkCrossCPU // SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + P2PLinkSameCPU // NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + P2PLinkHostBridge // PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + P2PLinkMultiSwitch // PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + P2PLinkSingleSwitch // PIX = Connection traversing at most a single PCIe bridge + P2PLinkSameBoard // INTE = Connection traversing at most a single on-board PCIe bridge + P2PLinkIXLINK1 // IX1 = Connection traversing a single IXLink + P2PLinkIXLINK2 // IX2 = Connection traversing two IXLinks + P2PLinkIXLINK3 // IX3 = Connection traversing three IXLinks + P2PLinkIXLINK4 // IX4 = Connection traversing four IXLinks + P2PLinkIXLINK5 // IX5 = Connection traversing five IXLinks + P2PLinkIXLINK6 // IX6 = Connection traversing six IXLinks +) + +func (l P2PLinkType) PCIPaths() string { + switch l { + case P2PLinkSameBoard: + return "INTE" + case P2PLinkSingleSwitch: + return "PIX" + case P2PLinkMultiSwitch: + return "PXB" + case P2PLinkHostBridge: + return "PHB" + case P2PLinkSameCPU: + return "NODE" + case P2PLinkCrossCPU: + return "SYS" + case P2PLinkIXLINK1: + return "IX1" + case P2PLinkIXLINK2: + return "IX2" + case P2PLinkIXLINK3: + return "IX3" + case P2PLinkIXLINK4: + return "IX4" + case P2PLinkIXLINK5: + return "IX5" + case P2PLinkIXLINK6: + return "IX6" + case 
P2PLinkUnknown: + } + return "N/A" +} + +type P2PLink struct { + GPU uint + BusID string + Link P2PLinkType +} + +func getP2PLink(path uint) P2PLinkType { + switch path { + case C.DCGM_TOPOLOGY_BOARD: + return P2PLinkSameBoard + case C.DCGM_TOPOLOGY_SINGLE: + return P2PLinkSingleSwitch + case C.DCGM_TOPOLOGY_MULTIPLE: + return P2PLinkMultiSwitch + case C.DCGM_TOPOLOGY_HOSTBRIDGE: + return P2PLinkHostBridge + case C.DCGM_TOPOLOGY_CPU: + return P2PLinkSameCPU + case C.DCGM_TOPOLOGY_SYSTEM: + return P2PLinkCrossCPU + case C.DCGM_TOPOLOGY_NVLINK1: + return P2PLinkIXLINK1 + case C.DCGM_TOPOLOGY_NVLINK2: + return P2PLinkIXLINK2 + case C.DCGM_TOPOLOGY_NVLINK3: + return P2PLinkIXLINK3 + case C.DCGM_TOPOLOGY_NVLINK4: + return P2PLinkIXLINK4 + case C.DCGM_TOPOLOGY_NVLINK5: + return P2PLinkIXLINK5 + case C.DCGM_TOPOLOGY_NVLINK6: + return P2PLinkIXLINK6 + } + return P2PLinkUnknown +} + +func getBusid(gpuid uint) (string, error) { + var device C.dcgmDeviceAttributes_v3 + device.version = makeVersion3(unsafe.Sizeof(device)) + + result := C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &device) + if err := errorString(result); err != nil { + return "", fmt.Errorf("Error getting device busid: %s", err) + } + return *stringPtr(&device.identifiers.pciBusId[0]), nil +} + +func getDeviceTopology(gpuid uint) (links []P2PLink, err error) { var topology C.dcgmDeviceTopology_v1 topology.version = makeVersion1(unsafe.Sizeof(topology)) - res := C.dcgmGetDeviceTopology(handle.handle, C.uint(deviceId), &topology) - if res == C.DCGM_ST_NOT_SUPPORTED { - fmt.Println("not supported") - return nil + result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology) + if result == C.DCGM_ST_NOT_SUPPORTED { + return links, fmt.Errorf("DcgmGetDeviceTopology is not supported") + } + if result != C.DCGM_ST_OK { + return links, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result} } - if res != C.DCGM_ST_OK { - return fmt.Errorf("error getting device topology %s", 
C.GoString(C.errorString(res))) + + busid, err := getBusid(gpuid) + if err != nil { + return } - return nil + for i := uint(0); i < uint(topology.numGpus); i++ { + gpu := topology.gpuPaths[i].gpuId + p2pLink := P2PLink{ + GPU: uint(gpu), + BusID: busid, + Link: getP2PLink(uint(topology.gpuPaths[i].path)), + } + links = append(links, p2pLink) + } + return } diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go index 7ab4650210f05a31fc78547707ebb2d3087ca7e9..2a19c44821e4a7c479ba3ca897f93a91574b9ef7 100644 --- a/pkg/ixdcgm/utils.go +++ b/pkg/ixdcgm/utils.go @@ -30,9 +30,33 @@ package ixdcgm import "C" import ( "fmt" + "strconv" + "strings" "unsafe" ) +const ( + dcgmInt32Blank = 0x7ffffff0 // 2147483632 + dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792 +) + +func uintPtr(c C.uint) *uint { + i := uint(c) + return &i +} + +func stringPtr(c *C.char) *string { + s := C.GoString(c) + return &s +} + +type DcgmError struct { + msg string // description of error + Code C.dcgmReturn_t // dcgmReturn_t value of error +} + +func (e *DcgmError) Error() string { return e.msg } + func makeVersion1(struct_type uintptr) C.uint { version := C.uint(struct_type | 1<<24) return version @@ -48,6 +72,16 @@ func makeVersion3(struct_type uintptr) C.uint { return version } +func makeVersion4(struct_type uintptr) C.uint { + version := C.uint(struct_type | 4<<24) + return version +} + +func makeVersion5(struct_type uintptr) C.uint { + version := C.uint(struct_type | 5<<24) + return version +} + func errorString(result C.dcgmReturn_t) error { if result == C.DCGM_ST_OK { return nil @@ -87,3 +121,61 @@ func removeBytesSpaces(originalBytes []byte) string { return string(cleanedBytes) } + +// convertBitsetStr converts a set of numbers in string format to a range representation. 
+// input sample: "{0,1,2,3,6,10,11,12,13}" +// output sample: "0-3,6,10-13" +func convertBitsetStr(input string) (output string) { + input = strings.Trim(input, "{}") + numStrs := strings.Split(input, ",") + nums := make([]int, len(numStrs)) + + // Convert string numbers to integers + for i, numStr := range numStrs { + num, err := strconv.Atoi(strings.TrimSpace(numStr)) + if err != nil { + panic(err) + } + nums[i] = num + } + + // Sort the numbers (assuming they are not sorted) + // If the input is always sorted, you can skip this step + for i := 0; i < len(nums)-1; i++ { + for j := 0; j < len(nums)-1-i; j++ { + if nums[j] > nums[j+1] { + nums[j], nums[j+1] = nums[j+1], nums[j] + } + } + } + + // Process the numbers to create ranges + var result []string + start := nums[0] + end := nums[0] + + for i := 1; i < len(nums); i++ { + if nums[i] == end+1 { + end = nums[i] + } else { + if start == end { + result = append(result, strconv.Itoa(start)) + } else { + result = append(result, fmt.Sprintf("%d-%d", start, end)) + } + start = nums[i] + end = nums[i] + } + } + + // Handle the last range + if start == end { + result = append(result, strconv.Itoa(start)) + } else { + result = append(result, fmt.Sprintf("%d-%d", start, end)) + } + + // Join the result into a single string + output = strings.Join(result, ",") + return +} diff --git a/samples/devicecommon/main.go b/samples/devicecommon/main.go index e758b53734413c9c838b9679ade725fca5c934a5..f4c9651a3f543e9b96ebfef5e95c9bce2738ceb5 100644 --- a/samples/devicecommon/main.go +++ b/samples/devicecommon/main.go @@ -30,7 +30,7 @@ var ( ) func main() { - // choose ixdcgm hostengine running mode + // Choose ixdcgm hostengine running mode // 1. ixdcgm.Embedded // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" // 3. 
ixdcgm.StartHostengine diff --git a/samples/deviceinfo/main.go b/samples/deviceinfo/main.go index a1ededd2fa994289d5c6b992414106ce08139a51..791b124929cfcb92cffa57a9bd9d1a5f9be981d0 100644 --- a/samples/deviceinfo/main.go +++ b/samples/deviceinfo/main.go @@ -18,7 +18,6 @@ limitations under the License. package main import ( - "flag" "fmt" "html/template" "log" @@ -29,7 +28,7 @@ import ( const ( deviceInfo = `Driver Version : {{.Identifiers.DriverVersion}} -GPUId : {{.GPUId}} +GPUId : {{.GPUId}} IxDCGMSupported : {{.IxDCGMSupported}} Uuid : {{.Uuid}} Product Name : {{.Identifiers.ProductName}} @@ -41,22 +40,20 @@ Used Memory (MB): : {{or .MemoryUsage.Used "N/A"}} Free Memory (MB): : {{or .MemoryUsage.Free "N/A"}} Bandwidth (MB/s) : {{or .PCI.Bandwidth "N/A"}} PowerLimit (W) : {{or .PowerLimit "N/A"}} ---------------------------------------------------------------------- +CPUAffinity : {{or .CPUAffinity "N/A"}} +NUMAAffinity : {{or .NUMAAffinity "N/A"}} +P2P Available : {{if not .Topology}}None{{else}}{{range .Topology}} + GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}} +-------------------------------------------------- ` ) -var ( - connectAddr = flag.String("connectAddr", "0.0.0.0:5777", "DCGM connect address") - isSocket = flag.String("socket", "0", "Connect to Unix socket") -) - func main() { - // choose ixdcgm hostengine running mode + // Choose ixdcgm hostengine running mode // 1. ixdcgm.Embedded // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" // 3. 
ixdcgm.StartHostengine - flag.Parse() - cleanup, err := ixdcgm.Init(ixdcgm.Standalone, *connectAddr, *isSocket) + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) if err != nil { panic(err) } @@ -76,7 +73,7 @@ func main() { } if err = t.Execute(os.Stdout, d); err != nil { - log.Panicln("Template error:", err) + log.Panicln("Template error: ", err) } } } diff --git a/samples/deviceprocessinfo/main.go b/samples/deviceprocessinfo/main.go index f16b1bad8ef23f1d296eaf40cd29589cbbf45e47..01fda9b13b6e38250f898f853b11d15be3cb4fb8 100644 --- a/samples/deviceprocessinfo/main.go +++ b/samples/deviceprocessinfo/main.go @@ -20,18 +20,16 @@ package main import ( "fmt" "log" - "os" - "os/signal" - "syscall" "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" ) func main() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) - - cleanup, err := ixdcgm.Init(ixdcgm.Embedded, "LogInfo") + // Choose ixdcgm hostengine running mode + // 1. ixdcgm.Embedded + // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" + // 3. 
ixdcgm.StartHostengine + cleanup, err := ixdcgm.Init(ixdcgm.Embedded, "LogWarn") if err != nil { log.Panicln(err) } @@ -49,7 +47,7 @@ func main() { fmt.Printf("%v\n", err) } for _, info := range infos { - fmt.Printf("> Pid: %d, Name: %s, UsedGpuMemory(MiB): %d\n", info.Pid, info.Name, info.UsedGpuMemory) + fmt.Printf("> Pid: %d\n Name: %s\n UsedGpuMemory(MiB): %d\n", info.Pid, info.Name, info.UsedGpuMemory) } fmt.Println("---------------------------------------------------------------------") } diff --git a/samples/devicestatus/main.go b/samples/devicestatus/main.go index 91dbb1e743658bba99112ad9b87d20c24f8429f4..fee63857ff04ee9cc5cef89a522fa5eccd2621a5 100644 --- a/samples/devicestatus/main.go +++ b/samples/devicestatus/main.go @@ -20,17 +20,15 @@ package main import ( "fmt" "log" - "os" - "os/signal" - "syscall" "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" ) func main() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) - + // Choose ixdcgm hostengine running mode + // 1. ixdcgm.Embedded + // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" + // 3. 
ixdcgm.StartHostengine cleanup, err := ixdcgm.Init(ixdcgm.Embedded) if err != nil { log.Panicln(err) diff --git a/samples/health/main.go b/samples/health/main.go new file mode 100644 index 0000000000000000000000000000000000000000..cc7802ab209762f6326944bf6907572e080fea29 --- /dev/null +++ b/samples/health/main.go @@ -0,0 +1,70 @@ +package main + +import ( + "log" + "os" + "os/signal" + "syscall" + "text/template" + "time" + + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" +) + +const ( + HealthStatus = `GPU : {{.GPU}} +Status : {{.Status}} +{{range .Watches}} +Type : {{.Type}} +Status : {{.Status}} +Error : {{.Error}} +{{end}} +` +) + +// Based on ixdcgmi health commands: +// - Create group: ixdcgmi group -c +// - Enable all watches: ixdcgmi health -g GROUPID -s a +// - Check: ixdcgmi health -g GROUPID -c +func main() { + // Choose ixdcgm hostengine running mode + // 1. ixdcgm.Embedded + // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" + // 3. ixdcgm.StartHostengine + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) + if err != nil { + log.Panicln(err) + } + defer cleanup() + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + + gpuIds, err := ixdcgm.GetSupportedDevices() + if err != nil { + log.Panicln(err) + } + + ticker := time.NewTicker(time.Second * 1) + defer ticker.Stop() + + t := template.Must(template.New("HealthStatus").Parse(HealthStatus)) + for { + select { + case <-ticker.C: + for _, gpuId := range gpuIds { + h, err := ixdcgm.HealthCheckByGpuId(gpuId) + if err != nil { + log.Panicln(err) + } + + if err = t.Execute(os.Stdout, h); err != nil { + log.Panicln("Template error: ", err) + } + } + case <-sigs: + return + } + } + +} diff --git a/samples/policy/main.go b/samples/policy/main.go new file mode 100644 index 0000000000000000000000000000000000000000..9a81a958d0b9b6774a8ffc0a4b4918c82ee2b7a0 --- /dev/null +++ b/samples/policy/main.go @@ -0,0 +1,75 @@ +package main + +import ( + "context" + "fmt" + "log" + "os" 
+ "os/signal" + "syscall" + "time" + + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" +) + +// Based on ixdcgmi policy commands: +// - Create group: ixdcgmi group -c +// - Set violation policy: ixdcgmi policy -g GROUPID --set 0,0 -e -p -T 60 +// - Register for policy updates: ixdcgmi policy -g GROUPID --reg +func main() { + // Choose ixdcgm hostengine running mode + // 1. ixdcgm.Embedded + // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" + // 3. ixdcgm.StartHostengine + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) + if err != nil { + log.Panicln(err) + } + defer func() { + cleanup() + }() + + ctx, done := context.WithCancel(context.Background()) + // Handle SIGINT (Ctrl+C) and SIGTERM (termination signal) + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + + go func() { + <-sigs + fmt.Println("Received termination signal, exiting...") + done() + }() + + // Create policy condition parameters to register violation callback. + // Note: at least one policy must be enabled. + params := &ixdcgm.PolicyConditionParams{ + DbePolicyEnabled: true, + PCIePolicyEnabled: true, + ThermalPolicyEnabled: true, + ThermalPolicyThreshold: 60, // °C + } + + // Monitor policy violations for all GPUs + ch, err := ixdcgm.ListenForPolicyViolationsForAllGPUs(ctx, params) + + // If you want to monitor policy violations for particular GPUs (e.g., gpuId0 and gpuId1), + // use the following code: + // ch, err := ixdcgm.ListenForPolicyViolationsForGPUs(ctx, params, 0, 1) + + if err != nil { + fmt.Printf("Failed to monitor policy violations, err: %v", err) + return + } + + for { + select { + case pe := <-ch: + fmt.Printf("PolicyViolation : %v\nTimestamp : %v\nData : %v\n", + pe.Condition, pe.Timestamp, pe.Data) + case <-ctx.Done(): + // Sleep to ensure the ixdcgm policy is unregistered before cleanup. 
+ time.Sleep(1 * time.Second) + return + } + } +} diff --git a/samples/topology/main.go b/samples/topology/main.go new file mode 100644 index 0000000000000000000000000000000000000000..df082bfc8fa2b0891b6e6dea59b62817e675bc40 --- /dev/null +++ b/samples/topology/main.go @@ -0,0 +1,73 @@ +package main + +import ( + "fmt" + "log" + "strconv" + + "gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm" +) + +const ( + legend = ` +Legend: + X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + INTE = Connection traversing at most a single on-board PCIe bridge + IX# = Connection traversing a bonded set of # IXLinks` +) + +// Based on topo commands of ixdcgmi and ixsmi +func main() { + // Choose ixdcgm hostengine running mode + // 1. ixdcgm.Embedded + // 2. ixdcgm.Standalone -connect "addr", -socket "isSocket" + // 3. 
ixdcgm.StartHostengine + cleanup, err := ixdcgm.Init(ixdcgm.Embedded) + if err != nil { + log.Panicln(err) + } + defer cleanup() + + gpus, err := ixdcgm.GetSupportedDevices() + if err != nil { + log.Panicln(err) + } + + fmt.Printf("%-8s", "") + for _, gpu := range gpus { + fmt.Printf("%-8s", "GPU"+strconv.Itoa(int(gpu))) + } + fmt.Printf("%-16s", "CPU Affinity") + fmt.Printf("%-16s\n", "NUMA Affinity") + + numGpus := len(gpus) + gpuTopo := make([]string, numGpus) + for i := 0; i < numGpus; i++ { + topo, err := ixdcgm.GetDeviceTopology(gpus[i]) + if err != nil { + log.Panicln(err) + } + + fmt.Printf("%-8s", "GPU"+strconv.Itoa(int(gpus[i]))) + for j := 0; j < len(topo); j++ { + // skip current GPU + gpuTopo[topo[j].GPU] = topo[j].Link.PCIPaths() + } + gpuTopo[i] = " X " + for j := 0; j < numGpus; j++ { + fmt.Printf("%-8s", gpuTopo[j]) + } + deviceInfo, err := ixdcgm.GetDeviceInfo(gpus[i]) + if err != nil { + log.Panicln(err) + } + fmt.Printf("%-16s", deviceInfo.CPUAffinity) + fmt.Printf("%-16s\n", deviceInfo.NUMAAffinity) + } + fmt.Println(legend) +} diff --git a/vendor/github.com/creasty/defaults/.gitignore b/vendor/github.com/creasty/defaults/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e43b0f988953ae3a84b00331d0ccf5f7d51cb3cf --- /dev/null +++ b/vendor/github.com/creasty/defaults/.gitignore @@ -0,0 +1 @@ +.DS_Store diff --git a/vendor/github.com/creasty/defaults/LICENSE b/vendor/github.com/creasty/defaults/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..1483dd2d83e77325a4306cb20075b3594a9dd79e --- /dev/null +++ b/vendor/github.com/creasty/defaults/LICENSE @@ -0,0 +1,22 @@ +Copyright (c) 2017-present Yuki Iwanaga + +MIT License + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, 
+distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/vendor/github.com/creasty/defaults/Makefile b/vendor/github.com/creasty/defaults/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..404212a2e5bda82946ee80064cd6657aeb3e1a0a --- /dev/null +++ b/vendor/github.com/creasty/defaults/Makefile @@ -0,0 +1,29 @@ +SHELL := /bin/bash -eu -o pipefail + +GO_TEST_FLAGS := -v + +PACKAGE_DIRS := $(shell go list ./... 2> /dev/null | grep -v /vendor/) +SRC_FILES := $(shell find . 
-name '*.go' -not -path './vendor/*') + + +# Tasks +#----------------------------------------------- +.PHONY: lint +lint: + @gofmt -e -d -s $(SRC_FILES) | awk '{ e = 1; print $0 } END { if (e) exit(1) }' + @golangci-lint --disable errcheck,unused run + +.PHONY: test +test: lint + @go test $(GO_TEST_FLAGS) $(PACKAGE_DIRS) + +.PHONY: ci-test +ci-test: lint + @echo > coverage.txt + @for d in $(PACKAGE_DIRS); do \ + go test -coverprofile=profile.out -covermode=atomic -race -v $$d; \ + if [ -f profile.out ]; then \ + cat profile.out >> coverage.txt; \ + rm profile.out; \ + fi; \ + done diff --git a/vendor/github.com/creasty/defaults/README.md b/vendor/github.com/creasty/defaults/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e8096b6db903f4be84ce1acfc6a1e6f206fbc8c9 --- /dev/null +++ b/vendor/github.com/creasty/defaults/README.md @@ -0,0 +1,160 @@ +defaults +======== + +[![CircleCI](https://circleci.com/gh/creasty/defaults/tree/master.svg?style=svg)](https://circleci.com/gh/creasty/defaults/tree/master) +[![codecov](https://codecov.io/gh/creasty/defaults/branch/master/graph/badge.svg)](https://codecov.io/gh/creasty/defaults) +[![GitHub release](https://img.shields.io/github/release/creasty/defaults.svg)](https://github.com/creasty/defaults/releases) +[![License](https://img.shields.io/github/license/creasty/defaults.svg)](./LICENSE) + +Initialize structs with default values + +- Supports almost all kind of types + - Scalar types + - `int/8/16/32/64`, `uint/8/16/32/64`, `float32/64` + - `uintptr`, `bool`, `string` + - Complex types + - `map`, `slice`, `struct` + - Nested types + - `map[K1]map[K2]Struct`, `[]map[K1]Struct[]` + - Aliased types + - `time.Duration` + - e.g., `type Enum string` + - Pointer types + - e.g., `*SampleStruct`, `*int` +- Recursively initializes fields in a struct +- Dynamically sets default values by [`defaults.Setter`](./setter.go) interface +- Preserves non-initial values from being reset with a default value + + 
+Usage +----- + +```go +package main + +import ( + "encoding/json" + "fmt" + "math/rand" + + "github.com/creasty/defaults" +) + +type Gender string + +type Sample struct { + Name string `default:"John Smith"` + Age int `default:"27"` + Gender Gender `default:"m"` + Working bool `default:"true"` + + SliceInt []int `default:"[1, 2, 3]"` + SlicePtr []*int `default:"[1, 2, 3]"` + SliceString []string `default:"[\"a\", \"b\"]"` + + MapNull map[string]int `default:"{}"` + Map map[string]int `default:"{\"key1\": 123}"` + MapOfStruct map[string]OtherStruct `default:"{\"Key2\": {\"Foo\":123}}"` + MapOfPtrStruct map[string]*OtherStruct `default:"{\"Key3\": {\"Foo\":123}}"` + MapOfStructWithTag map[string]OtherStruct `default:"{\"Key4\": {\"Foo\":123}}"` + + Struct OtherStruct `default:"{\"Foo\": 123}"` + StructPtr *OtherStruct `default:"{\"Foo\": 123}"` + + NoTag OtherStruct // Recurses into a nested struct by default + NoOption OtherStruct `default:"-"` // no option +} + +type OtherStruct struct { + Hello string `default:"world"` // Tags in a nested struct also work + Foo int `default:"-"` + Random int `default:"-"` +} + +// SetDefaults implements defaults.Setter interface +func (s *OtherStruct) SetDefaults() { + if defaults.CanUpdate(s.Random) { // Check if it's a zero value (recommended) + s.Random = rand.Int() // Set a dynamic value + } +} + +func main() { + obj := &Sample{} + if err := defaults.Set(obj); err != nil { + panic(err) + } + + out, err := json.MarshalIndent(obj, "", " ") + if err != nil { + panic(err) + } + fmt.Println(string(out)) + + // Output: + // { + // "Name": "John Smith", + // "Age": 27, + // "Gender": "m", + // "Working": true, + // "SliceInt": [ + // 1, + // 2, + // 3 + // ], + // "SlicePtr": [ + // 1, + // 2, + // 3 + // ], + // "SliceString": [ + // "a", + // "b" + // ], + // "MapNull": {}, + // "Map": { + // "key1": 123 + // }, + // "MapOfStruct": { + // "Key2": { + // "Hello": "world", + // "Foo": 123, + // "Random": 5577006791947779410 + // } + 
// }, + // "MapOfPtrStruct": { + // "Key3": { + // "Hello": "world", + // "Foo": 123, + // "Random": 8674665223082153551 + // } + // }, + // "MapOfStructWithTag": { + // "Key4": { + // "Hello": "world", + // "Foo": 123, + // "Random": 6129484611666145821 + // } + // }, + // "Struct": { + // "Hello": "world", + // "Foo": 123, + // "Random": 4037200794235010051 + // }, + // "StructPtr": { + // "Hello": "world", + // "Foo": 123, + // "Random": 3916589616287113937 + // }, + // "NoTag": { + // "Hello": "world", + // "Foo": 0, + // "Random": 6334824724549167320 + // }, + // "NoOption": { + // "Hello": "", + // "Foo": 0, + // "Random": 0 + // } + // } +} +``` diff --git a/vendor/github.com/creasty/defaults/defaults.go b/vendor/github.com/creasty/defaults/defaults.go new file mode 100644 index 0000000000000000000000000000000000000000..f4539281a44cc0647dc52319f9566f0592079c5b --- /dev/null +++ b/vendor/github.com/creasty/defaults/defaults.go @@ -0,0 +1,244 @@ +package defaults + +import ( + "encoding" + "encoding/json" + "errors" + "reflect" + "strconv" + "time" +) + +var ( + errInvalidType = errors.New("not a struct pointer") +) + +const ( + fieldName = "default" +) + +// Set initializes members in a struct referenced by a pointer. +// Maps and slices are initialized by `make` and other primitive types are set with default values. +// `ptr` should be a struct pointer +func Set(ptr interface{}) error { + if reflect.TypeOf(ptr).Kind() != reflect.Ptr { + return errInvalidType + } + + v := reflect.ValueOf(ptr).Elem() + t := v.Type() + + if t.Kind() != reflect.Struct { + return errInvalidType + } + + for i := 0; i < t.NumField(); i++ { + if defaultVal := t.Field(i).Tag.Get(fieldName); defaultVal != "-" { + if err := setField(v.Field(i), defaultVal); err != nil { + return err + } + } + } + callSetter(ptr) + return nil +} + +// MustSet function is a wrapper of Set function +// It will call Set and panic if err not equals nil. 
+func MustSet(ptr interface{}) { + if err := Set(ptr); err != nil { + panic(err) + } +} + +func setField(field reflect.Value, defaultVal string) error { + if !field.CanSet() { + return nil + } + + if !shouldInitializeField(field, defaultVal) { + return nil + } + + isInitial := isInitialValue(field) + if isInitial { + if unmarshalByInterface(field, defaultVal) { + return nil + } + + switch field.Kind() { + case reflect.Bool: + if val, err := strconv.ParseBool(defaultVal); err == nil { + field.Set(reflect.ValueOf(val).Convert(field.Type())) + } + case reflect.Int: + if val, err := strconv.ParseInt(defaultVal, 0, strconv.IntSize); err == nil { + field.Set(reflect.ValueOf(int(val)).Convert(field.Type())) + } + case reflect.Int8: + if val, err := strconv.ParseInt(defaultVal, 0, 8); err == nil { + field.Set(reflect.ValueOf(int8(val)).Convert(field.Type())) + } + case reflect.Int16: + if val, err := strconv.ParseInt(defaultVal, 0, 16); err == nil { + field.Set(reflect.ValueOf(int16(val)).Convert(field.Type())) + } + case reflect.Int32: + if val, err := strconv.ParseInt(defaultVal, 0, 32); err == nil { + field.Set(reflect.ValueOf(int32(val)).Convert(field.Type())) + } + case reflect.Int64: + if val, err := time.ParseDuration(defaultVal); err == nil { + field.Set(reflect.ValueOf(val).Convert(field.Type())) + } else if val, err := strconv.ParseInt(defaultVal, 0, 64); err == nil { + field.Set(reflect.ValueOf(val).Convert(field.Type())) + } + case reflect.Uint: + if val, err := strconv.ParseUint(defaultVal, 0, strconv.IntSize); err == nil { + field.Set(reflect.ValueOf(uint(val)).Convert(field.Type())) + } + case reflect.Uint8: + if val, err := strconv.ParseUint(defaultVal, 0, 8); err == nil { + field.Set(reflect.ValueOf(uint8(val)).Convert(field.Type())) + } + case reflect.Uint16: + if val, err := strconv.ParseUint(defaultVal, 0, 16); err == nil { + field.Set(reflect.ValueOf(uint16(val)).Convert(field.Type())) + } + case reflect.Uint32: + if val, err := 
strconv.ParseUint(defaultVal, 0, 32); err == nil { + field.Set(reflect.ValueOf(uint32(val)).Convert(field.Type())) + } + case reflect.Uint64: + if val, err := strconv.ParseUint(defaultVal, 0, 64); err == nil { + field.Set(reflect.ValueOf(val).Convert(field.Type())) + } + case reflect.Uintptr: + if val, err := strconv.ParseUint(defaultVal, 0, strconv.IntSize); err == nil { + field.Set(reflect.ValueOf(uintptr(val)).Convert(field.Type())) + } + case reflect.Float32: + if val, err := strconv.ParseFloat(defaultVal, 32); err == nil { + field.Set(reflect.ValueOf(float32(val)).Convert(field.Type())) + } + case reflect.Float64: + if val, err := strconv.ParseFloat(defaultVal, 64); err == nil { + field.Set(reflect.ValueOf(val).Convert(field.Type())) + } + case reflect.String: + field.Set(reflect.ValueOf(defaultVal).Convert(field.Type())) + + case reflect.Slice: + ref := reflect.New(field.Type()) + ref.Elem().Set(reflect.MakeSlice(field.Type(), 0, 0)) + if defaultVal != "" && defaultVal != "[]" { + if err := json.Unmarshal([]byte(defaultVal), ref.Interface()); err != nil { + return err + } + } + field.Set(ref.Elem().Convert(field.Type())) + case reflect.Map: + ref := reflect.New(field.Type()) + ref.Elem().Set(reflect.MakeMap(field.Type())) + if defaultVal != "" && defaultVal != "{}" { + if err := json.Unmarshal([]byte(defaultVal), ref.Interface()); err != nil { + return err + } + } + field.Set(ref.Elem().Convert(field.Type())) + case reflect.Struct: + if defaultVal != "" && defaultVal != "{}" { + if err := json.Unmarshal([]byte(defaultVal), field.Addr().Interface()); err != nil { + return err + } + } + case reflect.Ptr: + field.Set(reflect.New(field.Type().Elem())) + } + } + + switch field.Kind() { + case reflect.Ptr: + if isInitial || field.Elem().Kind() == reflect.Struct { + setField(field.Elem(), defaultVal) + callSetter(field.Interface()) + } + case reflect.Struct: + if err := Set(field.Addr().Interface()); err != nil { + return err + } + case reflect.Slice: + for j := 0; 
j < field.Len(); j++ { + if err := setField(field.Index(j), ""); err != nil { + return err + } + } + case reflect.Map: + for _, e := range field.MapKeys() { + var v = field.MapIndex(e) + + switch v.Kind() { + case reflect.Ptr: + switch v.Elem().Kind() { + case reflect.Struct, reflect.Slice, reflect.Map: + if err := setField(v.Elem(), ""); err != nil { + return err + } + } + case reflect.Struct, reflect.Slice, reflect.Map: + ref := reflect.New(v.Type()) + ref.Elem().Set(v) + if err := setField(ref.Elem(), ""); err != nil { + return err + } + field.SetMapIndex(e, ref.Elem().Convert(v.Type())) + } + } + } + + return nil +} + +func unmarshalByInterface(field reflect.Value, defaultVal string) bool { + asText, ok := field.Addr().Interface().(encoding.TextUnmarshaler) + if ok && defaultVal != "" { + // if field implements encode.TextUnmarshaler, try to use it before decode by kind + if err := asText.UnmarshalText([]byte(defaultVal)); err == nil { + return true + } + } + asJSON, ok := field.Addr().Interface().(json.Unmarshaler) + if ok && defaultVal != "" && defaultVal != "{}" && defaultVal != "[]" { + // if field implements json.Unmarshaler, try to use it before decode by kind + if err := asJSON.UnmarshalJSON([]byte(defaultVal)); err == nil { + return true + } + } + return false +} + +func isInitialValue(field reflect.Value) bool { + return reflect.DeepEqual(reflect.Zero(field.Type()).Interface(), field.Interface()) +} + +func shouldInitializeField(field reflect.Value, tag string) bool { + switch field.Kind() { + case reflect.Struct: + return true + case reflect.Ptr: + if !field.IsNil() && field.Elem().Kind() == reflect.Struct { + return true + } + case reflect.Slice: + return field.Len() > 0 || tag != "" + case reflect.Map: + return field.Len() > 0 || tag != "" + } + + return tag != "" +} + +// CanUpdate returns true when the given value is an initial value of its type +func CanUpdate(v interface{}) bool { + return isInitialValue(reflect.ValueOf(v)) +} diff --git 
a/vendor/github.com/creasty/defaults/setter.go b/vendor/github.com/creasty/defaults/setter.go new file mode 100644 index 0000000000000000000000000000000000000000..1f64aa65993be7fccf57beff3569ea8b7bff5367 --- /dev/null +++ b/vendor/github.com/creasty/defaults/setter.go @@ -0,0 +1,12 @@ +package defaults + +// Setter is an interface for setting default values +type Setter interface { + SetDefaults() +} + +func callSetter(v interface{}) { + if ds, ok := v.(Setter); ok { + ds.SetDefaults() + } +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 0cd694adf1fb0943a2c6746ffb7a270a2f09f0d1..e3036d188bd8e7906fd224996953659386467926 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,3 +1,6 @@ # github.com/bits-and-blooms/bitset v1.13.0 ## explicit; go 1.16 github.com/bits-and-blooms/bitset +# github.com/creasty/defaults v1.8.0 +## explicit; go 1.14 +github.com/creasty/defaults