diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..53160138a755ed704e81ecb3b820b0b6aa5ce00a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*.out
+*.log
+
+# MacOS
+.DS_Store
+
+# IDE
+.idea/
+.vscode/
+*.code-workspace
\ No newline at end of file
diff --git a/README.md b/README.md
index 76bfe268726c56161fbefb7de8e8c765d1c6a8f0..bb8924bedadaf16ff1a682e59e2c4ebd983830be 100644
--- a/README.md
+++ b/README.md
@@ -2,25 +2,25 @@
 
 ## Introduction
 
-IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more. Go-IXDCGM is a wrapper library for IXDCGM written in Go language, providing a simple set of functions that facilitate the easy invocation of IXDCGM's APIs.
+IXDCGM is a tool provided for monitoring and managing IX GPUs, offering a rich set of APIs to retrieve information about GPU status, performance, power consumption, and more.   
+Go-ixdcgm is a wrapper library for ixdcgm written in the Go language, providing a simple set of functions that make it easy to invoke ixdcgm's APIs.
 
 ## Install
 
-The installation of Go-IXDCGM is very simple, just execute the following command in the command line：
+Installing go-ixdcgm is very simple; just execute the following command in the command line:
 
 ```bash
 go get gitee.com/deep-spark/go-ixdcgm
 ```
 
-## Samples
+## Sample
 
-An example of go-ixdcgm for device-info is under:
+A simple example of using go-ixdcgm to get device info is shown below:
 
 ```go
 package main
 
 import (
-	"flag"
 	"fmt"
 	"html/template"
 	"log"
@@ -31,7 +31,7 @@ import (
 
 const (
 	deviceInfo = `Driver Version         : {{.Identifiers.DriverVersion}}
-GPUId		       : {{.GPUId}}
+GPUId                  : {{.GPUId}}
 IxDCGMSupported        : {{.IxDCGMSupported}}
 Uuid                   : {{.Uuid}}
 Product Name           : {{.Identifiers.ProductName}}
@@ -47,18 +47,12 @@ PowerLimit (W)         : {{or .PowerLimit "N/A"}}
 `
 )
 
-var (
-	connectAddr = flag.String("connectAddr", "0.0.0.0:5777", "DCGM connect address")
-	isSocket    = flag.String("socket", "0", "Connect to Unix socket")
-)
-
 func main() {
-	// choose ixdcgm hostengine running mode
+	// Choose ixdcgm hostengine running mode
 	// 1. ixdcgm.Embedded
 	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
 	// 3. ixdcgm.StartHostengine
-	flag.Parse()
-	cleanup, err := ixdcgm.Init(ixdcgm.Standalone, *connectAddr, *isSocket)
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
 	if err != nil {
 		panic(err)
 	}
@@ -84,6 +78,136 @@ func main() {
 }
 ```
 
+## More Samples
+
+The `samples` folder contains more simple examples of how to use go-ixdcgm to call the ixdcgm API.
+
+To get device information, run the following command:
+```
+$ go run samples/deviceinfo/main.go
+
+# sample output
+
+Driver Version         : 4.2.0
+GPUId                  : 1
+IxDCGMSupported        : Y
+Uuid                   : GPU-6d2ec5fa-f293-57a3-9f2c-335f78120578
+Product Name           : Iluvatar BI-V150S
+Serial Number          : 24120026944896
+Bus ID                 : 00000000:8A:00.0
+BAR1 (MB)              : N/A
+Total Memory (MB):     : 32768
+Used Memory (MB):      : 25500
+Free Memory (MB):      : 7268
+Bandwidth (MB/s)       : 31504
+PowerLimit (W)         : 205
+CPUAffinity            : 20-39,60-79
+NUMAAffinity           : 1
+P2P Available          :
+    GPU0 - (BusID)00000000:8A:00.0 - SYS
+    GPU2 - (BusID)00000000:8A:00.0 - INTE
+--------------------------------------------------	
+```
+
+To get device status, run the following command:
+```
+$ go run samples/devicestatus/main.go
+
+# sample output
+
+GPUId                  : 1
+Power Usage (W)        : 150.000
+Temperature (°C)       : 68
+FanSpeed (%)           : N/A
+Utilization.GPU (%)    : 85
+Utilization.Mem (%)    : 78
+Clocks.Cores (MHz)     : 1750
+Clocks.Mem (MHz)       : 1600
+EccSdbVolDev           : 0
+EccDdbVolDev           : 0
+PCI.Tx (MB/s)          : 107
+PCI.Rx (MB/s)          : 92544
+PCI.ReplayCounter      : 0
+Total Memory (MB)      : 32768
+Used Memory (MB)       : 25500
+Free Memory (MB)       : 7268
+SmActive               : 0.792
+SmOccupancy            : 0.222
+DramActive             : 0.622
+-------------------------------------------
+```
+
+To get information about the processes running on a device, run the following command:
+```
+$ go run samples/deviceprocessinfo/main.go
+
+# sample output
+
+Get the running process infos of gpu 1
+> Pid: 4009629
+  Name: ./gemm_perf --i 1 --d 0 --m 1024 --l 2000
+  UsedGpuMemory(MiB): 128
+```
+
+To monitor device health iteratively, run the following command:
+```
+$ go run samples/health/main.go
+
+# sample output
+
+GPU                : 0
+Status             : Healthy
+
+GPU                : 1
+Status             : Healthy
+
+GPU                : 2
+Status             : Healthy
+
+...
+```
+`Note`: Press Ctrl+C to stop the iteration output.  
+
+To find the topology of GPUs on the system, run the following command:
+```
+$ go run samples/topology/main.go
+
+# sample output
+
+        GPU0    GPU1    GPU2    CPU Affinity    NUMA Affinity
+GPU0     X      SYS     SYS     0-19,40-59      0
+GPU1    SYS      X      INTE    20-39,60-79     1
+GPU2    SYS     INTE     X      20-39,60-79     1
+
+Legend:
+  X    = Self
+  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
+  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
+  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
+  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
+  PIX  = Connection traversing at most a single PCIe bridge
+  INTE = Connection traversing at most a single on-board PCIe bridge
+  IX#  = Connection traversing a bonded set of # IXLinks
+```
+
+To set violation policy and monitor policy violations iteratively, run the following command:
+```
+$ go run samples/policy/main.go
+
+# sample output
+
+2025/02/25 17:05:22 Policy successfully set.
+2025/02/25 17:05:22 Listening for violations...
+PolicyViolation : Thermal Limit
+Timestamp       : 2025-02-25 17:05:42 +0800 CST
+Data            : {61}
+PolicyViolation : Thermal Limit
+Timestamp       : 2025-02-25 17:05:42 +0800 CST
+Data            : {61}
+...
+```
+`Note`: Press Ctrl+C to stop the iteration output.
+
 ## License
 
 Copyright (c) 2024 Iluvatar CoreX. All rights reserved. This project has an Apache-2.0 license, as
diff --git a/go.mod b/go.mod
index 6c0344ffb7787a2683ccd7d65aaca0f7c9217508..6522fb6262166d835d4612d64ccdee68863648b5 100644
--- a/go.mod
+++ b/go.mod
@@ -2,4 +2,7 @@ module gitee.com/deep-spark/go-ixdcgm
 
 go 1.21.6
 
-require github.com/bits-and-blooms/bitset v1.13.0
+require (
+	github.com/bits-and-blooms/bitset v1.13.0
+	github.com/creasty/defaults v1.8.0
+)
diff --git a/go.sum b/go.sum
index b389503f487a31a26b40b2d7d90890d7ba43f961..31326c97cdd36829955a8ea6306ea34a28e23d0d 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,4 @@
 github.com/bits-and-blooms/bitset v1.13.0 h1:bAQ9OPNFYbGHV6Nez0tmNI0RiEu7/hxlYJRUA0wFAVE=
 github.com/bits-and-blooms/bitset v1.13.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
+github.com/creasty/defaults v1.8.0 h1:z27FJxCAa0JKt3utc0sCImAEb+spPucmKoOdLHvHYKk=
+github.com/creasty/defaults v1.8.0/go.mod h1:iGzKe6pbEHnpMPtfDXZEr0NVxWnPTjb1bbDy08fPzYM=
diff --git a/pkg/ixdcgm/api.go b/pkg/ixdcgm/api.go
index 86d09e2fdb77bc40792b811e85a5e2207628fc40..0a1e2aa85892b6dc06f5566e6f29f043a02e5e03 100644
--- a/pkg/ixdcgm/api.go
+++ b/pkg/ixdcgm/api.go
@@ -27,6 +27,7 @@ package ixdcgm
 */
 import "C"
 import (
+	"context"
 	"fmt"
 	"sync"
 	"unsafe"
@@ -38,16 +39,16 @@ var (
 	ixdcgmLibHandler  unsafe.Pointer
 	ixdcgmInitCounter int
 	mux               sync.Mutex
-	connectionsMode   Interface
+	connection        Interface
 	handle            DcgmHandle
 )
 
-// to do path
+// file name of the ixdcgm dynamic library to load
 const (
 	ixdcgmLib = "libixdcgm.so"
 )
 
-func initIxDcgm(m int, args ...string) (err error) {
+func initIxDcgm(m int) (err error) {
 	lib := string2Char(ixdcgmLib)
 	defer freeCString(lib)
 
@@ -56,7 +57,7 @@ func initIxDcgm(m int, args ...string) (err error) {
 		return fmt.Errorf("failed to load %s", ixdcgmLib)
 	}
 
-	connectionsMode, err = New(m)
+	connection, err = New(m)
 	if err != nil {
 		return err
 	}
@@ -70,12 +71,12 @@ func Init(m int, args ...string) (cleanup func(), err error) {
 		return nil, fmt.Errorf("ixdcgm already initialized %d", ixdcgmInitCounter)
 	}
 	if ixdcgmInitCounter == 0 {
-		err = initIxDcgm(m, args...)
+		err = initIxDcgm(m)
 		if err != nil {
 			return nil, err
 		}
 
-		handle, err = connectionsMode.Start(args...)
+		handle, err = connection.Start(args...)
 		if err != nil {
 			return nil, err
 		}
@@ -98,7 +99,7 @@ func shutdown() (err error) {
 	}
 
 	if ixdcgmInitCounter == 1 {
-		err = connectionsMode.Shutdown()
+		err = connection.Shutdown()
 		if err != nil {
 			return err
 		}
@@ -113,13 +114,13 @@ func GetAllDeviceCount() (uint, error) {
 	return getAllDeviceCount()
 }
 
-func GetDeviceInfo(gpuId uint) (DeviceInfo, error) {
-	return getDeviceInfo(gpuId)
+func GetSupportedDevices() ([]uint, error) {
+	return getSupportedDevices()
 }
 
 // GetDeviceInfo describes the given device
-func GetSupportedDevices() ([]uint, error) {
-	return getSupportedDevices()
+func GetDeviceInfo(gpuId uint) (DeviceInfo, error) {
+	return getDeviceInfo(gpuId)
 }
 
 // GetDeviceStatus monitors GPU status including its power, memory and GPU utilization
@@ -137,6 +138,28 @@ func GetDeviceRunningProcesses(gpuId uint) ([]DeviceProcessInfo, error) {
 	return getDeviceRunningProcesses(gpuId)
 }
 
+// GetDeviceOnSameBoard checks whether the two GPUs are on the same board
 func GetDeviceOnSameBoard(gpuId1, gpuId2 uint) (bool, error) {
 	return getDeviceOnSameBoard(gpuId1, gpuId2)
 }
+
+// HealthCheckByGpuId monitors GPU health for any errors/failures/warnings
+func HealthCheckByGpuId(gpuId uint) (DeviceHealth, error) {
+	return healthCheckByGpuId(gpuId)
+}
+
+// GetDeviceTopology returns device topology corresponding to the gpuId
+func GetDeviceTopology(gpuId uint) ([]P2PLink, error) {
+	return getDeviceTopology(gpuId)
+}
+
+// ListenForPolicyViolationsForAllGPUs sets GPU usage and error policies and notifies in case of any violations on all GPUs
+func ListenForPolicyViolationsForAllGPUs(ctx context.Context, params *PolicyConditionParams) (<-chan PolicyViolation, error) {
+	groupId := GroupAllGPUs()
+	return registerPolicy(ctx, groupId, params)
+}
+
+// ListenForPolicyViolationsForGPUs sets GPU usage and error policies and notifies in case of any violations on the specified GPUs
+func ListenForPolicyViolationsForGPUs(ctx context.Context, params *PolicyConditionParams, gpuIds ...uint) (<-chan PolicyViolation, error) {
+	return registerPolicyForGpus(ctx, params, gpuIds...)
+}
diff --git a/pkg/ixdcgm/callback.c b/pkg/ixdcgm/callback.c
new file mode 100644
index 0000000000000000000000000000000000000000..596c1a0d653cc6601206b105973480a009c6d80d
--- /dev/null
+++ b/pkg/ixdcgm/callback.c
@@ -0,0 +1,11 @@
+int violationNotify(void *p)
+{
+    int ViolationRegistration(void *);
+    return ViolationRegistration(p);
+}
+
+int voidCallback(void *p)
+{
+    int VoidCallback(void *);
+    return VoidCallback(p);
+}
\ No newline at end of file
diff --git a/pkg/ixdcgm/const.go b/pkg/ixdcgm/const.go
index a6eea2e7513c7294c4ae8c71b78994e87a6d66d3..5eafe6b86c7b3cc0afddea5f368505ef511f3761 100644
--- a/pkg/ixdcgm/const.go
+++ b/pkg/ixdcgm/const.go
@@ -130,14 +130,26 @@ const (
 	DCGM_FI_DEV_ECC_DBE_VOL_DEV             Short = C.DCGM_FI_DEV_ECC_DBE_VOL_DEV
 	DCGM_FI_INTERNAL_FIELDS_0_START         Short = C.DCGM_FI_INTERNAL_FIELDS_0_START
 	DCGM_FI_INTERNAL_FIELDS_0_END           Short = C.DCGM_FI_INTERNAL_FIELDS_0_END
-
-	DCGM_FI_PROF_GR_ENGINE_ACTIVE   Short = C.DCGM_FI_PROF_GR_ENGINE_ACTIVE
-	DCGM_FI_PROF_SM_ACTIVE          Short = C.DCGM_FI_PROF_SM_ACTIVE
-	DCGM_FI_PROF_SM_OCCUPANCY       Short = C.DCGM_FI_PROF_SM_OCCUPANCY
-	DCGM_FI_PROF_PIPE_TENSOR_ACTIVE Short = C.DCGM_FI_PROF_PIPE_TENSOR_ACTIVE
-	DCGM_FI_PROF_DRAM_ACTIVE        Short = C.DCGM_FI_PROF_DRAM_ACTIVE
-	DCGM_FI_PROF_PCIE_TX_BYTES      Short = C.DCGM_FI_PROF_PCIE_TX_BYTES
-	DCGM_FI_PROF_PCIE_RX_BYTES      Short = C.DCGM_FI_PROF_PCIE_RX_BYTES
+	DCGM_FI_PROF_SM_ACTIVE                  Short = C.DCGM_FI_PROF_SM_ACTIVE
+	DCGM_FI_PROF_SM_OCCUPANCY               Short = C.DCGM_FI_PROF_SM_OCCUPANCY
+	DCGM_FI_PROF_DRAM_ACTIVE                Short = C.DCGM_FI_PROF_DRAM_ACTIVE
 
 	DCGM_FI_MAX_FIELDS Short = C.DCGM_FI_MAX_FIELDS
 )
+
+type HealthSystem uint
+
+const (
+	DCGM_HEALTH_WATCH_PCIE    HealthSystem = 0x1
+	DCGM_HEALTH_WATCH_THERMAL HealthSystem = 0x80
+	DCGM_HEALTH_WATCH_POWER   HealthSystem = 0x100
+	DCGM_HEALTH_WATCH_ALL     HealthSystem = 0xFFFFFFFF // All watches; only the PCIe watch is supported for now
+)
+
+type HealthResult uint
+
+const (
+	DCGM_HEALTH_RESULT_PASS HealthResult = 0  // All results within this system are reporting normal
+	DCGM_HEALTH_RESULT_WARN HealthResult = 10 // A warning has been issued, refer to the response for more information
+	DCGM_HEALTH_RESULT_FAIL HealthResult = 20 // A failure has been issued, refer to the response for more information
+)
diff --git a/pkg/ixdcgm/device_common.go b/pkg/ixdcgm/device_common.go
index 66861a3223d6b81ef68732120116d3d55a9a12cd..cf9d561441a59cf7033408c29252d774417c3158 100644
--- a/pkg/ixdcgm/device_common.go
+++ b/pkg/ixdcgm/device_common.go
@@ -24,13 +24,11 @@ package ixdcgm
 #include "include/ixdcgmApiExport.h"
 */
 import "C"
-import "fmt"
 
 func getDeviceOnSameBoard(gpuId1, gpuId2 uint) (isOnSameBoard bool, err error) {
 	var onSameBoard C.int
-	r := C.ixdcgmDeviceOnSameBoard(C.ulong(handle.handle), C.uint(gpuId1), C.uint(gpuId2), &onSameBoard)
-	fmt.Println(r)
-	if err = ixdcgmErrorString(r); err != nil {
+	ret := C.ixdcgmDeviceOnSameBoard(C.ulong(handle.handle), C.uint(gpuId1), C.uint(gpuId2), &onSameBoard)
+	if err = ixdcgmErrorString(ret); err != nil {
 		return false, err
 	}
 	if onSameBoard == 0 {
diff --git a/pkg/ixdcgm/device_info.go b/pkg/ixdcgm/device_info.go
index 4bde0ac34d7abc92755b42d39e179dd5d393b1bc..c8e16d42c9b1a81b73a31ebfa92b09df6fc83fa2 100644
--- a/pkg/ixdcgm/device_info.go
+++ b/pkg/ixdcgm/device_info.go
@@ -54,10 +54,13 @@ type DeviceInfo struct {
 	GPUId           uint
 	IxDCGMSupported string
 	Uuid            string
-	PowerLimit      uint
+	PowerLimit      uint // W
 	PCI             PciInfo
 	MemoryUsage     MemoryUsageInfo
 	Identifiers     DeviceIdentifier
+	Topology        []P2PLink
+	CPUAffinity     string
+	NUMAAffinity    string
 }
 
 func getAllDeviceCount() (gpuCount uint, err error) {
@@ -131,7 +134,7 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) {
 		return DeviceInfo{}, err
 	}
 
-	// check if the given GPU is DCGM supported
+	// check if the given GPU is IxDCGM supported
 	gpus, err := getSupportedDevices()
 	if err != nil {
 		return DeviceInfo{}, err
@@ -144,17 +147,27 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) {
 			break
 		}
 	}
+
+	cpuAffinity, err := getCPUAffinity(gpuId)
+	if err != nil {
+		return DeviceInfo{}, err
+	}
+	numaAffinity, err := getNUMAAffinity(gpuId)
+	if err != nil {
+		return DeviceInfo{}, err
+	}
+
+	var topology []P2PLink
 	var bandwidth int64
 	if supported == "Y" {
+		topology, err = getDeviceTopology(gpuId)
+		if err != nil {
+			return DeviceInfo{}, err
+		}
 		bandwidth, err = getPciBandwidth(gpuId)
 		if err != nil {
 			return DeviceInfo{}, err
 		}
-
-		// err = getDeviceTopology(gpuId)
-		// if err != nil {
-		// 	return DeviceInfo{}, err
-		// }
 	}
 
 	uuid := cChar2String(&dcgmAttr.identifiers.uuid[0])
@@ -186,6 +199,9 @@ func getDeviceInfo(gpuId uint) (DeviceInfo, error) {
 		PCI:             pci,
 		MemoryUsage:     memInfo,
 		Identifiers:     id,
+		Topology:        topology,
+		CPUAffinity:     cpuAffinity,
+		NUMAAffinity:    numaAffinity,
 	}, nil
 }
 
@@ -204,9 +220,15 @@ func getSupportedDevices() (gpus []uint, err error) {
 	}
 	return
 }
-
 func getCPUAffinity(gpuId uint) (string, error) {
+	return getAffinity(gpuId, "CPU")
+}
 
+func getNUMAAffinity(gpuId uint) (string, error) {
+	return getAffinity(gpuId, "MEM")
+}
+
+func getAffinity(gpuId uint, typ string) (string, error) {
 	const (
 		affinity0 int = iota
 		affinity1
@@ -216,19 +238,29 @@ func getCPUAffinity(gpuId uint) (string, error) {
 	)
 
 	affFields := make([]Short, fieldsCount)
-	affFields[affinity0] = DCGM_FI_DEV_CPU_AFFINITY_0
-	affFields[affinity1] = DCGM_FI_DEV_CPU_AFFINITY_1
-	affFields[affinity2] = DCGM_FI_DEV_CPU_AFFINITY_2
-	affFields[affinity3] = DCGM_FI_DEV_CPU_AFFINITY_3
+	switch typ {
+	case "CPU":
+		affFields[affinity0] = DCGM_FI_DEV_CPU_AFFINITY_0
+		affFields[affinity1] = DCGM_FI_DEV_CPU_AFFINITY_1
+		affFields[affinity2] = DCGM_FI_DEV_CPU_AFFINITY_2
+		affFields[affinity3] = DCGM_FI_DEV_CPU_AFFINITY_3
+	case "MEM":
+		affFields[affinity0] = DCGM_FI_DEV_MEM_AFFINITY_0
+		affFields[affinity1] = DCGM_FI_DEV_MEM_AFFINITY_1
+		affFields[affinity2] = DCGM_FI_DEV_MEM_AFFINITY_2
+		affFields[affinity3] = DCGM_FI_DEV_MEM_AFFINITY_3
+	default:
+		return "N/A", fmt.Errorf("not supported affinity type: %s", typ)
+	}
 
-	fieldGrpName := fmt.Sprintf("cpuAffFields%d", gpuId)
+	fieldGrpName := fmt.Sprintf("affFields%d", gpuId)
 	fieldGrpHdl, err := FieldGroupCreate(fieldGrpName, affFields)
 	if err != nil {
 		return "N/A", err
 	}
 	defer FieldGroupDestroy(fieldGrpHdl)
 
-	gpuGrpName := fmt.Sprintf("cpuAff%d", gpuId)
+	gpuGrpName := fmt.Sprintf("aff%d", gpuId)
 	gpuGrpHdl, err := WatchFields([]uint{gpuId}, fieldGrpHdl, gpuGrpName)
 	if err != nil {
 		return "N/A", err
@@ -247,6 +279,6 @@ func getCPUAffinity(gpuId uint) (string, error) {
 	bits[3] = uint64(values[affinity3].Int64())
 
 	b := bitset.From(bits)
-
-	return b.String(), nil
+	str := convertBitsetStr(b.String())
+	return str, nil
 }
diff --git a/pkg/ixdcgm/embedded.go b/pkg/ixdcgm/embedded.go
index 9829373874be7130a5e74e0ebdb7e1b68b75dc19..09c2a29139ca997feaaaf1ca87797491d8c769c3 100644
--- a/pkg/ixdcgm/embedded.go
+++ b/pkg/ixdcgm/embedded.go
@@ -46,6 +46,8 @@ func (e *embedded) Shutdown() error {
 }
 
 func (e *embedded) Start(args ...string) (DcgmHandle, error) {
+	fmt.Println("Start ixdcgm based on Embedded mode.")
+
 	result := C.dcgmInit()
 	if err := errorString(result); err != nil {
 		return DcgmHandle{}, fmt.Errorf("failed to initialize dcgm: %v", err)
diff --git a/pkg/ixdcgm/fields.go b/pkg/ixdcgm/fields.go
index dec961823d87b7a29e3028a82a03c8b5dac2da3e..e52404e0af2acb1ad84ea9b98f40a0588dec1e24 100644
--- a/pkg/ixdcgm/fields.go
+++ b/pkg/ixdcgm/fields.go
@@ -72,7 +72,7 @@ func WatchFields(gpuIds []uint, fieldGrp FieldGrpHandle, groupName string) (Grou
 		return GroupHandle{}, err
 	}
 	for _, gpuId := range gpuIds {
-		err = AddDevice(group, gpuId)
+		err = AddToGroup(group, gpuId)
 		if err != nil {
 			return GroupHandle{}, err
 		}
@@ -151,3 +151,45 @@ func GetFieldValueStr(fv FieldValue_v1, typ string) string {
 		return "N/A"
 	}
 }
+
+type Field_Entity_Group uint
+
+const (
+	FE_NONE Field_Entity_Group = iota
+	FE_GPU
+	FE_VGPU
+	FE_SWITCH
+	FE_GPU_I
+	FE_GPU_CI
+	FE_LINK
+	FE_CPU
+	FE_CPU_CORE
+	FE_COUNT
+)
+
+type GroupEntityPair struct {
+	EntityGroupId Field_Entity_Group
+	EntityId      uint
+}
+
+func (e Field_Entity_Group) String() string {
+	switch e {
+	case FE_GPU:
+		return "GPU"
+	case FE_VGPU:
+		return "vGPU"
+	case FE_SWITCH:
+		return "Switch"
+	case FE_GPU_I:
+		return "GPU Instance"
+	case FE_GPU_CI:
+		return "GPU Compute Instance"
+	case FE_LINK:
+		return "Link"
+	case FE_CPU:
+		return "CPU"
+	case FE_CPU_CORE:
+		return "CPU Core"
+	}
+	return "unknown"
+}
diff --git a/pkg/ixdcgm/gpu_group.go b/pkg/ixdcgm/gpu_group.go
index 3b00ed1dd01f78f9050165f89e4eae6f92871569..7035594a856118d2e66cbd1a113828a30b539a89 100644
--- a/pkg/ixdcgm/gpu_group.go
+++ b/pkg/ixdcgm/gpu_group.go
@@ -27,6 +27,18 @@ type GroupHandle struct {
 	handle C.dcgmGpuGrp_t
 }
 
+func (g *GroupHandle) SetHandle(val uintptr) {
+	g.handle = C.dcgmGpuGrp_t(val)
+}
+
+func (g *GroupHandle) GetHandle() uintptr {
+	return uintptr(g.handle)
+}
+
+func GroupAllGPUs() GroupHandle {
+	return GroupHandle{C.DCGM_GROUP_ALL_GPUS}
+}
+
 func CreateGroup(groupName string) (GroupHandle, error) {
 	var cGroupId C.dcgmGpuGrp_t
 	cgn := string2Char(groupName)
@@ -40,7 +52,7 @@ func CreateGroup(groupName string) (GroupHandle, error) {
 	return GroupHandle{cGroupId}, nil
 }
 
-func AddDevice(groupId GroupHandle, gpuId uint) error {
+func AddToGroup(groupId GroupHandle, gpuId uint) error {
 	res := C.dcgmGroupAddDevice(handle.handle, groupId.handle, C.uint(gpuId))
 	if err := errorString(res); err != nil {
 		return err
@@ -55,3 +67,34 @@ func DestroyGroup(groupId GroupHandle) error {
 	}
 	return nil
 }
+
+type GroupInfo struct {
+	Version    uint32
+	GroupName  string
+	EntityList []GroupEntityPair
+}
+
+func GetGroupInfo(groupId GroupHandle) (*GroupInfo, error) {
+	response := C.dcgmGroupInfo_v2{
+		version: C.dcgmGroupInfo_version2,
+	}
+
+	result := C.dcgmGroupGetInfo(handle.handle, groupId.handle, &response)
+	if err := errorString(result); err != nil {
+		return nil, err
+	}
+
+	ret := &GroupInfo{
+		Version:   uint32(response.version),
+		GroupName: C.GoString(&response.groupName[0]),
+	}
+
+	for i := 0; i < int(response.count); i++ {
+		ret.EntityList = append(ret.EntityList, GroupEntityPair{
+			EntityId:      uint(response.entityList[i].entityId),
+			EntityGroupId: Field_Entity_Group(response.entityList[i].entityGroupId),
+		})
+	}
+
+	return ret, nil
+}
diff --git a/pkg/ixdcgm/health.go b/pkg/ixdcgm/health.go
new file mode 100644
index 0000000000000000000000000000000000000000..573d6b10d531982e5931d1248621919b7c7731e2
--- /dev/null
+++ b/pkg/ixdcgm/health.go
@@ -0,0 +1,217 @@
+/*
+Copyright (c) 2024, NVIDIA CORPORATION.
+Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package ixdcgm
+
+/*
+#include "include/dcgm_agent.h"
+#include "include/dcgm_structs.h"
+
+#define MAKE_DCGM_VERSION(typeName, ver) (unsigned int)(sizeof(typeName) | ((unsigned long)(ver) << 24U))
+#define dcgmHealthSetParams_version2 MAKE_DCGM_VERSION(dcgmHealthSetParams_v2, 2)
+*/
+import "C"
+
+import (
+	"fmt"
+	"math/rand"
+	"unsafe"
+)
+
+type SystemWatch struct {
+	Type   string
+	Status string
+	Error  string
+}
+
+type DeviceHealth struct {
+	GPU     uint
+	Status  string
+	Watches []SystemWatch
+}
+
+// HealthSet enables the DCGM health check system for the given systems
+func HealthSet(groupId GroupHandle, systems HealthSystem) (err error) {
+	params_v2 := C.dcgmHealthSetParams_v2{
+		version:        C.dcgmHealthSetParams_version2,
+		groupId:        groupId.handle,
+		systems:        C.dcgmHealthSystems_t(systems),
+		updateInterval: C.longlong(int64(100000)), // How often to query the underlying health information from the driver in usecs.
+		maxKeepAge:     C.double(float64(1)),      // How long to keep data cached for this field in seconds.
+	}
+
+	result := C.dcgmHealthSet_v2(handle.handle, &params_v2)
+	if err = errorString(result); err != nil {
+		return fmt.Errorf("error setting health watches: %w", err)
+	}
+	return
+}
+
+// HealthGet retrieves the current state of the DCGM health check system
+func HealthGet(groupId GroupHandle) (HealthSystem, error) {
+	var systems C.dcgmHealthSystems_t
+
+	result := C.dcgmHealthGet(handle.handle, groupId.handle, (*C.dcgmHealthSystems_t)(unsafe.Pointer(&systems)))
+	if err := errorString(result); err != nil {
+		return HealthSystem(0), err
+	}
+	return HealthSystem(systems), nil
+}
+
+type DiagErrorDetail struct {
+	Message string
+	Code    uint // Error code, see include/dcgm_errors.h for more info
+}
+
+type Incident struct {
+	System     HealthSystem
+	Health     HealthResult
+	Error      DiagErrorDetail
+	EntityInfo GroupEntityPair
+}
+
+type HealthResponse struct {
+	OverallHealth HealthResult
+	Incidents     []Incident
+}
+
+// HealthCheck checks the configured watches for any errors/failures/warnings that have occurred
+// since the last time this check was invoked.  On the first call, stateful information
+// about all of the enabled watches within a group is created but no error results are
+// provided. On subsequent calls, any error information will be returned.
+func HealthCheck(groupId GroupHandle) (HealthResponse, error) {
+	var healthResults C.dcgmHealthResponse_v4
+	healthResults.version = makeVersion4(unsafe.Sizeof(healthResults))
+
+	result := C.dcgmHealthCheck(handle.handle, groupId.handle, (*C.dcgmHealthResponse_t)(unsafe.Pointer(&healthResults)))
+
+	if err := errorString(result); err != nil {
+		return HealthResponse{}, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
+	}
+
+	response := HealthResponse{
+		OverallHealth: HealthResult(healthResults.overallHealth),
+	}
+
+	// number of watches that encountered an error/warning
+	incidents := uint(healthResults.incidentCount)
+
+	response.Incidents = make([]Incident, incidents)
+
+	for i := uint(0); i < incidents; i++ {
+		response.Incidents[i] = Incident{
+			System: HealthSystem(healthResults.incidents[i].system),
+			Health: HealthResult(healthResults.incidents[i].health),
+			Error: DiagErrorDetail{
+				Message: *stringPtr(&healthResults.incidents[i].error.msg[0]),
+				Code:    uint(healthResults.incidents[i].error.code),
+			},
+			EntityInfo: GroupEntityPair{
+				EntityGroupId: Field_Entity_Group(healthResults.incidents[i].entityInfo.entityGroupId),
+				EntityId:      uint(healthResults.incidents[i].entityInfo.entityId),
+			},
+		}
+	}
+
+	return response, nil
+}
+
+// healthCheckByGpuId runs a one-shot health check for a single GPU.
+//
+// It creates a temporary group containing gpuId, enables all health watches on
+// it, performs one HealthCheck and converts the response into a DeviceHealth.
+// The temporary group is destroyed before returning, including on error paths.
+func healthCheckByGpuId(gpuId uint) (deviceHealth DeviceHealth, err error) {
+	name := fmt.Sprintf("health%d", rand.Uint64())
+	groupId, err := CreateGroup(name)
+	if err != nil {
+		return
+	}
+	// Destroy the temporary group on every exit path so failures do not leak it.
+	defer func() { _ = DestroyGroup(groupId) }()
+
+	err = AddToGroup(groupId, gpuId)
+	if err != nil {
+		return
+	}
+
+	err = HealthSet(groupId, DCGM_HEALTH_WATCH_ALL)
+	if err != nil {
+		return
+	}
+
+	result, err := HealthCheck(groupId)
+	if err != nil {
+		return
+	}
+
+	// One SystemWatch per incident (watch that reported an error/warning).
+	watches := []SystemWatch{}
+	for _, incident := range result.Incidents {
+		watches = append(watches, SystemWatch{
+			Type:   systemWatch(incident.System),
+			Status: healthStatus(incident.Health),
+			Error:  incident.Error.Message,
+		})
+	}
+
+	deviceHealth = DeviceHealth{
+		GPU:     gpuId,
+		Status:  healthStatus(result.OverallHealth),
+		Watches: watches,
+	}
+	return
+}
+
+func healthStatus(status HealthResult) string {
+	switch status {
+	case 0:
+		return "Healthy"
+	case 10:
+		return "Warning"
+	case 20:
+		return "Failure"
+	}
+	return "N/A"
+}
+
+func systemWatch(watch HealthSystem) string {
+	switch watch {
+	case 1:
+		return "PCIe watches"
+	case 2:
+		return "LINK watches"
+	case 4:
+		return "Power Managemnt unit watches"
+	case 8:
+		return "Microcontroller unit watches"
+	case 16:
+		return "Memory watches"
+	case 32:
+		return "Streaming Multiprocessor watches"
+	case 64:
+		return "Inforom watches"
+	case 128:
+		return "Temperature watches"
+	case 256:
+		return "Power watches"
+	case 512:
+		return "Driver-related watches"
+	}
+	return "N/A"
+}
diff --git a/pkg/ixdcgm/include/dcgm_errors.h b/pkg/ixdcgm/include/dcgm_errors.h
new file mode 100644
index 0000000000000000000000000000000000000000..efd80473348c07e18491fae4af6fea424c0a320b
--- /dev/null
+++ b/pkg/ixdcgm/include/dcgm_errors.h
@@ -0,0 +1,626 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DCGM_ERRORS_H
+#define DCGM_ERRORS_H
+
+#include "dcgm_api_export.h"
+#include "dcgm_structs.h"
+
+/***************************************************************************************************/
+/** @defgroup dcgmErrorEnums Error Codes
+ *  @{
+ */
+/***************************************************************************************************/
+/*
+ * Error codes for passive and active health checks.
+ * New error codes must be added to end of enum to maintain backwards compatibility.
+ */
+typedef enum dcgmError_enum
+{
+    DCGM_FR_OK                         = 0,   //!< 0 No error
+    DCGM_FR_UNKNOWN                    = 1,   //!< 1 Unknown error code
+    DCGM_FR_UNRECOGNIZED               = 2,   //!< 2 Unrecognized error code
+    DCGM_FR_PCI_REPLAY_RATE            = 3,   //!< 3 Unacceptable rate of PCI errors
+    DCGM_FR_VOLATILE_DBE_DETECTED      = 4,   //!< 4 Uncorrectable volatile double bit error
+    DCGM_FR_VOLATILE_SBE_DETECTED      = 5,   //!< 5 Unacceptable rate of volatile single bit errors
+    DCGM_FR_PENDING_PAGE_RETIREMENTS   = 6,   //!< 6 Pending page retirements detected
+    DCGM_FR_RETIRED_PAGES_LIMIT        = 7,   //!< 7 Unacceptable total page retirements detected
+    DCGM_FR_RETIRED_PAGES_DBE_LIMIT    = 8,   //!< 8 Unacceptable total page retirements due to uncorrectable errors
+    DCGM_FR_CORRUPT_INFOROM            = 9,   //!< 9 Corrupt inforom found
+    DCGM_FR_CLOCK_THROTTLE_THERMAL     = 10,  //!< 10 Clocks being throttled due to overheating
+    DCGM_FR_POWER_UNREADABLE           = 11,  //!< 11 Cannot get a reading for power from NVML
+    DCGM_FR_CLOCK_THROTTLE_POWER       = 12,  //!< 12 Clock being throttled due to power restrictions
+    DCGM_FR_NVLINK_ERROR_THRESHOLD     = 13,  //!< 13 Unacceptable rate of NVLink errors
+    DCGM_FR_NVLINK_DOWN                = 14,  //!< 14 NVLink is down
+    DCGM_FR_NVSWITCH_FATAL_ERROR       = 15,  //!< 15 Fatal errors on the NVSwitch
+    DCGM_FR_NVSWITCH_NON_FATAL_ERROR   = 16,  //!< 16 Non-fatal errors on the NVSwitch
+    DCGM_FR_NVSWITCH_DOWN              = 17,  //!< 17 NVSwitch is down - NOT USED: DEPRECATED
+    DCGM_FR_NO_ACCESS_TO_FILE          = 18,  //!< 18 Cannot access a file
+    DCGM_FR_NVML_API                   = 19,  //!< 19 Error occurred on an NVML API - NOT USED: DEPRECATED
+    DCGM_FR_DEVICE_COUNT_MISMATCH      = 20,  //!< 20 Disagreement in GPU count between /dev and NVML
+    DCGM_FR_BAD_PARAMETER              = 21,  //!< 21 Bad parameter passed to API
+    DCGM_FR_CANNOT_OPEN_LIB            = 22,  //!< 22 Cannot open a library that must be accessed
+    DCGM_FR_DENYLISTED_DRIVER          = 23,  //!< 23 A driver on the denylist (nouveau) is active
+    DCGM_FR_NVML_LIB_BAD               = 24,  //!< 24 NVML library is missing expected functions - NOT USED: DEPRECATED
+    DCGM_FR_GRAPHICS_PROCESSES         = 25,  //!< 25 Graphics processes are active on this GPU
+    DCGM_FR_HOSTENGINE_CONN            = 26,  //!< 26 Bad connection to nv-hostengine - NOT USED: DEPRECATED
+    DCGM_FR_FIELD_QUERY                = 27,  //!< 27 Error querying a field from DCGM
+    DCGM_FR_BAD_CUDA_ENV               = 28,  //!< 28 The environment has variables that hurt CUDA
+    DCGM_FR_PERSISTENCE_MODE           = 29,  //!< 29 Persistence mode is disabled
+    DCGM_FR_LOW_BANDWIDTH              = 30,  //!< 30 The bandwidth is unacceptably low
+    DCGM_FR_HIGH_LATENCY               = 31,  //!< 31 Latency is too high
+    DCGM_FR_CANNOT_GET_FIELD_TAG       = 32,  //!< 32 Cannot find a tag for a field
+    DCGM_FR_FIELD_VIOLATION            = 33,  //!< 33 The value for the specified error field is above 0
+    DCGM_FR_FIELD_THRESHOLD            = 34,  //!< 34 The value for the specified field is above the threshold
+    DCGM_FR_FIELD_VIOLATION_DBL        = 35,  //!< 35 Same as DCGM_FR_FIELD_VIOLATION, but the field value is a double
+    DCGM_FR_FIELD_THRESHOLD_DBL        = 36,  //!< 36 Same as DCGM_FR_FIELD_THRESHOLD, but the field value is a double
+    DCGM_FR_UNSUPPORTED_FIELD_TYPE     = 37,  //!< 37 Field type cannot be supported
+    DCGM_FR_FIELD_THRESHOLD_TS         = 38,  //!< 38 The field met or exceeded its per-second threshold; message carries seconds into the test
+    DCGM_FR_FIELD_THRESHOLD_TS_DBL     = 39,  //!< 39 Same as DCGM_FR_FIELD_THRESHOLD_TS, but the values are doubles
+    DCGM_FR_THERMAL_VIOLATIONS         = 40,  //!< 40 Thermal violations detected
+    DCGM_FR_THERMAL_VIOLATIONS_TS      = 41,  //!< 41 Thermal violations detected with a timestamp
+    DCGM_FR_TEMP_VIOLATION             = 42,  //!< 42 Temperature is too high
+    DCGM_FR_THROTTLING_VIOLATION       = 43,  //!< 43 Non-benign clock throttling is occurring
+    DCGM_FR_INTERNAL                   = 44,  //!< 44 An internal error was detected
+    DCGM_FR_PCIE_GENERATION            = 45,  //!< 45 PCIe generation is too low
+    DCGM_FR_PCIE_WIDTH                 = 46,  //!< 46 PCIe width is too low
+    DCGM_FR_ABORTED                    = 47,  //!< 47 Test was aborted by a user signal
+    DCGM_FR_TEST_DISABLED              = 48,  //!< 48 This test is disabled for this GPU
+    DCGM_FR_CANNOT_GET_STAT            = 49,  //!< 49 Cannot get telemetry for a needed value
+    DCGM_FR_STRESS_LEVEL               = 50,  //!< 50 Stress level is too low (bad performance)
+    DCGM_FR_CUDA_API                   = 51,  //!< 51 Error calling the specified CUDA API
+    DCGM_FR_FAULTY_MEMORY              = 52,  //!< 52 Faulty memory detected on this GPU
+    DCGM_FR_CANNOT_SET_WATCHES         = 53,  //!< 53 Unable to set field watches in DCGM - NOT USED: DEPRECATED
+    DCGM_FR_CUDA_UNBOUND               = 54,  //!< 54 CUDA context is no longer bound
+    DCGM_FR_ECC_DISABLED               = 55,  //!< 55 ECC memory is disabled right now
+    DCGM_FR_MEMORY_ALLOC               = 56,  //!< 56 Cannot allocate memory on the GPU
+    DCGM_FR_CUDA_DBE                   = 57,  //!< 57 CUDA detected unrecoverable double-bit error
+    DCGM_FR_MEMORY_MISMATCH            = 58,  //!< 58 Memory error detected
+    DCGM_FR_CUDA_DEVICE                = 59,  //!< 59 No CUDA device discoverable for existing GPU
+    DCGM_FR_ECC_UNSUPPORTED            = 60,  //!< 60 ECC memory is unsupported by this SKU
+    DCGM_FR_ECC_PENDING                = 61,  //!< 61 ECC memory is in a pending state - NOT USED: DEPRECATED
+    DCGM_FR_MEMORY_BANDWIDTH           = 62,  //!< 62 Memory bandwidth is too low
+    DCGM_FR_TARGET_POWER               = 63,  //!< 63 Cannot hit the target power draw
+    DCGM_FR_API_FAIL                   = 64,  //!< 64 The specified API call failed
+    DCGM_FR_API_FAIL_GPU               = 65,  //!< 65 The specified API call failed for the specified GPU
+    DCGM_FR_CUDA_CONTEXT               = 66,  //!< 66 Cannot create a CUDA context on this GPU
+    DCGM_FR_DCGM_API                   = 67,  //!< 67 DCGM API failure
+    DCGM_FR_CONCURRENT_GPUS            = 68,  //!< 68 Need multiple GPUs to run this test
+    DCGM_FR_TOO_MANY_ERRORS            = 69,  //!< 69 More errors than fit in the return struct - NOT USED: DEPRECATED
+    DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD = 70,  //!< 70 More than 100 CRC errors are happening per second
+    DCGM_FR_NVLINK_ERROR_CRITICAL      = 71,  //!< 71 NVLink error for a field that should always be 0
+    DCGM_FR_ENFORCED_POWER_LIMIT       = 72,  //!< 72 The enforced power limit is too low to hit the target
+    DCGM_FR_MEMORY_ALLOC_HOST          = 73,  //!< 73 Cannot allocate memory on the host
+    DCGM_FR_GPU_OP_MODE                = 74,  //!< 74 Bad GPU operating mode for running plugin - NOT USED: DEPRECATED
+    DCGM_FR_NO_MEMORY_CLOCKS           = 75,  //!< 75 No memory clocks with the needed MHz found - NOT USED: DEPRECATED
+    DCGM_FR_NO_GRAPHICS_CLOCKS   = 76,  //!< 76 No graphics clocks with the needed MHz found - NOT USED: DEPRECATED
+    DCGM_FR_HAD_TO_RESTORE_STATE = 77,  //!< 77 Note that we had to restore a GPU's state
+    DCGM_FR_L1TAG_UNSUPPORTED    = 78,  //!< 78 L1TAG test is unsupported by this SKU
+    DCGM_FR_L1TAG_MISCOMPARE     = 79,  //!< 79 L1TAG test failed on a miscompare
+    DCGM_FR_ROW_REMAP_FAILURE    = 80,  //!< 80 Row remapping failed (Ampere or newer GPUs)
+    DCGM_FR_UNCONTAINED_ERROR    = 81,  //!< 81 Uncontained error - XID 95
+    DCGM_FR_EMPTY_GPU_LIST       = 82,  //!< 82 No GPU information given to plugin
+    DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS = 83,  //!< 83 Pending page retirements due to a DBE
+    DCGM_FR_UNCORRECTABLE_ROW_REMAP      = 84,  //!< 84 Uncorrectable row remapping
+    DCGM_FR_PENDING_ROW_REMAP            = 85,  //!< 85 Row remapping is pending
+    DCGM_FR_BROKEN_P2P_MEMORY_DEVICE     = 86,  //!< 86 P2P copy test detected an error writing to this GPU
+    DCGM_FR_BROKEN_P2P_WRITER_DEVICE     = 87,  //!< 87 P2P copy test detected an error writing from this GPU
+    DCGM_FR_NVSWITCH_NVLINK_DOWN    = 88,  //!< 88 An NvLink is down for the specified NVSwitch - NOT USED: DEPRECATED
+    DCGM_FR_EUD_BINARY_PERMISSIONS  = 89,  //!< 89 EUD binary permissions are incorrect
+    DCGM_FR_EUD_NON_ROOT_USER       = 90,  //!< 90 EUD plugin is not running as root
+    DCGM_FR_EUD_SPAWN_FAILURE       = 91,  //!< 91 EUD plugin failed to spawn the EUD binary
+    DCGM_FR_EUD_TIMEOUT             = 92,  //!< 92 EUD plugin timed out
+    DCGM_FR_EUD_ZOMBIE              = 93,  //!< 93 EUD process remains running after the plugin considers it finished
+    DCGM_FR_EUD_NON_ZERO_EXIT_CODE  = 94,  //!< 94 EUD process exited with a non-zero exit code
+    DCGM_FR_EUD_TEST_FAILED         = 95,  //!< 95 EUD test failed
+    DCGM_FR_FILE_CREATE_PERMISSIONS = 96,  //!< 96 We cannot create a file in this directory.
+    DCGM_FR_PAUSE_RESUME_FAILED     = 97,  //!< 97 Pause/Resume failed
+    DCGM_FR_PCIE_H_REPLAY_VIOLATION = 98,  //!< 98 PCIe test caught correctable errors
+    DCGM_FR_GPU_EXPECTED_NVLINKS_UP = 99,  //!< 99 Expected nvlinks up per gpu
+    DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP    = 100,  //!< 100 Expected nvlinks up per nvswitch
+    DCGM_FR_XID_ERROR                       = 101,  //!< 101 XID error detected
+    DCGM_FR_SBE_VIOLATION                   = 102,  //!< 102 Single bit error detected
+    DCGM_FR_DBE_VIOLATION                   = 103,  //!< 103 Double bit error detected
+    DCGM_FR_PCIE_REPLAY_VIOLATION           = 104,  //!< 104 PCIe replay errors detected
+    DCGM_FR_SBE_THRESHOLD_VIOLATION         = 105,  //!< 105 SBE threshold violated
+    DCGM_FR_DBE_THRESHOLD_VIOLATION         = 106,  //!< 106 DBE threshold violated
+    DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION = 107,  //!< 107 PCIE replay count violated
+    DCGM_FR_CUDA_FM_NOT_INITIALIZED         = 108,  //!< 108 The fabricmanager is not initialized
+    DCGM_FR_SXID_ERROR                      = 109,  //!< 109 NvSwitch fatal error detected
+    DCGM_FR_ERROR_SENTINEL                  = 110,  //!< 110 MUST BE THE LAST ERROR CODE
+} dcgmError_t;
+
+typedef enum dcgmErrorSeverity_enum
+{
+    DCGM_ERROR_NONE    = 0,  //!< 0 NONE
+    DCGM_ERROR_MONITOR = 1,  //!< 1 Can perform workload, but needs to be monitored.
+    DCGM_ERROR_ISOLATE = 2,  //!< 2 Cannot perform workload. GPU should be isolated.
+    DCGM_ERROR_UNKNOWN = 3,  //!< 3 This error code is not recognized
+    DCGM_ERROR_TRIAGE  = 4,  //!< 4 This error should be triaged
+    DCGM_ERROR_CONFIG  = 5,  //!< 5 This error can be configured
+    DCGM_ERROR_RESET   = 6,  //!< 6 Drain and reset GPU
+} dcgmErrorSeverity_t;  //!< Severity levels for errors; presumably the values held by dcgm_error_meta_t.severity (confirm)
+
+typedef enum dcgmErrorCategory_enum
+{
+    DCGM_FR_EC_NONE              = 0,   //!< 0 NONE
+    DCGM_FR_EC_PERF_THRESHOLD    = 1,   //!< 1 Performance Threshold
+    DCGM_FR_EC_PERF_VIOLATION    = 2,   //!< 2 Performance Violation
+    DCGM_FR_EC_SOFTWARE_CONFIG   = 3,   //!< 3 Software Configuration
+    DCGM_FR_EC_SOFTWARE_LIBRARY  = 4,   //!< 4 Software Library
+    DCGM_FR_EC_SOFTWARE_XID      = 5,   //!< 5 Software XID
+    DCGM_FR_EC_SOFTWARE_CUDA     = 6,   //!< 6 Software Cuda
+    DCGM_FR_EC_SOFTWARE_EUD      = 7,   //!< 7 Software EUD
+    DCGM_FR_EC_SOFTWARE_OTHER    = 8,   //!< 8 Software Other
+    DCGM_FR_EC_HARDWARE_THERMAL  = 9,   //!< 9 Hardware Thermal
+    DCGM_FR_EC_HARDWARE_MEMORY   = 10,  //!< 10 Hardware Memory
+    DCGM_FR_EC_HARDWARE_NVLINK   = 11,  //!< 11 Hardware NvLink
+    DCGM_FR_EC_HARDWARE_NVSWITCH = 12,  //!< 12 Hardware NvSwitch
+    DCGM_FR_EC_HARDWARE_PCIE     = 13,  //!< 13 Hardware PCIe
+    DCGM_FR_EC_HARDWARE_POWER    = 14,  //!< 14 Hardware Power
+    DCGM_FR_EC_HARDWARE_OTHER    = 15,  //!< 15 Hardware Other
+    DCGM_FR_EC_INTERNAL_OTHER    = 16,  //!< 16 Internal Other
+} dcgmErrorCategory_t;  //!< Error categories; presumably the values held by dcgm_error_meta_t.category (confirm)
+
+typedef struct
+{
+    dcgmError_t errorId;     //!< Error code this entry describes
+    const char* msgFormat;   //!< Message text; presumably one of the DCGM_FR_*_MSG format strings (confirm in the defining source)
+    const char* suggestion;  //!< Suggested next step; presumably one of the DCGM_FR_*_NEXT strings (confirm in the defining source)
+    int         severity;    //!< Stored as int; presumably a dcgmErrorSeverity_t value (confirm)
+    int         category;    //!< Stored as int; presumably a dcgmErrorCategory_t value (confirm)
+} dcgm_error_meta_t;
+
+extern dcgm_error_meta_t dcgmErrorMeta[];  //!< Error metadata table, defined outside this header; size and indexing are not shown here
+
+/* Standard message for running a field diagnostic */
+#define TRIAGE_RUN_FIELD_DIAG_MSG "Run a field diagnostic on the GPU."
+#define DEBUG_COOLING_MSG                                                                                              \
+    "Verify that the cooling on this machine is functional, including external, "                                      \
+    "thermal material interface, fans, and any other components."
+#define BUG_REPORT_MSG "Please capture an nvidia-bug-report and send it to NVIDIA."
+#define SYSTEM_TRIAGE_MSG "Check DCGM and system logs for errors. Reset GPU. Restart DCGM. Rerun diagnostics."
+#define CONFIG_MSG "Check DCGM and system configuration. This error may be eliminated with an updated configuration."
+
+/*
+ * Messages for the error codes. All messages must be defined in the ERROR_CODE_MSG <msg> format
+ * where <msg> is the actual message.
+ */
+#define DCGM_FR_OK_MSG "The operation completed successfully."
+#define DCGM_FR_UNKNOWN_MSG "Unknown error."
+#define DCGM_FR_UNRECOGNIZED_MSG "Unrecognized error code."
+// replay limit, gpu id, replay errors detected
+#define DCGM_FR_PCI_REPLAY_RATE_MSG "Detected more than %u PCIe replays per minute for GPU %u : %d"
+// dbes detected, gpu id
+#define DCGM_FR_VOLATILE_DBE_DETECTED_MSG "Detected %d volatile double-bit ECC error(s) in GPU %u."
+// sbe limit, gpu id, sbes detected
+#define DCGM_FR_VOLATILE_SBE_DETECTED_MSG "More than %u single-bit ECC error(s) detected in GPU %u Volatile SBEs: %lld"
+// gpu id
+#define DCGM_FR_PENDING_PAGE_RETIREMENTS_MSG "A pending retired page has been detected in GPU %u."
+// retired pages detected, gpu id
+#define DCGM_FR_RETIRED_PAGES_LIMIT_MSG "%u or more retired pages have been detected in GPU %u. "
+// retired pages due to dbes detected, gpu id
+#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_MSG                                                                            \
+    "An excess of %u retired pages due to DBEs have been detected and"                                                 \
+    " more than one page has been retired due to DBEs in the past"                                                     \
+    " week in GPU %u."
+// gpu id
+#define DCGM_FR_CORRUPT_INFOROM_MSG "A corrupt InfoROM has been detected in GPU %u."
+// gpu id
+#define DCGM_FR_CLOCK_THROTTLE_THERMAL_MSG "Detected clock throttling due to thermal violation in GPU %u."
+// gpu id
+#define DCGM_FR_POWER_UNREADABLE_MSG "Cannot reliably read the power usage for GPU %u."
+// gpu id
+#define DCGM_FR_CLOCK_THROTTLE_POWER_MSG "Detected clock throttling due to power violation in GPU %u."
+// nvlink errors detected, error field name, gpu id, error threshold
+#define DCGM_FR_NVLINK_ERROR_THRESHOLD_MSG                                                                             \
+    "Detected %ld %s NvLink errors on GPU %u's NVLink which exceeds "                                                  \
+    "threshold of %u"
+// gpu id, nvlink id
+#define DCGM_FR_NVLINK_DOWN_MSG "GPU %u's NvLink link %d is currently down"
+// nvlinks up, expected nvlinks up
+#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_MSG "Only %u NvLinks are up out of the expected %u"
+// switch id, nvlinks up, expected nvlinks up
+#define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_MSG "NvSwitch %u - Only %u NvLinks are up out of the expected %u"
+// nvswitch id, nvlink id
+#define DCGM_FR_NVSWITCH_FATAL_ERROR_MSG "Detected fatal errors on NvSwitch %u link %u"
+// nvswitch id, nvlink id
+#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_MSG "Detected nonfatal errors on NvSwitch %u link %u"
+// nvswitch id, nvlink port
+#define DCGM_FR_NVSWITCH_DOWN_MSG "NvSwitch physical ID %u's NvLink port %d is currently down."
+// file path, error detail
+#define DCGM_FR_NO_ACCESS_TO_FILE_MSG "File %s could not be accessed directly: %s"
+// NVML API name, NVML error as string
+#define DCGM_FR_NVML_API_MSG "Error calling NVML API %s: %s"
+#define DCGM_FR_DEVICE_COUNT_MISMATCH_MSG                                                                              \
+    "The number of devices NVML returns is different than the number "                                                 \
+    "of devices in /dev."
+// function name
+#define DCGM_FR_BAD_PARAMETER_MSG "Bad parameter to function %s cannot be processed"
+// library name, error returned from dlopen
+#define DCGM_FR_CANNOT_OPEN_LIB_MSG "Cannot open library %s: '%s'"
+// the name of the denylisted driver
+#define DCGM_FR_DENYLISTED_DRIVER_MSG "Found driver on the denylist: %s"
+// the name of the function that wasn't found
+#define DCGM_FR_NVML_LIB_BAD_MSG "Cannot get pointer to %s from libnvidia-ml.so"
+#define DCGM_FR_GRAPHICS_PROCESSES_MSG                                                                                 \
+    "NVVS has detected processes with graphics contexts open running on at least one "                                 \
+    "GPU. This may cause some tests to fail."
+// error message from the API call
+#define DCGM_FR_HOSTENGINE_CONN_MSG "Could not connect to the host engine: '%s'"
+// field name, gpu id
+#define DCGM_FR_FIELD_QUERY_MSG "Could not query field %s for GPU %u"
+// environment variable name
+#define DCGM_FR_BAD_CUDA_ENV_MSG "Found CUDA performance-limiting environment variable '%s'."
+// gpu id
+#define DCGM_FR_PERSISTENCE_MODE_MSG "Persistence mode for GPU %u is disabled."
+// gpu id, direction (d2h, e.g.), measured bandwidth, expected bandwidth
+#define DCGM_FR_LOW_BANDWIDTH_MSG                                                                                      \
+    "Bandwidth of GPU %u in direction %s of %.2f did not exceed "                                                      \
+    "minimum required bandwidth of %.2f."
+// latency type (d2h, e.g.), gpu id, measured latency, maximum allowed latency
+#define DCGM_FR_HIGH_LATENCY_MSG                                                                                       \
+    "Latency type %s of GPU %u value %.2f exceeded maximum allowed "                                                   \
+    "latency of %.2f."
+// field id
+#define DCGM_FR_CANNOT_GET_FIELD_TAG_MSG "Unable to get field information for field id %hu"
+// field value, field name, gpu id (this message is for fields that should always have a 0 value)
+#define DCGM_FR_FIELD_VIOLATION_MSG "Detected %ld %s for GPU %u"
+// field value, field name, gpu id, allowable threshold
+#define DCGM_FR_FIELD_THRESHOLD_MSG "Detected %ld %s for GPU %u which is above the threshold %ld"
+// field value, field name, gpu id (same as DCGM_FR_FIELD_VIOLATION, but it's a double)
+#define DCGM_FR_FIELD_VIOLATION_DBL_MSG "Detected %.1f %s for GPU %u"
+// field value, field name, gpu id, allowable threshold (same as DCGM_FR_FIELD_THRESHOLD, but it's a double)
+#define DCGM_FR_FIELD_THRESHOLD_DBL_MSG "Detected %.1f %s for GPU %u which is above the threshold %.1f"
+// field name
+#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_MSG                                                                             \
+    "Field %s is not supported by this API because it is neither an "                                                  \
+    "int64 nor a double type."
+// field name, allowable threshold, observed value, seconds
+#define DCGM_FR_FIELD_THRESHOLD_TS_MSG                                                                                 \
+    "%s met or exceeded the threshold of %lu per second: %lu at "                                                      \
+    "%.1f seconds into the test."
+// field name, allowable threshold, observed value, seconds (same as DCGM_FR_FIELD_THRESHOLD_TS, but it's a double)
+#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_MSG                                                                             \
+    "%s met or exceeded the threshold of %.1f per second: %.1f at "                                                    \
+    "%.1f seconds into the test."
+// total seconds of violation, gpu id
+#define DCGM_FR_THERMAL_VIOLATIONS_MSG "There were thermal violations totaling %.1f seconds for GPU %u"
+// total seconds of violations, first instance, gpu id
+#define DCGM_FR_THERMAL_VIOLATIONS_TS_MSG                                                                              \
+    "Thermal violations totaling %.1f seconds started at %.1f seconds "                                                \
+    "into the test for GPU %u"
+// observed temperature, gpu id, max allowed temperature
+#define DCGM_FR_TEMP_VIOLATION_MSG                                                                                     \
+    "Temperature %lld of GPU %u exceeded user-specified maximum "                                                      \
+    "allowed temperature %lld"
+// gpu id, seconds into test, details about throttling
+#define DCGM_FR_THROTTLING_VIOLATION_MSG                                                                               \
+    "Clocks are being throttled for GPU %u because of clock "                                                          \
+    "throttling starting %.1f seconds into the test. %s"
+// details about error
+#define DCGM_FR_INTERNAL_MSG "There was an internal error during the test: '%s'"
+// gpu id, PCIe generation, minimum allowed, parameter to control
+#define DCGM_FR_PCIE_GENERATION_MSG                                                                                    \
+    "GPU %u is running at PCI link generation %d, which is below "                                                     \
+    "the minimum allowed link generation of %d (parameter '%s')"
+// gpu id, PCIe width, minimum allowed, parameter to control
+#define DCGM_FR_PCIE_WIDTH_MSG                                                                                         \
+    "GPU %u is running at PCI link width %dX, which is below the "                                                     \
+    "minimum allowed link generation of %d (parameter '%s')"
+#define DCGM_FR_ABORTED_MSG "Test was aborted early due to user signal"
+// Test name
+#define DCGM_FR_TEST_DISABLED_MSG "The %s test is skipped for this GPU."
+// stat name, gpu id
+#define DCGM_FR_CANNOT_GET_STAT_MSG "Unable to generate / collect stat %s for GPU %u"
+// observed value, minimum allowed, gpu id
+#define DCGM_FR_STRESS_LEVEL_MSG                                                                                       \
+    "Max stress level of %.1f did not reach desired stress level of "                                                  \
+    "%.1f for GPU %u"
+// CUDA API name
+#define DCGM_FR_CUDA_API_MSG "Error using CUDA API %s"
+// count, gpu id
+#define DCGM_FR_FAULTY_MEMORY_MSG "Found %d faulty memory elements on GPU %u"
+// error detail
+#define DCGM_FR_CANNOT_SET_WATCHES_MSG "Unable to add field watches to DCGM: %s"
+// gpu id
+#define DCGM_FR_CUDA_UNBOUND_MSG "Cuda GPU %d is no longer bound to a CUDA context...Aborting"
+// Test name, gpu id
+#define DCGM_FR_ECC_DISABLED_MSG "Skipping test %s because ECC is not enabled on GPU %u"
+// percentage of memory we tried to allocate, gpu id
+#define DCGM_FR_MEMORY_ALLOC_MSG "Couldn't allocate at least %.1f%% of GPU memory on GPU %u"
+// gpu id
+#define DCGM_FR_CUDA_DBE_MSG                                                                                           \
+    "CUDA APIs have indicated that a double-bit ECC error has "                                                        \
+    "occured on GPU %u."
+// gpu id
+#define DCGM_FR_MEMORY_MISMATCH_MSG                                                                                    \
+    "A memory mismatch was detected on GPU %u, but no error was "                                                      \
+    "reported by CUDA or NVML."
+// gpu id, error detail
+#define DCGM_FR_CUDA_DEVICE_MSG "Unable to find a corresponding CUDA device for GPU %u: '%s'"
+#define DCGM_FR_ECC_UNSUPPORTED_MSG "ECC Memory is not turned on or is unsupported. Skipping test."
+// gpu id
+#define DCGM_FR_ECC_PENDING_MSG "ECC memory for GPU %u is in a pending state."
+// gpu id, observed bandwidth, required, test name
+#define DCGM_FR_MEMORY_BANDWIDTH_MSG                                                                                   \
+    "GPU %u only achieved a memory bandwidth of %.2f GB/s, failing "                                                   \
+    "to meet %.2f GB/s for test %d"
+// power draw observed, field tag, minimum power draw required, gpu id
+#define DCGM_FR_TARGET_POWER_MSG                                                                                       \
+    "Max power of %.1f did not reach desired power minimum %s of "                                                     \
+    "%.1f for GPU %u"
+// API name, error detail
+#define DCGM_FR_API_FAIL_MSG "API call %s failed: '%s'"
+// API name, gpu id, error detail
+#define DCGM_FR_API_FAIL_GPU_MSG "API call %s failed for GPU %u: '%s'"
+// gpu id, error detail
+#define DCGM_FR_CUDA_CONTEXT_MSG "GPU %u failed to create a CUDA context: %s"
+// DCGM API name
+#define DCGM_FR_DCGM_API_MSG "Error using DCGM API %s"
+#define DCGM_FR_CONCURRENT_GPUS_MSG                                                                                    \
+    "Unable to run concurrent pair bandwidth test without 2 or more "                                                  \
+    "gpus. Skipping"
+#define DCGM_FR_TOO_MANY_ERRORS_MSG                                                                                    \
+    "This API can only return up to four errors per system. "                                                          \
+    "Additional errors were found for this system that couldn't be "                                                   \
+    "communicated."
+// errors per second, error field name, gpu id
+#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_MSG                                                                         \
+    "%.1f %s NvLink errors found occuring per second on GPU %u, "                                                      \
+    "exceeding the limit of 100 per second."
+// error count, field name, gpu id
+#define DCGM_FR_NVLINK_ERROR_CRITICAL_MSG "Detected %ld %s NvLink errors on GPU %u's NVLink (should be 0)"
+// gpu id, power limit, power reached
+#define DCGM_FR_ENFORCED_POWER_LIMIT_MSG                                                                               \
+    "Enforced power limit on GPU %u set to %.1f, which is too low to "                                                 \
+    "attempt to achieve target power %.1f"
+// number of bytes requested
+#define DCGM_FR_MEMORY_ALLOC_HOST_MSG "Cannot allocate %zu bytes on the host"
+#define DCGM_FR_GPU_OP_MODE_MSG "Skipping plugin due to a GPU being in GPU Operating Mode: LOW_DP."
+// clock, count
+#define DCGM_FR_NO_MEMORY_CLOCKS_MSG "No memory clocks <= %u MHZ were found in %u supported memory clocks."
+// clock, count, clock
+#define DCGM_FR_NO_GRAPHICS_CLOCKS_MSG                                                                                 \
+    "No graphics clocks <= %u MHZ were found in %u supported graphics clocks for memory clock %u MHZ."
+// error detail
+#define DCGM_FR_HAD_TO_RESTORE_STATE_MSG "Had to restore GPU state on NVML GPU(s): %s"
+#define DCGM_FR_L1TAG_UNSUPPORTED_MSG "This card does not support the L1 cache test. Skipping test."
+#define DCGM_FR_L1TAG_MISCOMPARE_MSG "Detected a miscompare failure in the L1 cache."
+// gpu id
+#define DCGM_FR_ROW_REMAP_FAILURE_MSG "GPU %u had uncorrectable memory errors and row remapping failed."
+#define DCGM_FR_UNCONTAINED_ERROR_MSG "GPU had an uncontained error (XID 95)"
+#define DCGM_FR_EMPTY_GPU_LIST_MSG "No valid GPUs passed to plugin"
+#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_MSG "Pending page retirements together with a DBE were detected on GPU %u."
+// gpu id, rows remapped
+#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_MSG "GPU %u had uncorrectable memory errors and %u rows were remapped"
+// gpu id
+#define DCGM_FR_PENDING_ROW_REMAP_MSG "GPU %u had memory errors and row remappings are pending"
+// gpu id, test name
+#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_MSG "GPU %u was unsuccessfully written to in a peer-to-peer test: %s"
+// gpu id, test name
+#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_MSG "GPU %u unsuccessfully wrote data in a peer-to-peer test: %s"
+// nvswitch id, nvlink id
+#define DCGM_FR_NVSWITCH_NVLINK_DOWN_MSG "NVSwitch %u's NvLink %u is down."
+#define DCGM_FR_EUD_BINARY_PERMISSIONS_MSG "" /* See message inplace */
+#define DCGM_FR_EUD_NON_ROOT_USER_MSG ""      /* See message inplace */
+#define DCGM_FR_EUD_SPAWN_FAILURE_MSG ""      /* See message inplace */
+#define DCGM_FR_EUD_TIMEOUT_MSG ""            /* See message inplace */
+#define DCGM_FR_EUD_ZOMBIE_MSG ""             /* See message inplace */
+#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_MSG "" /* See message inplace */
+#define DCGM_FR_EUD_TEST_FAILED_MSG ""        /* See message inplace */
+#define DCGM_FR_FILE_CREATE_PERMISSIONS_MSG                                                                            \
+    "The DCGM Diagnostic does not have permissions to create a file in directory '%s'"
+#define DCGM_FR_PAUSE_RESUME_FAILED_MSG "" /* See message inplace */
+// gpu id
+#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_MSG "GPU %u host-side PCIe replay violation, see dmesg for more information"
+// xid error, gpu id
+#define DCGM_FR_XID_ERROR_MSG "Detected XID %u for GPU %u"
+// count, field, gpu id
+#define DCGM_FR_SBE_VIOLATION_MSG "Detected %ld %s for GPU %u"
+// count, field, gpu id
+#define DCGM_FR_DBE_VIOLATION_MSG "Detected %ld %s for GPU %u"
+// count, field, gpu id
+#define DCGM_FR_PCIE_REPLAY_VIOLATION_MSG "Detected %ld %s for GPU %u"
+// count, field, gpu id, threshold
+#define DCGM_FR_SBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld"
+#define DCGM_FR_DBE_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld"
+#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_MSG "Detected %ld %s for GPU %u which is above the threshold %ld"
+#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_MSG ""
+#define DCGM_FR_SXID_ERROR_MSG "Detected fatal NvSwitch SXID %u"
+#define DCGM_FR_ERROR_SENTINEL_MSG "" /* See message inplace */
+
+/*
+ * Suggestions for next steps for the corresponding error message
+ */
+#define DCGM_FR_OK_NEXT "N/A"
+#define DCGM_FR_UNKNOWN_NEXT ""
+#define DCGM_FR_UNRECOGNIZED_NEXT ""
+#define DCGM_FR_PCI_REPLAY_RATE_NEXT                                                                                   \
+    "Reconnect PCIe card. Run system side PCIE diagnostic utilities "                                                  \
+    "to verify hops off the GPU board. If issue is on the board, run "                                                 \
+    "the field diagnostic."
+#define DCGM_FR_VOLATILE_DBE_DETECTED_NEXT "Drain the GPU and reset it or reboot the node."
+#define DCGM_FR_VOLATILE_SBE_DETECTED_NEXT "Monitor - this GPU can still perform workload."
+#define DCGM_FR_PENDING_PAGE_RETIREMENTS_NEXT "Monitor - this GPU can still perform workload"
+#define DCGM_FR_RETIRED_PAGES_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_RETIRED_PAGES_DBE_LIMIT_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_CORRUPT_INFOROM_NEXT "Flash the InfoROM to clear this corruption."
+#define DCGM_FR_CLOCK_THROTTLE_THERMAL_NEXT DEBUG_COOLING_MSG
+#define DCGM_FR_POWER_UNREADABLE_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_CLOCK_THROTTLE_POWER_NEXT "Monitor the power conditions. This GPU can still perform workload."
+#define DCGM_FR_NVLINK_ERROR_THRESHOLD_NEXT "Monitor the NVLink. It can still perform workload."
+#define DCGM_FR_NVLINK_DOWN_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_NVSWITCH_FATAL_ERROR_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_NVSWITCH_NON_FATAL_ERROR_NEXT "Monitor the NVSwitch. It can still perform workload."
+#define DCGM_FR_NVSWITCH_DOWN_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_NO_ACCESS_TO_FILE_NEXT "Check relevant permissions, access, and existence of the file."
+#define DCGM_FR_GPU_EXPECTED_NVLINKS_UP_NEXT                                                                           \
+    "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info."
+
+#define DCGM_FR_NVSWITCH_EXPECTED_NVLINKS_UP_NEXT                                                                      \
+    "Ensure Fabric Manager is running. Check system logs, dmesg, and fabric-manager logs for more info."
+
+#define DCGM_FR_NVML_API_NEXT                                                                                          \
+    "Check the error condition and ensure that appropriate libraries "                                                 \
+    "are present and accessible."
+#define DCGM_FR_DEVICE_COUNT_MISMATCH_NEXT                                                                             \
+    "Check for the presence of cgroups, operating system blocks, and "                                                 \
+    "or unsupported / older cards"
+#define DCGM_FR_BAD_PARAMETER_NEXT BUG_REPORT_MSG
+#define DCGM_FR_CANNOT_OPEN_LIB_NEXT                                                                                   \
+    "Check for the existence of the library and set LD_LIBRARY_PATH "                                                  \
+    "if needed."
+#define DCGM_FR_DENYLISTED_DRIVER_NEXT "Please load the appropriate driver."
+#define DCGM_FR_NVML_LIB_BAD_NEXT                                                                                      \
+    "Make sure that the required version of libnvidia-ml.so "                                                          \
+    "is present and accessible on the system."
+#define DCGM_FR_GRAPHICS_PROCESSES_NEXT                                                                                \
+    "Stop the graphics processes or run this diagnostic on a server "                                                  \
+    "that is not being used for display purposes."
+#define DCGM_FR_HOSTENGINE_CONN_NEXT                                                                                   \
+    "If hostengine is run separately, please ensure that it is up "                                                    \
+    "and responsive."
+#define DCGM_FR_FIELD_QUERY_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_BAD_CUDA_ENV_NEXT "Please unset this environment variable to address test failures."
+#define DCGM_FR_PERSISTENCE_MODE_NEXT                                                                                  \
+    "Enable persistence mode by running \"nvidia-smi -i <gpuId> -pm "                                                  \
+    "1 \" as root."
+#define DCGM_FR_LOW_BANDWIDTH_NEXT                                                                                     \
+    "Verify that your minimum bandwidth setting is appropriate for "                                                   \
+    "the topology of each GPU. If so, and errors are consistent, "                                                     \
+    "please run a field diagnostic."
+#define DCGM_FR_HIGH_LATENCY_NEXT                                                                                      \
+    "Verify that your maximum latency setting is appropriate for "                                                     \
+    "the topology of each GPU. If so, and errors are consistent, "                                                     \
+    "please run a field diagnostic."
+#define DCGM_FR_CANNOT_GET_FIELD_TAG_NEXT ""
+#define DCGM_FR_FIELD_VIOLATION_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_FIELD_THRESHOLD_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_FIELD_VIOLATION_DBL_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_FIELD_THRESHOLD_DBL_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_UNSUPPORTED_FIELD_TYPE_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_FIELD_THRESHOLD_TS_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_FIELD_THRESHOLD_TS_DBL_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_THERMAL_VIOLATIONS_NEXT DEBUG_COOLING_MSG
+#define DCGM_FR_THERMAL_VIOLATIONS_TS_NEXT DEBUG_COOLING_MSG
+#define DCGM_FR_TEMP_VIOLATION_NEXT                                                                                    \
+    "Verify that the user-specified temperature maximum is set "                                                       \
+    "correctly. If it is, check the cooling for this GPU and node: " DEBUG_COOLING_MSG
+#define DCGM_FR_THROTTLING_VIOLATION_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_INTERNAL_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_PCIE_GENERATION_NEXT CONFIG_MSG
+#define DCGM_FR_PCIE_WIDTH_NEXT CONFIG_MSG
+#define DCGM_FR_ABORTED_NEXT ""
+#define DCGM_FR_TEST_DISABLED_NEXT CONFIG_MSG
+#define DCGM_FR_CANNOT_GET_STAT_NEXT                                                                                   \
+    "If running a standalone nv-hostengine, verify that it is up "                                                     \
+    "and responsive."
+#define DCGM_FR_STRESS_LEVEL_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_CUDA_API_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_FAULTY_MEMORY_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_CANNOT_SET_WATCHES_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_CUDA_UNBOUND_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_ECC_DISABLED_NEXT                                                                                      \
+    "Enable ECC memory by running \"nvidia-smi -i <gpuId> -e 1\" "                                                     \
+    "to enable. This may require a GPU reset or reboot to take effect."
+#define DCGM_FR_MEMORY_ALLOC_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_CUDA_DBE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_MEMORY_MISMATCH_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_CUDA_DEVICE_NEXT                                                                                       \
+    "Make sure CUDA_VISIBLE_DEVICES is not preventing visibility of "                                                  \
+    "this GPU. Also check if CUDA libraries are compatible and "                                                       \
+    "correctly installed."
+#define DCGM_FR_ECC_UNSUPPORTED_NEXT CONFIG_MSG
+#define DCGM_FR_ECC_PENDING_NEXT "Reboot to complete activation of the ECC memory."
+#define DCGM_FR_MEMORY_BANDWIDTH_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_TARGET_POWER_NEXT "Verify that the clock speeds and GPU utilization are high."
+#define DCGM_FR_API_FAIL_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_API_FAIL_GPU_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_CUDA_CONTEXT_NEXT                                                                                      \
+    "Please make sure the correct driver version is installed and "                                                    \
+    "verify that no conflicting libraries are present."
+#define DCGM_FR_DCGM_API_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_CONCURRENT_GPUS_NEXT CONFIG_MSG
+#define DCGM_FR_TOO_MANY_ERRORS_NEXT ""
+#define DCGM_FR_NVLINK_CRC_ERROR_THRESHOLD_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_NVLINK_ERROR_CRITICAL_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_ENFORCED_POWER_LIMIT_NEXT                                                                              \
+    "If this enforced power limit is necessary, then this test "                                                       \
+    "cannot be run. If it is unnecessary, then raise the enforced "                                                    \
+    "power limit setting to be able to run this test."
+#define DCGM_FR_MEMORY_ALLOC_HOST_NEXT "Manually kill processes or restart your machine."
+#define DCGM_FR_GPU_OP_MODE_NEXT                                                                                       \
+    "Fix by running nvidia-smi as root with: nvidia-smi --gom=0 -i "                                                   \
+    "<gpu index>"
+#define DCGM_FR_NO_MEMORY_CLOCKS_NEXT ""
+#define DCGM_FR_NO_GRAPHICS_CLOCKS_NEXT ""
+#define DCGM_FR_HAD_TO_RESTORE_STATE_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_L1TAG_UNSUPPORTED_NEXT CONFIG_MSG
+#define DCGM_FR_L1TAG_MISCOMPARE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_ROW_REMAP_FAILURE_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_UNCONTAINED_ERROR_NEXT DCGM_FR_VOLATILE_DBE_DETECTED_NEXT
+#define DCGM_FR_DBE_PENDING_PAGE_RETIREMENTS_NEXT "Drain the GPU and reset it or reboot the node to resolve this issue."
+#define DCGM_FR_EMPTY_GPU_LIST_NEXT CONFIG_MSG
+#define DCGM_FR_UNCORRECTABLE_ROW_REMAP_NEXT ""
+#define DCGM_FR_PENDING_ROW_REMAP_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_BROKEN_P2P_MEMORY_DEVICE_NEXT BUG_REPORT_MSG
+#define DCGM_FR_BROKEN_P2P_WRITER_DEVICE_NEXT BUG_REPORT_MSG
+#define DCGM_FR_NVSWITCH_NVLINK_DOWN_NEXT                                                                              \
+    "Please check fabric manager and initialization logs to figure out why the link is down. "                         \
+    "You may also need to run a field diagnostic."
+#define DCGM_FR_EUD_BINARY_PERMISSIONS_NEXT "" /* See message inplace */
+#define DCGM_FR_EUD_NON_ROOT_USER_NEXT ""      /* See message inplace */
+#define DCGM_FR_EUD_SPAWN_FAILURE_NEXT ""      /* See message inplace */
+#define DCGM_FR_EUD_TIMEOUT_NEXT ""            /* See message inplace */
+#define DCGM_FR_EUD_ZOMBIE_NEXT ""             /* See message inplace */
+#define DCGM_FR_EUD_NON_ZERO_EXIT_CODE_NEXT "" /* See message inplace */
+#define DCGM_FR_EUD_TEST_FAILED_NEXT ""        /* See message inplace */
+#define DCGM_FR_FILE_CREATE_PERMISSIONS_NEXT                                                                           \
+    "Please restart the hostengine with parameter --home-dir to specify a different home directory for the "           \
+    "diagnostic or change permissions in the current directory to allow the user to write files there."
+#define DCGM_FR_PAUSE_RESUME_FAILED_NEXT ""     /* See message inplace */
+#define DCGM_FR_PCIE_H_REPLAY_VIOLATION_NEXT "" /* See message inplace */
+#define DCGM_FR_XID_ERROR_NEXT "Please consult the documentation for details of this XID."
+#define DCGM_FR_SBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_DBE_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_PCIE_REPLAY_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_SBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_DBE_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_PCIE_REPLAY_THRESHOLD_VIOLATION_NEXT TRIAGE_RUN_FIELD_DIAG_MSG
+#define DCGM_FR_CUDA_FM_NOT_INITIALIZED_NEXT "Ensure that the FabricManager is running without errors."
+#define DCGM_FR_SXID_ERROR_NEXT SYSTEM_TRIAGE_MSG
+#define DCGM_FR_ERROR_SENTINEL_NEXT "" /* See message inplace */
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+    DCGM_PUBLIC_API dcgmErrorSeverity_t dcgmErrorGetPriorityByCode(unsigned int code);
+    DCGM_PUBLIC_API dcgmErrorCategory_t dcgmErrorGetCategoryByCode(unsigned int code);
+    DCGM_PUBLIC_API const char*         dcgmErrorGetFormatMsgByCode(unsigned int code);
+
+    DCGM_PUBLIC_API const dcgm_error_meta_t* dcgmGetErrorMeta(dcgmError_t error);
+    DCGM_PUBLIC_API const char*              errorString(dcgmReturn_t result);
+
+    /** @} */
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // DCGM_ERRORS_H
diff --git a/pkg/ixdcgm/include/ixdcgmApiExport.h b/pkg/ixdcgm/include/ixdcgmApiExport.h
index 789cd6fb8982c36a1021a5dab3abf5ee77e00c49..56053b3bfca80ebaef84583dce1a9b6f1b638a13 100644
--- a/pkg/ixdcgm/include/ixdcgmApiExport.h
+++ b/pkg/ixdcgm/include/ixdcgmApiExport.h
@@ -27,208 +27,39 @@ extern "C"
 #endif
 
 #define IXDCGM_PRIVATE_API __attribute((visibility("hidden")))
+    /**
+     * Get the IxLink link status for every IxLink in this system.
+     *
+     * @param pixdcgmHandle  IN: IxDCGM Handle
+     * @param linkStatus     OUT: Structure in which to store IxLink link statuses.
+     *
+     * @return
+     *        - \ref IXDCGM_RET_OK                if the call was successful.
+     *        - \ref IXDCGM_RET_NOT_SUPPORTED     if link status retrieval is not supported (TODO confirm exact condition).
+     *        - \ref IXDCGM_RET_BADPARAM          if any parameter is invalid.
+     *        - \ref IXDCGM_RET_VER_MISMATCH      if the version of linkStatus is not ixdcgmLinkStatus_v3.
+     */
+    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLinkStatus(ixdcgmHandle_t pixdcgmHandle, ixdcgmLinkStatus_v3 *linkStatus);
 
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmInit(void);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStartEmbedded(ixdcgmStartEmbeddedParam *params);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineStart(unsigned short portNum, char const *socketPath, bool overTCP);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEngineRun(unsigned short portNumber,
-                                                     char const *socketPath,
-                                                     unsigned int isConnectionTCP);
-    IXDCGM_PUBLIC_API const char *ixdcgmErrorString(ixdcgmReturn_t result);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmDisconnect(ixdcgmHandle_t pixdcgmHandle);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConnect(const char *ipAddress,
-                                                   ixdcgmConnectParams *connectParams,
-                                                   ixdcgmHandle_t *pixdcgmHandle);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetEntityGroupEntities(ixdcgmHandle_t pixdcgmHandle,
-                                                                  ixdcgm_field_entity_group_t entityGroup,
-                                                                  ixdcgm_field_eid_t *entities,
-                                                                  int *numEntities,
-                                                                  unsigned int flags);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceAttributes(ixdcgmHandle_t pixdcgmHandle,
-                                                               unsigned int gpuId,
-                                                               ixdcgmDeviceAttributes_t *pixdcgmAttr);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetAllDevices(ixdcgmHandle_t pixdcgmHandle,
-                                                         unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES],
-                                                         int *count);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetAllSupportedDevices(ixdcgmHandle_t pixdcgmHandle,
-                                                                  unsigned int gpuIdList[IXDCGM_MAX_NUM_DEVICES],
-                                                                  int *count);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmEntitiesGetLatestValues(ixdcgmHandle_t pDcgmHandle,
-                                                                   ixdcgmGroupEntityPair_t entities[],
-                                                                   unsigned int entityCount,
-                                                                   unsigned short fields[],
-                                                                   unsigned int fieldCount,
-                                                                   unsigned int flags,
-                                                                   ixdcgmFieldValue_v2 values[]);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineVersionInfo(ixdcgmHandle_t pixdcgmHandle,
-                                                                 ixdcgmVersionInfo_t *pVersionInfo);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmVersionInfo(ixdcgmVersionInfo_t *pVersionInfo);
-
-    /*Grouping APIs*/
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupCreate(ixdcgmHandle_t pixdcgmHandle,
-                                                       ixdcgmGroupType_t type,
-                                                       const char *groupName,
-                                                       ixdcgmGpuGrp_t *groupId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupDestroy(ixdcgmHandle_t pixdcgmHandle, ixdcgmGpuGrp_t groupId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddEntity(ixdcgmHandle_t pixdcgmHandle,
-                                                          ixdcgmGpuGrp_t groupId,
-                                                          ixdcgm_field_entity_group_t entityGroupId,
-                                                          ixdcgm_field_eid_t entityId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupAddDevice(ixdcgmHandle_t pixdcgmHandle,
-                                                          ixdcgmGpuGrp_t groupId,
-                                                          unsigned int gpuId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveDevice(ixdcgmHandle_t pixdcgmHandle,
-                                                             ixdcgmGpuGrp_t groupId,
-                                                             unsigned int gpuId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupRemoveEntity(ixdcgmHandle_t pixdcgmHandle,
-                                                             ixdcgmGpuGrp_t groupId,
-                                                             ixdcgm_field_entity_group_t entityGroupId,
-                                                             ixdcgm_field_eid_t entityId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetInfo(ixdcgmHandle_t pixdcgmHandle,
-                                                        ixdcgmGpuGrp_t groupId,
-                                                        ixdcgmGroupInfo_t *pDcgmGroupInfo);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGroupGetAllIds(ixdcgmHandle_t pixdcgmHandle,
-                                                          ixdcgmGpuGrp_t groupIdList[],
-                                                          unsigned int *count);
-
-    /* Field Grouping APIs*/
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupCreate(ixdcgmHandle_t pixdcgmHandle,
-                                                            int numFieldIds,
-                                                            unsigned short *fieldIds,
-                                                            const char *fieldGroupName,
-                                                            ixdcgmFieldGrp_t *fieldGroupId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupDestroy(ixdcgmHandle_t pixdcgmHandle,
-                                                             ixdcgmFieldGrp_t fieldGroupId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetInfo(ixdcgmHandle_t pixdcgmHandle,
-                                                             ixdcgmFieldGroupInfo_t *fieldGroupInfo);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmFieldGroupGetAll(ixdcgmHandle_t pixdcgmHandle,
-                                                            ixdcgmAllFieldGroup_t *allGroupInfo);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmWatchFields(ixdcgmHandle_t pixdcgmHandle,
-                                                       ixdcgmGpuGrp_t groupId,
-                                                       ixdcgmFieldGrp_t fieldGroupId,
-                                                       long long updateFreq,
-                                                       double maxKeepAge,
-                                                       int maxKeepSamples);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUnwatchFields(ixdcgmHandle_t pixdcgmHandle,
-                                                         ixdcgmGpuGrp_t groupId,
-                                                         ixdcgmFieldGrp_t fieldGroupId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusCreate(ixdcgmStatus_t *statusHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusDestroy(ixdcgmStatus_t statusHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusGetCount(ixdcgmStatus_t statusHandle, unsigned int *count);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusPopError(ixdcgmStatus_t statusHandle,
-                                                          ixdcgmErrorInfo_t *pixdcgmErrorInfo);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStatusClear(ixdcgmStatus_t statusHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigGet(ixdcgmHandle_t pixdcgmHandle,
-                                                     ixdcgmGpuGrp_t groupId,
-                                                     ixdcgmConfigType_t type,
-                                                     int count,
-                                                     ixdcgmConfig_t deviceConfigList[],
-                                                     ixdcgmStatus_t statusHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigSet(ixdcgmHandle_t pixdcgmHandle,
-                                                     ixdcgmGpuGrp_t groupId,
-                                                     ixdcgmConfig_t *pDeviceConfig,
-                                                     ixdcgmStatus_t statusHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmConfigEnforce(ixdcgmHandle_t pixdcgmHandle,
-                                                         ixdcgmGpuGrp_t groupId,
-                                                         ixdcgmStatus_t statusHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetValuesSince_v2(ixdcgmHandle_t pixdcgmHandle,
-                                                             ixdcgmGpuGrp_t groupId,
-                                                             ixdcgmFieldGrp_t fieldGroupId,
-                                                             long long sinceTimestamp,
-                                                             long long *nextSinceTimestamp,
-                                                             ixdcgmFieldValueEntityEnumeration_f enumCB,
-                                                             void *userData);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValues_v2(ixdcgmHandle_t pDcgmHandle,
-                                                              ixdcgmGpuGrp_t groupId,
-                                                              ixdcgmFieldGrp_t fieldGroupId,
-                                                              ixdcgmFieldValueEntityEnumeration_f enumCB,
-                                                              void *userData);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmStopEmbedded(ixdcgmHandle_t pixdcgmHandle);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetFieldSummary(ixdcgmHandle_t pixdcgmHandle,
-                                                           ixdcgmFieldSummaryRequest_t *request);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmShutdown(void);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleIdToName(ixdcgmModuleId_t id, char const **name);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLatestValuesForFields(ixdcgmHandle_t pixdcgmHandle,
-                                                                    int gpuId,
-                                                                    unsigned short fields[],
-                                                                    unsigned int count,
-                                                                    ixdcgmFieldValue_v1 values[]);
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmUpdateAllFields(ixdcgmHandle_t pixdcgmHandle, int waitForUpdate);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHostengineSetLoggingSeverity(ixdcgmHandle_t pixdcgmHandle,
-                                                                        ixdcgmSettingsSetLoggingSeverity_t *logging);
-
+    /**
+     * Reports whether two GPUs are on the same board.
+     * @param pixdcgmHandle    IN: IxDCGM Handle
+     * @param gpuId1           IN: GPU1 Id
+     * @param gpuId2           IN: GPU2 Id
+     * @param onSameBoard      IN/OUT: Same-board result for the GPU pair: 0 = not on the same board; 1 = on the same board
+     *
+     * @return
+     *        - \ref IXDCGM_RET_OK                   if the call was successful.
+     *        - \ref IXDCGM_RET_BADPARAM             if gpuId1, gpuId2 or onSameBoard were not valid.
+     */
     ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmDeviceOnSameBoard(ixdcgmHandle_t pixdcgmHandle,
                                                              unsigned int gpuId1,
                                                              unsigned int gpuId2,
                                                              int *onSameBoard);
 
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleGetStatuses(ixdcgmHandle_t pixdcgmHandle,
-                                                             ixdcgmModuleGetStatuses_t *moduleStatuses);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmModuleDenylist(ixdcgmHandle_t pixdcgmHandle, ixdcgmModuleId_t moduleId);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceTopology(ixdcgmHandle_t pixdcgmHandle,
-                                                             unsigned int gpuId,
-                                                             ixdcgmDeviceTopology_t *deviceTopology);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetGroupTopology(ixdcgmHandle_t pixdcgmHandle,
-                                                            ixdcgmGpuGrp_t groupId,
-                                                            ixdcgmGroupTopology_t *pixdcgmGroupTopology);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmSelectGpusByTopology(ixdcgmHandle_t pixdcgmHandle,
-                                                                uint64_t inputGpuIds,
-                                                                uint32_t numGpus,
-                                                                uint64_t *outputGpuIds,
-                                                                uint64_t hintFlags);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthGet(ixdcgmHandle_t pixdcgmHandle,
-                                                     ixdcgmGpuGrp_t groupId,
-                                                     ixdcgmHealthSystems_t *systems);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthSet_v2(ixdcgmHandle_t pixdcgmHandle, ixdcgmHealthSetParams_v2 *params);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmHealthCheck_v4(ixdcgmHandle_t pixdcgmHandle,
-                                                          ixdcgmGpuGrp_t groupId,
-                                                          ixdcgmHealthResponse_v4 *response);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetLinkStatus(ixdcgmHandle_t pixdcgmHandle, ixdcgmLinkStatus_v3 *linkStatus);
-
-    ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmProfGetSupportedMetricGroups(ixdcgmHandle_t pixdcgmHandle,
-                                                                        ixdcgmProfGetMetricGroups_t *metricGroups);
-
     /**
      * Gets all the running process info corresponding to the gpuId .
-     * @param pixdcgmHandle         IN: ixDCGM Handle
+     * @param pixdcgmHandle         IN: IxDCGM Handle
      * @param gpuId                 IN: GPU Id corresponding to which the processes info should be fetched
      * @param infoCount             IN/OUT:
      *  IN - max number of the info could be stored in to the pids and usedMemoryBytes buffer
@@ -238,9 +69,9 @@ extern "C"
      * @param usedMemoryBytes       OUT: Buffer to store returned processes used memory in byte
      *
      * @return
-     * - \ref DCGM_ST_OK                         if the call was successful.
+     * - \ref IXDCGM_RET_OK                      if the call was successful.
      * - \ref IXDCGM_RET_INSUFFICIENT_SIZE       if the infoCount input is smaller than the buffer needed.
-     * - \ref DCGM_ST_BADPARAM                   if gpuId, infoCunt, pids or usedMemoryBytes not valid.
+     * - \ref IXDCGM_RET_BADPARAM                if gpuId, infoCount, pids or usedMemoryBytes are not valid.
      **/
     ixdcgmReturn_t IXDCGM_PUBLIC_API ixdcgmGetDeviceRunningProcesses(ixdcgmHandle_t pixdcgmHandle,
                                                                      unsigned int gpuId,
@@ -248,6 +79,8 @@ extern "C"
                                                                      uint64_t *pids,
                                                                      uint64_t *usedMemoryBytes);
 
+    IXDCGM_PUBLIC_API const char *ixdcgmErrorString(ixdcgmReturn_t result);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/pkg/ixdcgm/policy.go b/pkg/ixdcgm/policy.go
new file mode 100644
index 0000000000000000000000000000000000000000..ada6c23fe7daba7cead8b4a230c4c1b3ea5be188
--- /dev/null
+++ b/pkg/ixdcgm/policy.go
@@ -0,0 +1,464 @@
+/*
+Copyright (c) 2024, NVIDIA CORPORATION.
+Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package ixdcgm
+
+/*
+#include "include/dcgm_agent.h"
+#include "include/dcgm_structs.h"
+
+// wrapper for go callback function
+extern int violationNotify(void* p);
+extern int voidCallback(void* p);
+*/
+import "C"
+import (
+	"context"
+	"encoding/binary"
+	"fmt"
+	"log"
+	"math/rand"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/creasty/defaults"
+)
+
+// PolicyConditionParams selects which policies are watched and with which thresholds. At least one policy must be enabled.
+type PolicyConditionParams struct {
+	// DbePolicyEnabled indicates whether the DbePolicy is enabled. Default is false (disabled).
+	DbePolicyEnabled bool `default:"false"`
+
+	// PCIePolicyEnabled indicates whether the PCIePolicy is enabled. Default is false (disabled).
+	PCIePolicyEnabled bool `default:"false"`
+
+	// MaxRtPgPolicyEnabled indicates whether the MaxRtPgPolicy is enabled. Default is false (disabled).
+	MaxRtPgPolicyEnabled bool `default:"false"`
+
+	// MaxRtPgPolicyThreshold specifies the maximum number of retired pages that will trigger a violation.
+	// Note that the MaxRtPgPolicyThreshold will be ignored if MaxRtPgPolicy is disabled.
+	// Default value is 10.
+	MaxRtPgPolicyThreshold uint32 `default:"10"`
+
+	// ThermalPolicyEnabled indicates whether the ThermalPolicy is enabled. Default is false (disabled).
+	ThermalPolicyEnabled bool `default:"false"`
+
+	// ThermalPolicyThreshold specifies the maximum temperature a group's GPUs can reach before triggering a violation.
+	// Note that the ThermalPolicyThreshold will be ignored if ThermalPolicy is disabled.
+	// Default value is 100 and the unit is in degrees Celsius (°C).
+	ThermalPolicyThreshold uint32 `default:"100"`
+
+	// PowerPolicyEnabled indicates whether the PowerPolicy is enabled. Default is false (disabled).
+	PowerPolicyEnabled bool `default:"false"`
+
+	// PowerPolicyThreshold specifies the maximum power a group's GPUs can reach before triggering a violation.
+	// Note that the PowerPolicyThreshold will be ignored if PowerPolicy is disabled.
+	// Default value is 250 and the unit is in watts (W).
+	PowerPolicyThreshold uint32 `default:"250"`
+}
+
+type policyCondition string // display name of a violation condition, carried in PolicyViolation.Condition
+
+const (
+	DbePolicy     = policyCondition("Double-bit ECC Error")
+	PCIePolicy    = policyCondition("PCI Error")
+	MaxRtPgPolicy = policyCondition("Max Retired Pages Limit")
+	ThermalPolicy = policyCondition("Thermal Limit")
+	PowerPolicy   = policyCondition("Power Limit")
+)
+
+type PolicyViolation struct {
+	Condition policyCondition // which policy was violated
+	Timestamp time.Time       // violation time taken from the callback payload, at one-second resolution
+	Data      interface{}     // the *PolicyCondition struct value that matches Condition
+}
+
+type policyIndex int // slot of a condition inside C.dcgmPolicy_t.parms (see setPolicy)
+
+const (
+	dbePolicyIndex policyIndex = iota
+	pciePolicyIndex
+	maxRtPgPolicyIndex
+	thermalPolicyIndex
+	powerPolicyIndex
+)
+
+type policyConditionParam struct {
+	typ   uint32 // written to parms[i].tag: 0 for bool conditions, 1 for long long thresholds
+	value uint32 // written to parms[i].val: 1 for bool conditions, otherwise the threshold
+}
+
+type DbePolicyCondition struct {
+	Location  string // display name from dbeLocation ("L1", "L2", "Device", "Register", "Texture" or "N/A")
+	NumErrors uint   // numerrors value of the callback payload
+}
+
+type PciPolicyCondition struct {
+	ReplayCounter uint // counter value of the callback payload
+}
+
+type RetiredPagesPolicyCondition struct {
+	SbePages uint // sbepages value of the callback payload
+	DbePages uint // dbepages value of the callback payload
+}
+
+type ThermalPolicyCondition struct {
+	ThermalViolation uint // thermalViolation value of the callback payload
+}
+
+type PowerPolicyCondition struct {
+	PowerViolation uint // powerViolation value of the callback payload
+}
+
+var (
+	policyChanOnce sync.Once
+	policyMapOnce  sync.Once
+
+	// callbacks maps a short policy key ("dbe", "pcie", "maxrtpg", "thermal", "power")
+	// to the channel on which the exported C callback publishes violations of that policy.
+	callbacks map[string]chan PolicyViolation
+
+	// paramMap maps each C.dcgmPolicy_t.parms index to its tag and limit;
+	// setPolicy() reads it when applying the user selected policies.
+	paramMap map[policyIndex]policyConditionParam
+
+	registerCh = make(chan struct{}) // closed by the listener goroutine once the policy is unregistered
+)
+
+// makePolicyChannels creates, exactly once, the callbacks map with one
+// size-1 buffered channel per policy key used by ViolationRegistration.
+func makePolicyChannels() {
+	policyChanOnce.Do(func() {
+		callbacks = make(map[string]chan PolicyViolation)
+		for _, key := range []string{"dbe", "pcie", "maxrtpg", "thermal", "power"} {
+			callbacks[key] = make(chan PolicyViolation, 1)
+		}
+	})
+}
+
+// makePolicyParamsMap rebuilds paramMap from params. It is deliberately not guarded by
+// a sync.Once: the map is rebuilt on every call so that each registration's thresholds
+// take effect. NOTE(review): paramMap is an unsynchronized global shared with setPolicy.
+func makePolicyParamsMap(params *PolicyConditionParams) {
+	const (
+		policyFieldTypeBool = 0
+		policyFieldTypeLong = 1
+		policyBoolValue     = 1
+	)
+
+	fresh := make(map[policyIndex]policyConditionParam)
+	fresh[dbePolicyIndex] = policyConditionParam{
+		typ:   policyFieldTypeBool,
+		value: policyBoolValue,
+	}
+
+	fresh[pciePolicyIndex] = policyConditionParam{
+		typ:   policyFieldTypeBool,
+		value: policyBoolValue,
+	}
+
+	fresh[maxRtPgPolicyIndex] = policyConditionParam{
+		typ:   policyFieldTypeLong,
+		value: params.MaxRtPgPolicyThreshold,
+	}
+
+	fresh[thermalPolicyIndex] = policyConditionParam{
+		typ:   policyFieldTypeLong,
+		value: params.ThermalPolicyThreshold,
+	}
+
+	fresh[powerPolicyIndex] = policyConditionParam{
+		typ:   policyFieldTypeLong,
+		value: params.PowerPolicyThreshold,
+	}
+	paramMap = fresh
+}
+
+// setPolicy sends one dcgmPolicy_t for groupId through dcgmPolicySet: no action,
+// isolation or validation is requested, only the given condition bits and, for
+// every index in paramList, the tag and threshold recorded in paramMap.
+func setPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t, paramList []policyIndex) (err error) {
+	var policy C.dcgmPolicy_t
+	policy.version = makeVersion1(unsafe.Sizeof(policy))
+	policy.mode = C.dcgmPolicyMode_t(C.DCGM_OPERATION_MODE_AUTO)
+	policy.action = C.DCGM_POLICY_ACTION_NONE
+	policy.isolation = C.DCGM_POLICY_ISOLATION_NONE
+	policy.validation = C.DCGM_POLICY_VALID_NONE
+	policy.condition = condition
+
+	for _, idx := range paramList {
+		entry, known := paramMap[idx]
+		if !known {
+			return fmt.Errorf("Error: Invalid Policy condition, %v does not exist", idx)
+		}
+
+		slot := &policy.parms[idx]
+		// tag records the condition type (bool or longlong).
+		slot.tag = entry.typ
+		// val is a C union, which cgo exposes as a plain byte array, so the
+		// threshold is stored by writing its little-endian bytes directly.
+		binary.LittleEndian.PutUint32(slot.val[:], entry.value)
+	}
+
+	// NOTE(review): the status handle is passed as its zero value.
+	var status C.dcgmStatus_t
+	rc := C.dcgmPolicySet(handle.handle, groupId.handle, &policy, status)
+	if err = errorString(rc); err != nil {
+		return fmt.Errorf("Error setting policies: %s", err)
+	}
+
+	log.Println("Policy successfully set.")
+
+	return
+}
+
+func validatePolicy(p *PolicyConditionParams) error {
+	if err := defaults.Set(p); err != nil { // apply the `default` struct tags via creasty/defaults
+		return err
+	}
+	if !(p.DbePolicyEnabled || p.PCIePolicyEnabled || p.MaxRtPgPolicyEnabled || p.ThermalPolicyEnabled || p.PowerPolicyEnabled) {
+		return fmt.Errorf("bad parameters: at least one policy must be enabled") // nothing would be watched
+	}
+	return nil
+}
+
+// registerPolicyForGpus creates a randomly named group holding gpuIds and registers
+// the policies on it; the group is destroyed after ctx has been cancelled.
+func registerPolicyForGpus(ctx context.Context, params *PolicyConditionParams, gpuIds ...uint) (<-chan PolicyViolation, error) {
+	groupId, err := CreateGroup(fmt.Sprintf("PolicyGroup_%d", rand.Uint64()))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create policy group, err: %v", err)
+	}
+
+	go func() {
+		<-ctx.Done()
+		// Wait for the policy to be unregistered, but for no longer than 500ms.
+		select {
+		case <-registerCh:
+		case <-time.After(500 * time.Millisecond):
+		}
+		_ = DestroyGroup(groupId)
+	}()
+
+	for _, id := range gpuIds {
+		if err := AddToGroup(groupId, id); err != nil {
+			return nil, fmt.Errorf("failed to add gpu %d to policy group, err: %v", id, err)
+		}
+	}
+	return registerPolicy(ctx, groupId, params)
+}
+
+// registerChOnce makes closing registerCh idempotent: the channel is a package
+// global, and closing it a second time (second listener exiting) would panic.
+var registerChOnce sync.Once
+
+// registerPolicy sets GPU usage and error policies and notifies in case of any violations on GPUs within a specific group.
+// A goroutine forwards violations to the returned channel (one buffer slot per
+// enabled condition) until ctx is cancelled; it then unregisters the policy,
+// closes the returned channel and closes registerCh.
+func registerPolicy(ctx context.Context, groupId GroupHandle, params *PolicyConditionParams) (<-chan PolicyViolation, error) {
+	if params == nil {
+		return nil, fmt.Errorf("PolicyConditionParams is required")
+	}
+	if err := validatePolicy(params); err != nil {
+		return nil, err
+	}
+
+	// init policy globals for internal API
+	makePolicyChannels()
+	makePolicyParamsMap(params)
+
+	// Collect the parms index and the condition bit of every enabled policy.
+	var paramKeys []policyIndex
+	var condition C.dcgmPolicyCondition_t
+	if params.DbePolicyEnabled {
+		paramKeys = append(paramKeys, dbePolicyIndex)
+		condition |= C.DCGM_POLICY_COND_DBE
+	}
+	if params.PCIePolicyEnabled {
+		paramKeys = append(paramKeys, pciePolicyIndex)
+		condition |= C.DCGM_POLICY_COND_PCI
+	}
+	if params.MaxRtPgPolicyEnabled {
+		paramKeys = append(paramKeys, maxRtPgPolicyIndex)
+		condition |= C.DCGM_POLICY_COND_MAX_PAGES_RETIRED
+	}
+	if params.ThermalPolicyEnabled {
+		paramKeys = append(paramKeys, thermalPolicyIndex)
+		condition |= C.DCGM_POLICY_COND_THERMAL
+	}
+	if params.PowerPolicyEnabled {
+		paramKeys = append(paramKeys, powerPolicyIndex)
+		condition |= C.DCGM_POLICY_COND_POWER
+	}
+
+	if err := setPolicy(groupId, condition, paramKeys); err != nil {
+		return nil, err
+	}
+
+	result := C.dcgmPolicyRegister(handle.handle, groupId.handle,
+		C.dcgmPolicyCondition_t(condition),
+		C.fpRecvUpdates(C.violationNotify),
+		C.fpRecvUpdates(C.voidCallback),
+	)
+	if err := errorString(result); err != nil {
+		return nil, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
+	}
+	log.Println("Listening for violations...")
+
+	violation := make(chan PolicyViolation, len(paramKeys))
+
+	go func() {
+		defer func() {
+			log.Println("unregister policy violation...")
+			unregisterPolicy(groupId, condition)
+			close(violation)
+			registerChOnce.Do(func() { close(registerCh) })
+		}()
+		for {
+			var v PolicyViolation
+			select {
+			case v = <-callbacks["dbe"]:
+			case v = <-callbacks["pcie"]:
+			case v = <-callbacks["maxrtpg"]:
+			case v = <-callbacks["thermal"]:
+			case v = <-callbacks["power"]:
+			case <-ctx.Done():
+				return
+			}
+			// Forward without ignoring cancellation: a consumer that stopped
+			// reading must not keep this goroutine (and the cleanup) blocked.
+			select {
+			case violation <- v:
+			case <-ctx.Done():
+				return
+			}
+		}
+	}()
+
+	return violation, nil
+}
+
+// unregisterPolicy unregisters condition for groupId; a failure is only logged.
+func unregisterPolicy(groupId GroupHandle, condition C.dcgmPolicyCondition_t) {
+	err := errorString(C.dcgmPolicyUnregister(handle.handle, groupId.handle, condition))
+	if err != nil {
+		log.Println(fmt.Errorf("error unregistering policy: %s", err))
+	}
+}
+
+// createTimeStamp converts t to a time.Time with one-second resolution; t is
+// divided by 1,000,000, i.e. it is treated as microseconds since the Unix epoch.
+func createTimeStamp(t C.longlong) time.Time {
+	return time.Unix(int64(t)/1000000, 0)
+}
+
+// dbeLocation maps the numeric location code of a double-bit ECC violation
+// to its display name; codes outside 0..4 yield "N/A".
+func dbeLocation(location int) string {
+	names := [...]string{
+		"L1",
+		"L2",
+		"Device",
+		"Register",
+		"Texture",
+	}
+	if location < 0 || location >= len(names) {
+		return "N/A"
+	}
+	return names[location]
+}
+
+// VoidCallback is a go callback function for dcgmPolicyRegister() wrapped in C.voidCallback(); it ignores data and always returns 0.
+//
+//export VoidCallback
+func VoidCallback(data unsafe.Pointer) int {
+	return 0
+}
+
+// ViolationRegistration is a go callback function for dcgmPolicyRegister() wrapped in C.violationNotify(); it decodes the C response into a PolicyViolation, sends it on the matching callbacks channel and returns 0.
+//
+//export ViolationRegistration
+func ViolationRegistration(data unsafe.Pointer) int {
+	var con policyCondition
+	var timestamp time.Time
+	var val interface{}
+
+	response := *(*C.dcgmPolicyCallbackResponse_t)(unsafe.Pointer(data)) // copy the C response before decoding it
+
+	switch response.condition {
+	case C.DCGM_POLICY_COND_DBE:
+		dbe := (*C.dcgmPolicyConditionDbe_t)(unsafe.Pointer(&response.val)) // view the val union as the DBE payload
+		con = DbePolicy
+		timestamp = createTimeStamp(dbe.timestamp)
+		val = DbePolicyCondition{
+			Location:  dbeLocation(int(dbe.location)),
+			NumErrors: *uintPtr(dbe.numerrors),
+		}
+	case C.DCGM_POLICY_COND_PCI:
+		pci := (*C.dcgmPolicyConditionPci_t)(unsafe.Pointer(&response.val)) // view the val union as the PCI payload
+		con = PCIePolicy
+		timestamp = createTimeStamp(pci.timestamp)
+		val = PciPolicyCondition{
+			ReplayCounter: *uintPtr(pci.counter),
+		}
+	case C.DCGM_POLICY_COND_MAX_PAGES_RETIRED:
+		mpr := (*C.dcgmPolicyConditionMpr_t)(unsafe.Pointer(&response.val)) // view the val union as the retired-pages payload
+		con = MaxRtPgPolicy
+		timestamp = createTimeStamp(mpr.timestamp)
+		val = RetiredPagesPolicyCondition{
+			SbePages: *uintPtr(mpr.sbepages),
+			DbePages: *uintPtr(mpr.dbepages),
+		}
+	case C.DCGM_POLICY_COND_THERMAL:
+		thermal := (*C.dcgmPolicyConditionThermal_t)(unsafe.Pointer(&response.val)) // view the val union as the thermal payload
+		con = ThermalPolicy
+		timestamp = createTimeStamp(thermal.timestamp)
+		val = ThermalPolicyCondition{
+			ThermalViolation: *uintPtr(thermal.thermalViolation),
+		}
+	case C.DCGM_POLICY_COND_POWER:
+		pwr := (*C.dcgmPolicyConditionPower_t)(unsafe.Pointer(&response.val)) // view the val union as the power payload
+		con = PowerPolicy
+		timestamp = createTimeStamp(pwr.timestamp)
+		val = PowerPolicyCondition{
+			PowerViolation: *uintPtr(pwr.powerViolation),
+		}
+	}
+
+	err := PolicyViolation{ // despite its name this is the violation record, not an error value
+		Condition: con,
+		Timestamp: timestamp,
+		Data:      val,
+	}
+
+	switch con { // con stays empty for an unrecognized condition, in which case nothing is sent
+	case DbePolicy:
+		callbacks["dbe"] <- err // blocks while the size-1 channel buffer is full
+	case PCIePolicy:
+		callbacks["pcie"] <- err
+	case MaxRtPgPolicy:
+		callbacks["maxrtpg"] <- err
+	case ThermalPolicy:
+		callbacks["thermal"] <- err
+	case PowerPolicy:
+		callbacks["power"] <- err
+	}
+	return 0
+}
diff --git a/pkg/ixdcgm/process_info.go b/pkg/ixdcgm/process_info.go
index 74955a883a8dc20725f493914fb92f4d31525b8a..8fe9f231baaae01344c29310bbeef26c43a73e31 100644
--- a/pkg/ixdcgm/process_info.go
+++ b/pkg/ixdcgm/process_info.go
@@ -59,14 +59,13 @@ func ixdcgmGetDeviceRunningProcesses(gpuId uint) (cnt C.uint32_t, pids []C.uint6
 		usedMemoryBytes = make([]C.uint64_t, cnt)
 		ret := C.ixdcgmGetDeviceRunningProcesses(C.ulong(handle.handle), C.uint(gpuId), &cnt, &pids[0], &usedMemoryBytes[0])
 		if ret == C.IXDCGM_RET_OK {
-			// fmt.Printf("the number of valid pids/usedMemoryBytes info is %d\n", uint32(cnt))
 			err = nil
 			return
 		} else if ret == C.IXDCGM_RET_INSUFFICIENT_SIZE {
-			// fmt.Printf("INSUFFICIENT_SIZE Warnnig: the needed buffer size is %d\n", uint32(cnt))
+			fmt.Printf("INSUFFICIENT_SIZE Warnnig: the needed buffer size is %d\n", uint32(cnt))
 			continue
-		} else if ret == C.IXDCGM_RET_BADPARAM {
-			err = fmt.Errorf("bad parameter")
+		} else {
+			err = ixdcgmErrorString(ret)
 			return
 		}
 	}
diff --git a/pkg/ixdcgm/standalone.go b/pkg/ixdcgm/standalone.go
index 7f1163d55bb6ef01b256aa51aed799a48553375e..76f30856449ee8c9b3de92b72a115cab4f38a173 100644
--- a/pkg/ixdcgm/standalone.go
+++ b/pkg/ixdcgm/standalone.go
@@ -49,7 +49,7 @@ func (s *standalone) Shutdown() error {
 }
 
 func (s *standalone) Start(args ...string) (DcgmHandle, error) {
-	fmt.Printf("standalone mode: %v\n", args)
+	fmt.Println("Start ixdcgm based on Standalone mode.")
 
 	if len(args) < 2 {
 		return DcgmHandle{}, fmt.Errorf("missing dcgm address or port")
diff --git a/pkg/ixdcgm/startHostengine.go b/pkg/ixdcgm/startHostengine.go
index 9c3dd923100617f1c7b9c4b877bb34ae05bfad0a..0a46990c30e413b4f401156ce3f4fc96b1af941f 100644
--- a/pkg/ixdcgm/startHostengine.go
+++ b/pkg/ixdcgm/startHostengine.go
@@ -28,6 +28,6 @@ func (s *startHostengine) Shutdown() (err error) {
 }
 
 func (s startHostengine) Start(args ...string) (DcgmHandle, error) {
-	fmt.Println("hello startHostengine mode")
+	fmt.Println("Start ixdcgm based on StartHostengine mode.")
 	return DcgmHandle{}, nil
 }
diff --git a/pkg/ixdcgm/topology.go b/pkg/ixdcgm/topology.go
index 47ed56fdb44ce88c660136260441a8a5bee994c9..85d5b19120b40636887299ab7c84dc3ab75c5cdb 100644
--- a/pkg/ixdcgm/topology.go
+++ b/pkg/ixdcgm/topology.go
@@ -1,12 +1,13 @@
 /*
+Copyright (c) 2024, NVIDIA CORPORATION.
 Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
 All Rights Reserved.
 
-Licensed under the Apache License, Version 2.0 (the "License"); you may
-not use this file except in compliance with the License. You may obtain
-a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -27,18 +28,128 @@ import (
 	"unsafe"
 )
 
-func getDeviceTopology(deviceId uint) (err error) {
+type P2PLinkType uint // kind of path between two GPUs; PCIPaths gives the short label
+
+const (
+	P2PLinkUnknown P2PLinkType = iota // N/A
+
+	P2PLinkCrossCPU     // SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
+	P2PLinkSameCPU      // NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
+	P2PLinkHostBridge   // PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
+	P2PLinkMultiSwitch  // PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
+	P2PLinkSingleSwitch // PIX  = Connection traversing at most a single PCIe bridge
+	P2PLinkSameBoard    // INTE = Connection traversing at most a single on-board PCIe bridge
+	P2PLinkIXLINK1      // IX1  = Connection traversing a single IXLink
+	P2PLinkIXLINK2      // IX2  = Connection traversing two IXLinks
+	P2PLinkIXLINK3      // IX3  = Connection traversing three IXLinks
+	P2PLinkIXLINK4      // IX4  = Connection traversing four IXLinks
+	P2PLinkIXLINK5      // IX5  = Connection traversing five IXLinks
+	P2PLinkIXLINK6      // IX6  = Connection traversing six IXLinks
+)
+
+func (l P2PLinkType) PCIPaths() string {
+	switch l { // each case returns the short label documented next to the constant
+	case P2PLinkSameBoard:
+		return "INTE"
+	case P2PLinkSingleSwitch:
+		return "PIX"
+	case P2PLinkMultiSwitch:
+		return "PXB"
+	case P2PLinkHostBridge:
+		return "PHB"
+	case P2PLinkSameCPU:
+		return "NODE"
+	case P2PLinkCrossCPU:
+		return "SYS"
+	case P2PLinkIXLINK1:
+		return "IX1"
+	case P2PLinkIXLINK2:
+		return "IX2"
+	case P2PLinkIXLINK3:
+		return "IX3"
+	case P2PLinkIXLINK4:
+		return "IX4"
+	case P2PLinkIXLINK5:
+		return "IX5"
+	case P2PLinkIXLINK6:
+		return "IX6"
+	case P2PLinkUnknown: // no label of its own: leaves the switch and returns "N/A" below
+	}
+	return "N/A" // also reached for any value that is not a declared constant
+}
+
+type P2PLink struct {
+	GPU   uint        // id of the peer GPU taken from the topology entry
+	BusID string      // PCI bus id string filled in by getDeviceTopology
+	Link  P2PLinkType // path type reported for the peer GPU
+}
+
+func getP2PLink(path uint) P2PLinkType {
+	switch path { // translate the C topology level into the Go enum
+	case C.DCGM_TOPOLOGY_BOARD:
+		return P2PLinkSameBoard
+	case C.DCGM_TOPOLOGY_SINGLE:
+		return P2PLinkSingleSwitch
+	case C.DCGM_TOPOLOGY_MULTIPLE:
+		return P2PLinkMultiSwitch
+	case C.DCGM_TOPOLOGY_HOSTBRIDGE:
+		return P2PLinkHostBridge
+	case C.DCGM_TOPOLOGY_CPU:
+		return P2PLinkSameCPU
+	case C.DCGM_TOPOLOGY_SYSTEM:
+		return P2PLinkCrossCPU
+	case C.DCGM_TOPOLOGY_NVLINK1: // the C NVLINK levels are surfaced as IXLINK levels
+		return P2PLinkIXLINK1
+	case C.DCGM_TOPOLOGY_NVLINK2:
+		return P2PLinkIXLINK2
+	case C.DCGM_TOPOLOGY_NVLINK3:
+		return P2PLinkIXLINK3
+	case C.DCGM_TOPOLOGY_NVLINK4:
+		return P2PLinkIXLINK4
+	case C.DCGM_TOPOLOGY_NVLINK5:
+		return P2PLinkIXLINK5
+	case C.DCGM_TOPOLOGY_NVLINK6:
+		return P2PLinkIXLINK6
+	}
+	return P2PLinkUnknown // any other path value
+}
+
+// getBusid returns the PCI bus id string found in the device attributes of gpuid.
+func getBusid(gpuid uint) (string, error) {
+	var attrs C.dcgmDeviceAttributes_v3
+	attrs.version = makeVersion3(unsafe.Sizeof(attrs))
+
+	if err := errorString(C.dcgmGetDeviceAttributes(handle.handle, C.uint(gpuid), &attrs)); err != nil {
+		return "", fmt.Errorf("Error getting device busid: %s", err)
+	}
+	return C.GoString(&attrs.identifiers.pciBusId[0]), nil
+}
+
+// getDeviceTopology returns one P2PLink per entry of the topology reported for
+// gpuid. Each link carries the id and PCI bus id of the peer GPU and the path
+// type towards it; any failure yields a nil slice and the error.
+func getDeviceTopology(gpuid uint) (links []P2PLink, err error) {
 	var topology C.dcgmDeviceTopology_v1
 	topology.version = makeVersion1(unsafe.Sizeof(topology))
 
-	res := C.dcgmGetDeviceTopology(handle.handle, C.uint(deviceId), &topology)
-	if res == C.DCGM_ST_NOT_SUPPORTED {
-		fmt.Println("not supported")
-		return nil
+	result := C.dcgmGetDeviceTopology(handle.handle, C.uint(gpuid), &topology)
+	if result == C.DCGM_ST_NOT_SUPPORTED {
+		return links, fmt.Errorf("DcgmGetDeviceTopology is not supported")
+	}
+	if result != C.DCGM_ST_OK {
+		return links, &DcgmError{msg: C.GoString(C.errorString(result)), Code: result}
 	}
-	if res != C.DCGM_ST_OK {
-		return fmt.Errorf("error getting device topology %s", C.GoString(C.errorString(res)))
+
+	for i := uint(0); i < uint(topology.numGpus); i++ {
+		peer := uint(topology.gpuPaths[i].gpuId)
+		// The bus id belongs to the peer, so it is looked up per entry.
+		busid, busErr := getBusid(peer)
+		if busErr != nil {
+			return nil, busErr
+		}
+		link := getP2PLink(uint(topology.gpuPaths[i].path))
+		links = append(links, P2PLink{GPU: peer, BusID: busid, Link: link})
 	}
 
-	return nil
+	return links, nil
 }
diff --git a/pkg/ixdcgm/utils.go b/pkg/ixdcgm/utils.go
index 7ab4650210f05a31fc78547707ebb2d3087ca7e9..2a19c44821e4a7c479ba3ca897f93a91574b9ef7 100644
--- a/pkg/ixdcgm/utils.go
+++ b/pkg/ixdcgm/utils.go
@@ -30,9 +30,33 @@ package ixdcgm
 import "C"
 import (
 	"fmt"
+	"strconv"
+	"strings"
 	"unsafe"
 )
 
+const (
+	dcgmInt32Blank = 0x7ffffff0         // 2147483632
+	dcgmInt64Blank = 0x7ffffffffffffff0 // 9223372036854775792
+)
+
+func uintPtr(c C.uint) *uint {
+	i := uint(c) // convert to a Go uint; the returned pointer refers to this local copy
+	return &i
+}
+
+func stringPtr(c *C.char) *string {
+	s := C.GoString(c) // copy the C string into a Go string; the returned pointer refers to this copy
+	return &s
+}
+
+type DcgmError struct {
+	msg  string         // description of error
+	Code C.dcgmReturn_t // dcgmReturn_t value of error
+}
+
+func (e *DcgmError) Error() string { return e.msg } // satisfies the error interface with the stored description
+
 func makeVersion1(struct_type uintptr) C.uint {
 	version := C.uint(struct_type | 1<<24)
 	return version
@@ -48,6 +72,16 @@ func makeVersion3(struct_type uintptr) C.uint {
 	return version
 }
 
+// makeVersion4 ORs the version number 4, shifted left by 24 bits, into struct_type.
+func makeVersion4(struct_type uintptr) C.uint {
+	return C.uint(struct_type | 4<<24)
+}
+
+// makeVersion5 ORs the version number 5, shifted left by 24 bits, into struct_type.
+func makeVersion5(struct_type uintptr) C.uint {
+	return C.uint(struct_type | 5<<24)
+}
+
 func errorString(result C.dcgmReturn_t) error {
 	if result == C.DCGM_ST_OK {
 		return nil
@@ -87,3 +121,61 @@ func removeBytesSpaces(originalBytes []byte) string {
 
 	return string(cleanedBytes)
 }
+
+// convertBitsetStr converts a brace-wrapped, comma-separated set of integers into a
+// compact range representation; order does not matter and duplicates are collapsed.
+// input sample: "{0,1,2,3,6,10,11,12,13}"
+// output sample: "0-3,6,10-13"
+//
+// An empty set ("" or "{}") yields "". If any element is not an integer the input
+// is returned unchanged instead of panicking, so a malformed value can never
+// crash the caller.
+func convertBitsetStr(input string) (output string) {
+	trimmed := strings.TrimSpace(strings.Trim(strings.TrimSpace(input), "{}"))
+	if trimmed == "" {
+		return ""
+	}
+
+	numStrs := strings.Split(trimmed, ",")
+	nums := make([]int, 0, len(numStrs))
+	for _, numStr := range numStrs {
+		num, err := strconv.Atoi(strings.TrimSpace(numStr))
+		if err != nil {
+			// Best effort: hand the raw text back rather than abort the process.
+			return input
+		}
+		nums = append(nums, num)
+	}
+
+	// Insertion sort: the input is normally already ordered, which makes this a
+	// single linear pass, and it needs no import beyond what the file has.
+	for i := 1; i < len(nums); i++ {
+		for j := i; j > 0 && nums[j-1] > nums[j]; j-- {
+			nums[j-1], nums[j] = nums[j], nums[j-1]
+		}
+	}
+
+	// formatRange renders one closed interval, collapsing single-element ranges.
+	formatRange := func(start, end int) string {
+		if start == end {
+			return strconv.Itoa(start)
+		}
+		return fmt.Sprintf("%d-%d", start, end)
+	}
+
+	// Walk the sorted values, extending the current run while they stay contiguous.
+	result := make([]string, 0, len(nums))
+	start, end := nums[0], nums[0]
+	for _, num := range nums[1:] {
+		switch num {
+		case end: // duplicate value, already covered by the current run
+		case end + 1:
+			end = num
+		default:
+			result = append(result, formatRange(start, end))
+			start, end = num, num
+		}
+	}
+	result = append(result, formatRange(start, end))
+	return strings.Join(result, ",")
+}
diff --git a/samples/devicecommon/main.go b/samples/devicecommon/main.go
index e758b53734413c9c838b9679ade725fca5c934a5..f4c9651a3f543e9b96ebfef5e95c9bce2738ceb5 100644
--- a/samples/devicecommon/main.go
+++ b/samples/devicecommon/main.go
@@ -30,7 +30,7 @@ var (
 )
 
 func main() {
-	// choose ixdcgm hostengine running mode
+	// Choose ixdcgm hostengine running mode
 	// 1. ixdcgm.Embedded
 	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
 	// 3. ixdcgm.StartHostengine
diff --git a/samples/deviceinfo/main.go b/samples/deviceinfo/main.go
index a1ededd2fa994289d5c6b992414106ce08139a51..791b124929cfcb92cffa57a9bd9d1a5f9be981d0 100644
--- a/samples/deviceinfo/main.go
+++ b/samples/deviceinfo/main.go
@@ -18,7 +18,6 @@ limitations under the License.
 package main
 
 import (
-	"flag"
 	"fmt"
 	"html/template"
 	"log"
@@ -29,7 +28,7 @@ import (
 
 const (
 	deviceInfo = `Driver Version         : {{.Identifiers.DriverVersion}}
-GPUId		       : {{.GPUId}}
+GPUId                  : {{.GPUId}}
 IxDCGMSupported        : {{.IxDCGMSupported}}
 Uuid                   : {{.Uuid}}
 Product Name           : {{.Identifiers.ProductName}}
@@ -41,22 +40,20 @@ Used Memory (MB):      : {{or .MemoryUsage.Used "N/A"}}
 Free Memory (MB):      : {{or .MemoryUsage.Free "N/A"}}
 Bandwidth (MB/s)       : {{or .PCI.Bandwidth "N/A"}}
 PowerLimit (W)         : {{or .PowerLimit "N/A"}}
----------------------------------------------------------------------
+CPUAffinity            : {{or .CPUAffinity "N/A"}}
+NUMAAffinity           : {{or .NUMAAffinity "N/A"}}
+P2P Available          : {{if not .Topology}}None{{else}}{{range .Topology}}
+    GPU{{.GPU}} - (BusID){{.BusID}} - {{.Link.PCIPaths}}{{end}}{{end}}
+--------------------------------------------------
 `
 )
 
-var (
-	connectAddr = flag.String("connectAddr", "0.0.0.0:5777", "DCGM connect address")
-	isSocket    = flag.String("socket", "0", "Connect to Unix socket")
-)
-
 func main() {
-	// choose ixdcgm hostengine running mode
+	// Choose ixdcgm hostengine running mode
 	// 1. ixdcgm.Embedded
 	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
 	// 3. ixdcgm.StartHostengine
-	flag.Parse()
-	cleanup, err := ixdcgm.Init(ixdcgm.Standalone, *connectAddr, *isSocket)
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
 	if err != nil {
 		panic(err)
 	}
@@ -76,7 +73,7 @@ func main() {
 		}
 
 		if err = t.Execute(os.Stdout, d); err != nil {
-			log.Panicln("Template error:", err)
+			log.Panicln("Template error: ", err)
 		}
 	}
 }
diff --git a/samples/deviceprocessinfo/main.go b/samples/deviceprocessinfo/main.go
index f16b1bad8ef23f1d296eaf40cd29589cbbf45e47..01fda9b13b6e38250f898f853b11d15be3cb4fb8 100644
--- a/samples/deviceprocessinfo/main.go
+++ b/samples/deviceprocessinfo/main.go
@@ -20,18 +20,16 @@ package main
 import (
 	"fmt"
 	"log"
-	"os"
-	"os/signal"
-	"syscall"
 
 	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
 )
 
 func main() {
-	sigs := make(chan os.Signal, 1)
-	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
-
-	cleanup, err := ixdcgm.Init(ixdcgm.Embedded, "LogInfo")
+	// Choose ixdcgm hostengine running mode
+	// 1. ixdcgm.Embedded
+	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
+	// 3. ixdcgm.StartHostengine
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded, "LogWarn")
 	if err != nil {
 		log.Panicln(err)
 	}
@@ -49,7 +47,7 @@ func main() {
 			fmt.Printf("%v\n", err)
 		}
 		for _, info := range infos {
-			fmt.Printf("> Pid: %d, Name: %s, UsedGpuMemory(MiB): %d\n", info.Pid, info.Name, info.UsedGpuMemory)
+			fmt.Printf("> Pid: %d\n  Name: %s\n  UsedGpuMemory(MiB): %d\n", info.Pid, info.Name, info.UsedGpuMemory)
 		}
 		fmt.Println("---------------------------------------------------------------------")
 	}
diff --git a/samples/devicestatus/main.go b/samples/devicestatus/main.go
index 91dbb1e743658bba99112ad9b87d20c24f8429f4..fee63857ff04ee9cc5cef89a522fa5eccd2621a5 100644
--- a/samples/devicestatus/main.go
+++ b/samples/devicestatus/main.go
@@ -20,17 +20,15 @@ package main
 import (
 	"fmt"
 	"log"
-	"os"
-	"os/signal"
-	"syscall"
 
 	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
 )
 
 func main() {
-	sigs := make(chan os.Signal, 1)
-	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
-
+	// Choose ixdcgm hostengine running mode
+	// 1. ixdcgm.Embedded
+	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
+	// 3. ixdcgm.StartHostengine
 	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
 	if err != nil {
 		log.Panicln(err)
diff --git a/samples/health/main.go b/samples/health/main.go
new file mode 100644
index 0000000000000000000000000000000000000000..cc7802ab209762f6326944bf6907572e080fea29
--- /dev/null
+++ b/samples/health/main.go
@@ -0,0 +1,70 @@
+package main
+
+import (
+	"log"
+	"os"
+	"os/signal"
+	"syscall"
+	"text/template"
+	"time"
+
+	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
+)
+
+const (
+	HealthStatus = `GPU                : {{.GPU}}
+Status             : {{.Status}}
+{{range .Watches}}
+Type               : {{.Type}}
+Status             : {{.Status}}
+Error              : {{.Error}}
+{{end}}
+`
+)
+
+// Based on ixdcgmi health commands:
+// - Create group: ixdcgmi group -c <groupName>
+// - Enable all watches: ixdcgmi health -g GROUPID -s a
+// - Check: ixdcgmi health -g GROUPID -c
+func main() {
+	// Choose ixdcgm hostengine running mode
+	// 1. ixdcgm.Embedded
+	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
+	// 3. ixdcgm.StartHostengine
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
+	if err != nil {
+		log.Panicln(err)
+	}
+	defer cleanup()
+
+	sigs := make(chan os.Signal, 1)
+	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
+
+	gpuIds, err := ixdcgm.GetSupportedDevices()
+	if err != nil {
+		log.Panicln(err)
+	}
+
+	ticker := time.NewTicker(time.Second * 1)
+	defer ticker.Stop()
+
+	t := template.Must(template.New("HealthStatus").Parse(HealthStatus))
+	// Check every supported GPU once per tick (1s) until SIGINT or SIGTERM arrives.
+	for {
+		select {
+		case <-ticker.C:
+			for _, gpuId := range gpuIds {
+				h, err := ixdcgm.HealthCheckByGpuId(gpuId)
+				if err != nil {
+					log.Panicln(err)
+				}
+
+				if err = t.Execute(os.Stdout, h); err != nil {
+					log.Panicln("Template error: ", err)
+				}
+			}
+		case <-sigs:
+			return
+		}
+	}
+}
diff --git a/samples/policy/main.go b/samples/policy/main.go
new file mode 100644
index 0000000000000000000000000000000000000000..9a81a958d0b9b6774a8ffc0a4b4918c82ee2b7a0
--- /dev/null
+++ b/samples/policy/main.go
@@ -0,0 +1,75 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"os"
+	"os/signal"
+	"syscall"
+	"time"
+
+	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
+)
+
+// Based on ixdcgmi policy commands:
+// - Create group: ixdcgmi group -c <groupName>
+// - Set violation policy: ixdcgmi policy -g GROUPID --set 0,0 -e -p -T 60
+// - Register for policy updates: ixdcgmi policy -g GROUPID --reg
+func main() {
+	// Choose ixdcgm hostengine running mode
+	// 1. ixdcgm.Embedded
+	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
+	// 3. ixdcgm.StartHostengine
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
+	if err != nil {
+		log.Panicln(err)
+	}
+	defer cleanup()
+
+	ctx, done := context.WithCancel(context.Background())
+	defer done() // release the context on every return path; runs before cleanup
+	// Handle SIGINT (Ctrl+C) and SIGTERM (termination signal)
+	sigs := make(chan os.Signal, 1)
+	signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM)
+
+	go func() {
+		<-sigs
+		fmt.Println("Received termination signal, exiting...")
+		done()
+	}()
+
+	// Create policy condition parameters to register violation callback.
+	// Note: at least one policy must be enabled.
+	params := &ixdcgm.PolicyConditionParams{
+		DbePolicyEnabled:       true,
+		PCIePolicyEnabled:      true,
+		ThermalPolicyEnabled:   true,
+		ThermalPolicyThreshold: 60, // °C
+	}
+
+	// Monitor policy violations for all GPUs. To monitor particular GPUs only
+	// (e.g., gpuId0 and gpuId1), use the following call instead:
+	//   ch, err := ixdcgm.ListenForPolicyViolationsForGPUs(ctx, params, 0, 1)
+	ch, err := ixdcgm.ListenForPolicyViolationsForAllGPUs(ctx, params)
+	if err != nil {
+		fmt.Printf("Failed to monitor policy violations, err: %v\n", err)
+		return
+	}
+
+	for {
+		select {
+		case pe, ok := <-ch:
+			if !ok {
+				ch = nil // closed: a nil channel is never selected, so no zero-value violations get printed
+				continue
+			}
+			fmt.Printf("PolicyViolation : %v\nTimestamp       : %v\nData            : %v\n",
+				pe.Condition, pe.Timestamp, pe.Data)
+		case <-ctx.Done():
+			// Sleep to ensure the ixdcgm policy is unregistered before cleanup.
+			time.Sleep(1 * time.Second)
+			return
+		}
+	}
+}
diff --git a/samples/topology/main.go b/samples/topology/main.go
new file mode 100644
index 0000000000000000000000000000000000000000..df082bfc8fa2b0891b6e6dea59b62817e675bc40
--- /dev/null
+++ b/samples/topology/main.go
@@ -0,0 +1,73 @@
+package main
+
+import (
+	"fmt"
+	"log"
+	"strconv"
+
+	"gitee.com/deep-spark/go-ixdcgm/pkg/ixdcgm"
+)
+
+const (
+	// legend is the key to the link-type abbreviations; main prints it once,
+	// after the last row of the topology matrix.
+	legend = `
+Legend:
+  X    = Self
+  SYS  = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
+  NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
+  PHB  = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
+  PXB  = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
+  PIX  = Connection traversing at most a single PCIe bridge
+  INTE = Connection traversing at most a single on-board PCIe bridge
+  IX#  = Connection traversing a bonded set of # IXLinks`
+)
+
+// Based on topo commands of ixdcgmi and ixsmi.
+//
+// main prints one row per supported GPU: the link type to every other
+// supported GPU, followed by the GPU's CPU and NUMA affinity. The legend is
+// printed after the last row.
+func main() {
+	// Choose ixdcgm hostengine running mode
+	// 1. ixdcgm.Embedded
+	// 2. ixdcgm.Standalone -connect "addr", -socket "isSocket"
+	// 3. ixdcgm.StartHostengine
+	cleanup, err := ixdcgm.Init(ixdcgm.Embedded)
+	if err != nil {
+		log.Panicln(err)
+	}
+	defer cleanup()
+
+	gpus, err := ixdcgm.GetSupportedDevices()
+	if err != nil {
+		log.Panicln(err)
+	}
+
+	// Map every GPU ID to its column in the matrix. IDs are not assumed to be
+	// the contiguous range 0..len(gpus)-1, so an ID must never be used as a
+	// slice index directly.
+	column := make(map[int]int, len(gpus))
+	for idx, gpu := range gpus {
+		column[int(gpu)] = idx
+	}
+
+	// Header row.
+	fmt.Printf("%-8s", "")
+	for _, gpu := range gpus {
+		fmt.Printf("%-8s", "GPU"+strconv.Itoa(int(gpu)))
+	}
+	fmt.Printf("%-16s", "CPU Affinity")
+	fmt.Printf("%-16s\n", "NUMA Affinity")
+
+	for i, gpu := range gpus {
+		topo, err := ixdcgm.GetDeviceTopology(gpu)
+		if err != nil {
+			log.Panicln(err)
+		}
+
+		// Start from an empty row so that links reported for a previous GPU
+		// never leak into this one.
+		row := make([]string, len(gpus))
+		for _, link := range topo {
+			// Peers that are not among the supported GPUs have no column.
+			if c, ok := column[int(link.GPU)]; ok {
+				row[c] = link.Link.PCIPaths()
+			}
+		}
+		row[i] = " X "
+
+		fmt.Printf("%-8s", "GPU"+strconv.Itoa(int(gpu)))
+		for _, cell := range row {
+			fmt.Printf("%-8s", cell)
+		}
+
+		deviceInfo, err := ixdcgm.GetDeviceInfo(gpu)
+		if err != nil {
+			log.Panicln(err)
+		}
+		fmt.Printf("%-16s", deviceInfo.CPUAffinity)
+		fmt.Printf("%-16s\n", deviceInfo.NUMAAffinity)
+	}
+	fmt.Println(legend)
+}
diff --git a/vendor/github.com/creasty/defaults/.gitignore b/vendor/github.com/creasty/defaults/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..e43b0f988953ae3a84b00331d0ccf5f7d51cb3cf
--- /dev/null
+++ b/vendor/github.com/creasty/defaults/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/vendor/github.com/creasty/defaults/LICENSE b/vendor/github.com/creasty/defaults/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..1483dd2d83e77325a4306cb20075b3594a9dd79e
--- /dev/null
+++ b/vendor/github.com/creasty/defaults/LICENSE
@@ -0,0 +1,22 @@
+Copyright (c) 2017-present Yuki Iwanaga
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/vendor/github.com/creasty/defaults/Makefile b/vendor/github.com/creasty/defaults/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..404212a2e5bda82946ee80064cd6657aeb3e1a0a
--- /dev/null
+++ b/vendor/github.com/creasty/defaults/Makefile
@@ -0,0 +1,29 @@
+SHELL := /bin/bash -eu -o pipefail
+
+GO_TEST_FLAGS  := -v
+
+# Packages and Go source files of this module, excluding anything under vendor/.
+PACKAGE_DIRS := $(shell go list ./... 2> /dev/null | grep -v /vendor/)
+SRC_FILES    := $(shell find . -name '*.go' -not -path './vendor/*')
+
+
+#  Tasks
+#-----------------------------------------------
+# lint: print any gofmt diff and fail if there is one (awk exits 1 once it has
+# seen at least one line), then run golangci-lint.
+# `$$0` is written with a doubled dollar so that make passes a literal `$0`
+# to awk instead of expanding it as a make variable.
+.PHONY: lint
+lint:
+	@gofmt -e -d -s $(SRC_FILES) | awk '{ e = 1; print $$0 } END { if (e) exit(1) }'
+	@golangci-lint --disable errcheck,unused run
+
+# test: lint, then run the tests of every package.
+.PHONY: test
+test: lint
+	@go test $(GO_TEST_FLAGS) $(PACKAGE_DIRS)
+
+# ci-test: lint, then run each package with the race detector and append its
+# coverage profile to coverage.txt.
+.PHONY: ci-test
+ci-test: lint
+	@echo > coverage.txt
+	@for d in $(PACKAGE_DIRS); do \
+		go test -coverprofile=profile.out -covermode=atomic -race -v $$d; \
+		if [ -f profile.out ]; then \
+			cat profile.out >> coverage.txt; \
+			rm profile.out; \
+		fi; \
+	done
diff --git a/vendor/github.com/creasty/defaults/README.md b/vendor/github.com/creasty/defaults/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8096b6db903f4be84ce1acfc6a1e6f206fbc8c9
--- /dev/null
+++ b/vendor/github.com/creasty/defaults/README.md
@@ -0,0 +1,160 @@
+defaults
+========
+
+[![CircleCI](https://circleci.com/gh/creasty/defaults/tree/master.svg?style=svg)](https://circleci.com/gh/creasty/defaults/tree/master)
+[![codecov](https://codecov.io/gh/creasty/defaults/branch/master/graph/badge.svg)](https://codecov.io/gh/creasty/defaults)
+[![GitHub release](https://img.shields.io/github/release/creasty/defaults.svg)](https://github.com/creasty/defaults/releases)
+[![License](https://img.shields.io/github/license/creasty/defaults.svg)](./LICENSE)
+
+Initialize structs with default values
+
+- Supports almost all kinds of types
+  - Scalar types
+    - `int/8/16/32/64`, `uint/8/16/32/64`, `float32/64`
+    - `uintptr`, `bool`, `string`
+  - Complex types
+    - `map`, `slice`, `struct`
+  - Nested types
+    - `map[K1]map[K2]Struct`, `[]map[K1]Struct[]`
+  - Aliased types
+    - `time.Duration`
+    - e.g., `type Enum string`
+  - Pointer types
+    - e.g., `*SampleStruct`, `*int`
+- Recursively initializes fields in a struct
+- Dynamically sets default values by [`defaults.Setter`](./setter.go) interface
+- Preserves non-initial values from being reset with a default value
+
+
+Usage
+-----
+
+```go
+package main
+
+import (
+	"encoding/json"
+	"fmt"
+	"math/rand"
+
+	"github.com/creasty/defaults"
+)
+
+type Gender string
+
+type Sample struct {
+	Name    string `default:"John Smith"`
+	Age     int    `default:"27"`
+	Gender  Gender `default:"m"`
+	Working bool   `default:"true"`
+
+	SliceInt    []int    `default:"[1, 2, 3]"`
+	SlicePtr    []*int   `default:"[1, 2, 3]"`
+	SliceString []string `default:"[\"a\", \"b\"]"`
+
+	MapNull            map[string]int          `default:"{}"`
+	Map                map[string]int          `default:"{\"key1\": 123}"`
+	MapOfStruct        map[string]OtherStruct  `default:"{\"Key2\": {\"Foo\":123}}"`
+	MapOfPtrStruct     map[string]*OtherStruct `default:"{\"Key3\": {\"Foo\":123}}"`
+	MapOfStructWithTag map[string]OtherStruct  `default:"{\"Key4\": {\"Foo\":123}}"`
+
+	Struct    OtherStruct  `default:"{\"Foo\": 123}"`
+	StructPtr *OtherStruct `default:"{\"Foo\": 123}"`
+
+	NoTag    OtherStruct // Recurses into a nested struct by default
+	NoOption OtherStruct `default:"-"` // no option
+}
+
+type OtherStruct struct {
+	Hello  string `default:"world"` // Tags in a nested struct also work
+	Foo    int    `default:"-"`
+	Random int    `default:"-"`
+}
+
+// SetDefaults implements defaults.Setter interface
+func (s *OtherStruct) SetDefaults() {
+	if defaults.CanUpdate(s.Random) { // Check if it's a zero value (recommended)
+		s.Random = rand.Int() // Set a dynamic value
+	}
+}
+
+func main() {
+	obj := &Sample{}
+	if err := defaults.Set(obj); err != nil {
+		panic(err)
+	}
+
+	out, err := json.MarshalIndent(obj, "", "	")
+	if err != nil {
+		panic(err)
+	}
+	fmt.Println(string(out))
+
+	// Output:
+	// {
+	// 	"Name": "John Smith",
+	// 	"Age": 27,
+	// 	"Gender": "m",
+	// 	"Working": true,
+	// 	"SliceInt": [
+	// 		1,
+	// 		2,
+	// 		3
+	// 	],
+	// 	"SlicePtr": [
+	// 		1,
+	// 		2,
+	// 		3
+	// 	],
+	// 	"SliceString": [
+	// 		"a",
+	// 		"b"
+	// 	],
+	// 	"MapNull": {},
+	// 	"Map": {
+	// 		"key1": 123
+	// 	},
+	// 	"MapOfStruct": {
+	// 		"Key2": {
+	// 			"Hello": "world",
+	// 			"Foo": 123,
+	// 			"Random": 5577006791947779410
+	// 		}
+	// 	},
+	// 	"MapOfPtrStruct": {
+	// 		"Key3": {
+	// 			"Hello": "world",
+	// 			"Foo": 123,
+	// 			"Random": 8674665223082153551
+	// 		}
+	// 	},
+	// 	"MapOfStructWithTag": {
+	// 		"Key4": {
+	// 			"Hello": "world",
+	// 			"Foo": 123,
+	// 			"Random": 6129484611666145821
+	// 		}
+	// 	},
+	// 	"Struct": {
+	// 		"Hello": "world",
+	// 		"Foo": 123,
+	// 		"Random": 4037200794235010051
+	// 	},
+	// 	"StructPtr": {
+	// 		"Hello": "world",
+	// 		"Foo": 123,
+	// 		"Random": 3916589616287113937
+	// 	},
+	// 	"NoTag": {
+	// 		"Hello": "world",
+	// 		"Foo": 0,
+	// 		"Random": 6334824724549167320
+	// 	},
+	// 	"NoOption": {
+	// 		"Hello": "",
+	// 		"Foo": 0,
+	// 		"Random": 0
+	// 	}
+	// }
+}
+```
diff --git a/vendor/github.com/creasty/defaults/defaults.go b/vendor/github.com/creasty/defaults/defaults.go
new file mode 100644
index 0000000000000000000000000000000000000000..f4539281a44cc0647dc52319f9566f0592079c5b
--- /dev/null
+++ b/vendor/github.com/creasty/defaults/defaults.go
@@ -0,0 +1,244 @@
+package defaults
+
+import (
+	"encoding"
+	"encoding/json"
+	"errors"
+	"reflect"
+	"strconv"
+	"time"
+)
+
+var (
+	// errInvalidType is returned by Set when its argument is not a pointer
+	// to a struct.
+	errInvalidType = errors.New("not a struct pointer")
+)
+
+const (
+	// fieldName is the struct tag key from which default values are read.
+	fieldName = "default"
+)
+
+// Set initializes members in a struct referenced by a pointer.
+// Maps and slices are initialized by `make` and other primitive types are set with default values.
+// `ptr` should be a non-nil struct pointer; any other argument, including a
+// nil interface or a nil pointer, yields errInvalidType rather than a panic.
+//
+// Fields tagged `default:"-"` are skipped. Once every field has been
+// processed, SetDefaults is invoked if ptr implements Setter.
+func Set(ptr interface{}) error {
+	rv := reflect.ValueOf(ptr)
+	// reflect.ValueOf(nil) has Kind Invalid, so a nil interface is rejected
+	// by the Kind check; IsNil guards against a typed nil pointer, whose
+	// Elem() would be the zero Value.
+	if rv.Kind() != reflect.Ptr || rv.IsNil() {
+		return errInvalidType
+	}
+
+	v := rv.Elem()
+	t := v.Type()
+
+	if t.Kind() != reflect.Struct {
+		return errInvalidType
+	}
+
+	for i := 0; i < t.NumField(); i++ {
+		if defaultVal := t.Field(i).Tag.Get(fieldName); defaultVal != "-" {
+			if err := setField(v.Field(i), defaultVal); err != nil {
+				return err
+			}
+		}
+	}
+	callSetter(ptr)
+	return nil
+}
+
+// MustSet behaves like Set but panics with the returned error instead of
+// handing it back to the caller.
+func MustSet(ptr interface{}) {
+	err := Set(ptr)
+	if err == nil {
+		return
+	}
+	panic(err)
+}
+
+// setField applies defaultVal to a single field and then recurses into
+// composite values (pointers, structs, slices and maps).
+//
+// Fields that cannot be set, or that shouldInitializeField rejects, are left
+// untouched. A field that still holds its zero value is populated from
+// defaultVal; a field that already has a value keeps it, but its nested
+// members are still visited.
+//
+// The returned error comes from JSON decoding of a slice, map or struct
+// default, or from a nested Set/setField call. A scalar default that fails
+// to parse is ignored.
+func setField(field reflect.Value, defaultVal string) error {
+	if !field.CanSet() {
+		return nil
+	}
+
+	if !shouldInitializeField(field, defaultVal) {
+		return nil
+	}
+
+	// Only a field still at its zero value receives the tag default.
+	isInitial := isInitialValue(field)
+	if isInitial {
+		// An unmarshaler implemented by the field takes precedence over
+		// decoding by kind.
+		if unmarshalByInterface(field, defaultVal) {
+			return nil
+		}
+
+		// Scalars: when parsing fails the field simply keeps its zero value.
+		switch field.Kind() {
+		case reflect.Bool:
+			if val, err := strconv.ParseBool(defaultVal); err == nil {
+				field.Set(reflect.ValueOf(val).Convert(field.Type()))
+			}
+		case reflect.Int:
+			if val, err := strconv.ParseInt(defaultVal, 0, strconv.IntSize); err == nil {
+				field.Set(reflect.ValueOf(int(val)).Convert(field.Type()))
+			}
+		case reflect.Int8:
+			if val, err := strconv.ParseInt(defaultVal, 0, 8); err == nil {
+				field.Set(reflect.ValueOf(int8(val)).Convert(field.Type()))
+			}
+		case reflect.Int16:
+			if val, err := strconv.ParseInt(defaultVal, 0, 16); err == nil {
+				field.Set(reflect.ValueOf(int16(val)).Convert(field.Type()))
+			}
+		case reflect.Int32:
+			if val, err := strconv.ParseInt(defaultVal, 0, 32); err == nil {
+				field.Set(reflect.ValueOf(int32(val)).Convert(field.Type()))
+			}
+		case reflect.Int64:
+			// Try a duration string first, then fall back to a plain integer.
+			if val, err := time.ParseDuration(defaultVal); err == nil {
+				field.Set(reflect.ValueOf(val).Convert(field.Type()))
+			} else if val, err := strconv.ParseInt(defaultVal, 0, 64); err == nil {
+				field.Set(reflect.ValueOf(val).Convert(field.Type()))
+			}
+		case reflect.Uint:
+			if val, err := strconv.ParseUint(defaultVal, 0, strconv.IntSize); err == nil {
+				field.Set(reflect.ValueOf(uint(val)).Convert(field.Type()))
+			}
+		case reflect.Uint8:
+			if val, err := strconv.ParseUint(defaultVal, 0, 8); err == nil {
+				field.Set(reflect.ValueOf(uint8(val)).Convert(field.Type()))
+			}
+		case reflect.Uint16:
+			if val, err := strconv.ParseUint(defaultVal, 0, 16); err == nil {
+				field.Set(reflect.ValueOf(uint16(val)).Convert(field.Type()))
+			}
+		case reflect.Uint32:
+			if val, err := strconv.ParseUint(defaultVal, 0, 32); err == nil {
+				field.Set(reflect.ValueOf(uint32(val)).Convert(field.Type()))
+			}
+		case reflect.Uint64:
+			if val, err := strconv.ParseUint(defaultVal, 0, 64); err == nil {
+				field.Set(reflect.ValueOf(val).Convert(field.Type()))
+			}
+		case reflect.Uintptr:
+			if val, err := strconv.ParseUint(defaultVal, 0, strconv.IntSize); err == nil {
+				field.Set(reflect.ValueOf(uintptr(val)).Convert(field.Type()))
+			}
+		case reflect.Float32:
+			if val, err := strconv.ParseFloat(defaultVal, 32); err == nil {
+				field.Set(reflect.ValueOf(float32(val)).Convert(field.Type()))
+			}
+		case reflect.Float64:
+			if val, err := strconv.ParseFloat(defaultVal, 64); err == nil {
+				field.Set(reflect.ValueOf(val).Convert(field.Type()))
+			}
+		case reflect.String:
+			field.Set(reflect.ValueOf(defaultVal).Convert(field.Type()))
+
+		case reflect.Slice:
+			// Start from an empty, non-nil slice; decode the default as JSON
+			// unless it is empty or "[]".
+			ref := reflect.New(field.Type())
+			ref.Elem().Set(reflect.MakeSlice(field.Type(), 0, 0))
+			if defaultVal != "" && defaultVal != "[]" {
+				if err := json.Unmarshal([]byte(defaultVal), ref.Interface()); err != nil {
+					return err
+				}
+			}
+			field.Set(ref.Elem().Convert(field.Type()))
+		case reflect.Map:
+			// Start from an empty, non-nil map; decode the default as JSON
+			// unless it is empty or "{}".
+			ref := reflect.New(field.Type())
+			ref.Elem().Set(reflect.MakeMap(field.Type()))
+			if defaultVal != "" && defaultVal != "{}" {
+				if err := json.Unmarshal([]byte(defaultVal), ref.Interface()); err != nil {
+					return err
+				}
+			}
+			field.Set(ref.Elem().Convert(field.Type()))
+		case reflect.Struct:
+			if defaultVal != "" && defaultVal != "{}" {
+				if err := json.Unmarshal([]byte(defaultVal), field.Addr().Interface()); err != nil {
+					return err
+				}
+			}
+		case reflect.Ptr:
+			// Allocate the pointee; its value is filled in by the recursion
+			// in the switch below.
+			field.Set(reflect.New(field.Type().Elem()))
+		}
+	}
+
+	// Second pass: recurse into composite values, whether they were created
+	// just above or were already populated by the caller.
+	switch field.Kind() {
+	case reflect.Ptr:
+		if isInitial || field.Elem().Kind() == reflect.Struct {
+			// NOTE(review): the error returned by this recursive call is
+			// discarded — confirm whether that is intentional.
+			setField(field.Elem(), defaultVal)
+			callSetter(field.Interface())
+		}
+	case reflect.Struct:
+		if err := Set(field.Addr().Interface()); err != nil {
+			return err
+		}
+	case reflect.Slice:
+		// Elements carry no tag of their own, so they are visited with an
+		// empty default.
+		for j := 0; j < field.Len(); j++ {
+			if err := setField(field.Index(j), ""); err != nil {
+				return err
+			}
+		}
+	case reflect.Map:
+		for _, e := range field.MapKeys() {
+			var v = field.MapIndex(e)
+
+			switch v.Kind() {
+			case reflect.Ptr:
+				switch v.Elem().Kind() {
+				case reflect.Struct, reflect.Slice, reflect.Map:
+					if err := setField(v.Elem(), ""); err != nil {
+						return err
+					}
+				}
+			case reflect.Struct, reflect.Slice, reflect.Map:
+				// Copy the map value into a freshly allocated value, apply
+				// the defaults to the copy, then store the copy back under
+				// the same key.
+				ref := reflect.New(v.Type())
+				ref.Elem().Set(v)
+				if err := setField(ref.Elem(), ""); err != nil {
+					return err
+				}
+				field.SetMapIndex(e, ref.Elem().Convert(v.Type()))
+			}
+		}
+	}
+
+	return nil
+}
+
+// unmarshalByInterface tries to decode defaultVal through an unmarshaler
+// implemented by the field's pointer type, before any decoding by kind.
+// encoding.TextUnmarshaler is tried first (skipped for an empty default),
+// then json.Unmarshaler (skipped for "", "{}" and "[]").
+// It reports true only when one of them accepted the value without error.
+func unmarshalByInterface(field reflect.Value, defaultVal string) bool {
+	target := field.Addr().Interface()
+
+	if defaultVal == "" {
+		// Neither unmarshaler is consulted for an empty default.
+		return false
+	}
+
+	if textDec, isText := target.(encoding.TextUnmarshaler); isText {
+		if textDec.UnmarshalText([]byte(defaultVal)) == nil {
+			return true
+		}
+	}
+
+	switch defaultVal {
+	case "{}", "[]":
+		// Empty JSON containers are left to the kind-based decoding.
+		return false
+	}
+
+	if jsonDec, isJSON := target.(json.Unmarshaler); isJSON {
+		return jsonDec.UnmarshalJSON([]byte(defaultVal)) == nil
+	}
+	return false
+}
+
+// isInitialValue reports whether field is deeply equal to the zero value of
+// its own type.
+func isInitialValue(field reflect.Value) bool {
+	zero := reflect.Zero(field.Type()).Interface()
+	current := field.Interface()
+	return reflect.DeepEqual(zero, current)
+}
+
+// shouldInitializeField decides whether setField has anything to do for
+// field. Structs always qualify; a non-nil pointer to a struct, or a
+// non-empty slice or map, qualifies even without a tag; everything else
+// qualifies only when tag is non-empty.
+func shouldInitializeField(field reflect.Value, tag string) bool {
+	hasTag := tag != ""
+
+	switch field.Kind() {
+	case reflect.Struct:
+		return true
+	case reflect.Slice, reflect.Map:
+		return hasTag || field.Len() > 0
+	case reflect.Ptr:
+		pointsToStruct := !field.IsNil() && field.Elem().Kind() == reflect.Struct
+		return pointsToStruct || hasTag
+	default:
+		return hasTag
+	}
+}
+
+// CanUpdate reports whether v still equals the zero value of its type.
+func CanUpdate(v interface{}) bool {
+	current := reflect.ValueOf(v)
+	return isInitialValue(current)
+}
diff --git a/vendor/github.com/creasty/defaults/setter.go b/vendor/github.com/creasty/defaults/setter.go
new file mode 100644
index 0000000000000000000000000000000000000000..1f64aa65993be7fccf57beff3569ea8b7bff5367
--- /dev/null
+++ b/vendor/github.com/creasty/defaults/setter.go
@@ -0,0 +1,12 @@
+package defaults
+
+// Setter is an interface for setting default values
+type Setter interface {
+	// SetDefaults is invoked by callSetter on any value that implements
+	// Setter.
+	SetDefaults()
+}
+
+// callSetter invokes SetDefaults on v when v implements Setter and does
+// nothing otherwise.
+func callSetter(v interface{}) {
+	setter, implemented := v.(Setter)
+	if !implemented {
+		return
+	}
+	setter.SetDefaults()
+}
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 0cd694adf1fb0943a2c6746ffb7a270a2f09f0d1..e3036d188bd8e7906fd224996953659386467926 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -1,3 +1,6 @@
 # github.com/bits-and-blooms/bitset v1.13.0
 ## explicit; go 1.16
 github.com/bits-and-blooms/bitset
+# github.com/creasty/defaults v1.8.0
+## explicit; go 1.14
+github.com/creasty/defaults