diff --git a/pkg/ixdcgm/device_info.go b/pkg/ixdcgm/device_info.go index 6687a52c2f206e57bcc7794c57105667fcba657d..ccbc5f61982cd8aa7b1ca3327418a68504cbf2d5 100644 --- a/pkg/ixdcgm/device_info.go +++ b/pkg/ixdcgm/device_info.go @@ -24,6 +24,7 @@ package ixdcgm import "C" import ( "fmt" + "math/rand" "unsafe" "github.com/bits-and-blooms/bitset" @@ -247,14 +248,14 @@ func getAffinity(gpuId uint, typ string) (result string, err error) { return "N/A", fmt.Errorf("not supported affinity type: %s", typ) } - fieldGrpName := fmt.Sprintf("%sAffFields%d", typ, gpuId) + fieldGrpName := fmt.Sprintf("%sAffFields%d", typ, rand.Uint64()) fieldGrpHdl, err := FieldGroupCreate(fieldGrpName, affFields) if err != nil { return "N/A", err } defer FieldGroupDestroy(fieldGrpHdl) - gpuGrpName := fmt.Sprintf("%sAff%d", typ, gpuId) + gpuGrpName := fmt.Sprintf("%sAff%d", typ, rand.Uint64()) gpuGrpHdl, err := WatchFields([]uint{gpuId}, fieldGrpHdl, gpuGrpName) if err != nil { return "N/A", err diff --git a/pkg/ixdcgm/fields.go b/pkg/ixdcgm/fields.go index 4773acbfc9f56a3f98b42c8ed83fe54f202cfe47..5a2db4bbe0930f56b58a8f586749021c36cf8503 100644 --- a/pkg/ixdcgm/fields.go +++ b/pkg/ixdcgm/fields.go @@ -85,11 +85,34 @@ func WatchFields(gpuIds []uint, fieldGrp FieldGrpHandle, groupName string) (Grou cWaitForUpdate := C.int(1) res = C.dcgmUpdateAllFields(handle.handle, cWaitForUpdate) if err = errorString(res); err != nil { - return GroupHandle{}, fmt.Errorf("error updating DCGM fields: %s", err) + return GroupHandle{}, fmt.Errorf("error updating all fields: %s", err) } return group, nil } +func WatchFieldsWithGroupEx( + fieldsGroup FieldGrpHandle, group GroupHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32, +) error { + result := C.dcgmWatchFields(handle.handle, group.handle, fieldsGroup.handle, + C.longlong(updateFreq), C.double(maxKeepAge), C.int(maxKeepSamples)) + + if err := errorString(result); err != nil { + return fmt.Errorf("Error watching fields: %s", err) + } + + 
cWaitForUpdate := C.int(1) + res := C.dcgmUpdateAllFields(handle.handle, cWaitForUpdate) + if err := errorString(res); err != nil { + return fmt.Errorf("error updating all fields: %s", err) + } + + return nil +} + +func WatchFieldsWithGroup(fieldsGroup FieldGrpHandle, group GroupHandle) error { + return WatchFieldsWithGroupEx(fieldsGroup, group, defaultUpdateFreq, defaultMaxKeepAge, defaultMaxKeepSamples) +} + func GetLatestValuesForFields(gpu uint, fields []Short) ([]FieldValue_v1, error) { values := make([]C.dcgmFieldValue_v1, len(fields)) cFields := *(*[]C.ushort)(unsafe.Pointer(&fields)) diff --git a/pkg/ixdcgm/health.go b/pkg/ixdcgm/health.go index 573d6b10d531982e5931d1248621919b7c7731e2..f97aed70452e637f7fe1f87190ddb535d44e6231 100644 --- a/pkg/ixdcgm/health.go +++ b/pkg/ixdcgm/health.go @@ -51,8 +51,8 @@ func HealthSet(groupId GroupHandle, systems HealthSystem) (err error) { version: C.dcgmHealthSetParams_version2, groupId: groupId.handle, systems: C.dcgmHealthSystems_t(systems), - updateInterval: C.longlong(int64(100000)), // How often to query the underlying health information from the driver in usecs. - maxKeepAge: C.double(float64(1)), // How long to keep data cached for this field in seconds. + updateInterval: C.longlong(int64(30 * 1000000)), // How often to query the underlying health information from the driver in usecs. + maxKeepAge: C.double(float64(600)), // How long to keep data cached for this field in seconds. } result := C.dcgmHealthSet_v2(handle.handle, &params_v2)