Skip to content

Commit f9a85cd

Browse files
authored
Merge pull request #1559 from tkatila/qat/heartbeat-support
qat: heartbeat support + new capabilities
2 parents 752e494 + 5016f54 commit f9a85cd

File tree

6 files changed

+177
-15
lines changed

6 files changed

+177
-15
lines changed

.golangci.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,4 @@ issues:
6767
# Until the testing package allows pinning variables disable scopelint
6868
# for tests. See https://github.com/kyoh86/scopelint/issues/4.
6969
- scopelint
70+
- gocognit

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ The summary of resources available via plugins in this repository is given in th
232232
* [intelgpu-job.yaml](demo/intelgpu-job.yaml)
233233
* `iaa.intel.com` : `wq-user-[shared or dedicated]`
234234
* [iaa-qpl-demo-pod.yaml](demo/iaa-qpl-demo-pod.yaml)
235-
* `qat.intel.com` : `generic` or `cy`/`dc`
235+
* `qat.intel.com` : `generic` or `cy`/`dc`/`asym-dc`/`sym-dc`
236236
* [crypto-perf-dpdk-pod-requesting-qat.yaml](deployments/qat_dpdk_app/base/crypto-perf-dpdk-pod-requesting-qat.yaml)
237237
* `sgx.intel.com` : `epc`
238238
* [intelsgx-job.yaml](deployments/sgx_enclave_apps/base/intelsgx-job.yaml)

cmd/qat_plugin/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ In addition to the default configuration, you can add device-specific configurat
132132

133133
| Device | Possible Configuration | How To Customize | Options | Notes |
134134
|:-------|:-----------------------|:-----------------|:--------|:------|
135-
| 4xxx, 401xx,402xx | [cfg_services](https://github.com/torvalds/linux/blob/42e66b1cc3a070671001f8a1e933a80818a192bf/Documentation/ABI/testing/sysfs-driver-qat) reports the configured services (crypto services or compression services) of the QAT device. | `ServicesEnabled=<value>` | compress:`dc`, crypto:`sym;asym` | Linux 6.0+ kernel is required. |
135+
| 4xxx, 401xx,402xx | [cfg_services](https://github.com/torvalds/linux/blob/v6.6-rc5/Documentation/ABI/testing/sysfs-driver-qat) reports the configured services (crypto services or compression services) of the QAT device. | `ServicesEnabled=<value>` | compress:`dc`, crypto:`sym;asym`, <br>crypto+compress:`asym;dc`,<br>crypto+compress:`sym;dc` | Linux 6.0+ kernel is required. |
136136

137137
To create a provisioning `configMap`, run the following command before deploying initcontainer:
138138

@@ -141,7 +141,7 @@ $ kubectl create configmap --namespace=inteldeviceplugins-system qat-config --fr
141141
```
142142
or
143143
```bash
144-
$ kubectl create configmap --namespace=inteldeviceplugins-system --from-literal "qat.conf=ServicesEnabled=<option>" qat-config
144+
$ kubectl create configmap --namespace=inteldeviceplugins-system --from-literal "qat.conf=ServicesEnabled=<option>" qat-config
145145
```
146146

147147
When using the operator for deploying the plugin with provisioning config, use `provisioningConfig` field for the name of the ConfigMap, then the config is passed to initcontainer through the volume mount.

cmd/qat_plugin/dpdkdrv/dpdkdrv.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,37 @@ func readDeviceConfiguration(pfDev string) string {
393393
return devCfg.Section("GENERAL").Key("ServicesEnabled").String()
394394
}
395395

396+
func getDeviceHealthiness(device string, lookup map[string]string) string {
397+
healthiness := pluginapi.Healthy
398+
399+
pfDev, err := filepath.EvalSymlinks(filepath.Join(device, "physfn"))
400+
if err != nil {
401+
klog.Warningf("failed to get PF device ID for %s: %q", filepath.Base(device), err)
402+
return healthiness
403+
}
404+
405+
// VFs share one PF, so all the VFs should return the same result.
406+
if _, found := lookup[pfDev]; found {
407+
return lookup[pfDev]
408+
}
409+
410+
// Try to find the PF's heartbeat status. If unable to, return Healthy.
411+
driver := getCurrentDriver(pfDev)
412+
413+
hbStatusFile := filepath.Join(filepath.Dir(filepath.Join(pfDev, "../../")), "kernel/debug",
414+
fmt.Sprintf("qat_%s_%s/heartbeat/status", driver, filepath.Base(pfDev)))
415+
416+
// If status reads "-1", the device is considered bad:
417+
// https://github.com/torvalds/linux/blob/v6.6-rc5/Documentation/ABI/testing/debugfs-driver-qat
418+
if data, err := os.ReadFile(hbStatusFile); err == nil && string(data) == "-1" {
419+
healthiness = pluginapi.Unhealthy
420+
}
421+
422+
lookup[pfDev] = healthiness
423+
424+
return healthiness
425+
}
426+
396427
func getDeviceCapabilities(device string) (string, error) {
397428
devID, err := getDeviceID(device)
398429
if err != nil {
@@ -426,6 +457,14 @@ func getDeviceCapabilities(device string) (string, error) {
426457
return "sym", nil
427458
case "asym":
428459
return "asym", nil
460+
case "asym;dc":
461+
return "asym-dc", nil
462+
case "dc;asym":
463+
return "asym-dc", nil
464+
case "sym;dc":
465+
return "sym-dc", nil
466+
case "dc;sym":
467+
return "sym-dc", nil
429468
default:
430469
return defaultCapabilities, nil
431470
}
@@ -583,6 +622,8 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
583622
devTree := dpapi.NewDeviceTree()
584623
n := 0
585624

625+
pfHealthLookup := map[string]string{}
626+
586627
for _, vfDevice := range dp.getVfDevices() {
587628
vfBdf := filepath.Base(vfDevice)
588629

@@ -610,14 +651,16 @@ func (dp *DevicePlugin) scan() (dpapi.DeviceTree, error) {
610651
return nil, err
611652
}
612653

613-
klog.V(1).Infof("Device %s with %s capabilities found", vfBdf, cap)
654+
healthiness := getDeviceHealthiness(vfDevice, pfHealthLookup)
655+
656+
klog.V(1).Infof("Device %s with %s capabilities found (%s)", vfBdf, cap, healthiness)
614657

615658
n = n + 1
616659
envs := map[string]string{
617660
fmt.Sprintf("%s%d", envVarPrefix, n): vfBdf,
618661
}
619662

620-
devinfo := dpapi.NewDeviceInfo(pluginapi.Healthy, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
663+
devinfo := dpapi.NewDeviceInfo(healthiness, dp.getDpdkDeviceSpecs(dpdkDeviceName), dp.getDpdkMounts(dpdkDeviceName), envs, nil)
621664

622665
devTree.AddDevice(cap, vfBdf, devinfo)
623666
}

cmd/qat_plugin/dpdkdrv/dpdkdrv_test.go

Lines changed: 127 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ package dpdkdrv
1616

1717
import (
1818
"flag"
19+
"fmt"
1920
"os"
2021
"path"
2122
"reflect"
@@ -162,15 +163,16 @@ func TestGetPreferredAllocation(t *testing.T) {
162163

163164
func TestScan(t *testing.T) {
164165
tcases := []struct {
165-
name string
166-
dpdkDriver string
167-
dirs []string
168-
files map[string][]byte
169-
symlinks map[string]string
170-
kernelVfDrivers []string
171-
expectedErr bool
172-
maxDevNum int
173-
expectedDevNum int
166+
name string
167+
dpdkDriver string
168+
dirs []string
169+
files map[string][]byte
170+
symlinks map[string]string
171+
kernelVfDrivers []string
172+
expectedErr bool
173+
maxDevNum int
174+
expectedDevNum int
175+
expectedUnhealthyNum int
174176
}{
175177
{
176178
name: "No error returned for uninitialized device plugin",
@@ -519,7 +521,119 @@ func TestScan(t *testing.T) {
519521
maxDevNum: 2,
520522
expectedDevNum: 2,
521523
},
524+
{
525+
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status bad",
526+
dpdkDriver: "vfio-pci",
527+
kernelVfDrivers: []string{"4xxxvf"},
528+
dirs: []string{
529+
"sys/bus/pci/drivers/4xxx",
530+
"sys/bus/pci/drivers/vfio-pci",
531+
"sys/devices/pci0000:02/0000:02:00.0",
532+
"sys/devices/pci0000:02/0000:02:00.0/qat",
533+
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
534+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
535+
"sys/bus/pci/devices/0000:02:01.0",
536+
},
537+
files: map[string][]byte{
538+
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
539+
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
540+
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
541+
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
542+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("-1"),
543+
},
544+
symlinks: map[string]string{
545+
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
546+
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
547+
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
548+
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
549+
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
550+
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
551+
},
552+
maxDevNum: 1,
553+
expectedDevNum: 1,
554+
expectedUnhealthyNum: 1,
555+
},
556+
{
557+
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfdevID is equal to qatDevId (4941) heartbeat status good",
558+
dpdkDriver: "vfio-pci",
559+
kernelVfDrivers: []string{"4xxxvf"},
560+
dirs: []string{
561+
"sys/bus/pci/drivers/4xxx",
562+
"sys/bus/pci/drivers/vfio-pci",
563+
"sys/devices/pci0000:02/0000:02:00.0",
564+
"sys/devices/pci0000:02/0000:02:00.0/qat",
565+
"sys/kernel/debug/qat_4xxx_0000:02:00.0",
566+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat",
567+
"sys/bus/pci/devices/0000:02:01.0",
568+
},
569+
files: map[string][]byte{
570+
"sys/devices/pci0000:02/0000:02:00.0/device": []byte("0x4940"),
571+
"sys/devices/pci0000:02/0000:02:00.0/qat/state": []byte("up"),
572+
"sys/devices/pci0000:02/0000:02:00.0/qat/cfg_services": []byte("sym;asym"),
573+
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x4941"),
574+
"sys/kernel/debug/qat_4xxx_0000:02:00.0/heartbeat/status": []byte("0"),
575+
},
576+
symlinks: map[string]string{
577+
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
578+
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
579+
"sys/bus/pci/drivers/4xxx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
580+
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
581+
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
582+
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/4xxx",
583+
},
584+
maxDevNum: 1,
585+
expectedDevNum: 1,
586+
expectedUnhealthyNum: 0,
587+
},
588+
{
589+
name: "vfio-pci DPDKdriver with no kernel bound driver and where vfDevID is equal to qatDevId (37c9) heartbeat status bad",
590+
dpdkDriver: "vfio-pci",
591+
kernelVfDrivers: []string{"c6xxvf"},
592+
dirs: []string{
593+
"sys/bus/pci/drivers/c6xx",
594+
"sys/bus/pci/drivers/vfio-pci",
595+
"sys/bus/pci/devices/0000:02:01.0",
596+
"sys/bus/pci/devices/0000:02:01.1",
597+
"sys/devices/pci0000:02/0000:02:00.0",
598+
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat",
599+
},
600+
files: map[string][]byte{
601+
"sys/bus/pci/devices/0000:02:01.0/device": []byte("0x37c9"),
602+
"sys/bus/pci/devices/0000:02:01.1/device": []byte("0x37c9"),
603+
"sys/kernel/debug/qat_c6xx_0000:02:00.0/heartbeat/status": []byte("-1"),
604+
},
605+
symlinks: map[string]string{
606+
"sys/bus/pci/devices/0000:02:01.0/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
607+
"sys/bus/pci/devices/0000:02:01.0/physfn": "sys/devices/pci0000:02/0000:02:00.0",
608+
"sys/bus/pci/devices/0000:02:01.1/iommu_group": "sys/kernel/iommu_groups/vfiotestfile",
609+
"sys/bus/pci/devices/0000:02:01.1/physfn": "sys/devices/pci0000:02/0000:02:00.0",
610+
"sys/bus/pci/drivers/c6xx/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
611+
"sys/bus/pci/devices/0000:02:00.0": "sys/devices/pci0000:02/0000:02:00.0",
612+
"sys/devices/pci0000:02/0000:02:00.0/virtfn0": "sys/bus/pci/devices/0000:02:01.0",
613+
"sys/devices/pci0000:02/0000:02:00.0/virtfn1": "sys/bus/pci/devices/0000:02:01.1",
614+
"sys/devices/pci0000:02/0000:02:00.0/driver": "sys/bus/pci/drivers/c6xx",
615+
},
616+
maxDevNum: 3,
617+
expectedDevNum: 2,
618+
expectedUnhealthyNum: 2,
619+
},
522620
}
621+
622+
// countUnhealthyDevices returns the number of devices in tree whose "state"
// field, looked up by name through reflection, renders as
// pluginapi.Unhealthy.
countUnhealthyDevices := func(tree dpapi.DeviceTree) int {
	unhealthy := 0

	for _, devices := range tree {
		for _, info := range devices {
			state := reflect.ValueOf(info).FieldByName("state")
			if fmt.Sprintf("%+v", state) != pluginapi.Unhealthy {
				continue
			}

			unhealthy++
		}
	}

	return unhealthy
}
636+
523637
for _, tt := range tcases {
524638
t.Run(tt.name, func(t *testing.T) {
525639
tmpdir, err := os.MkdirTemp("/tmp/", "qatplugin-TestScanPrivate-*")
@@ -560,6 +674,10 @@ func TestScan(t *testing.T) {
560674
t.Errorf("expected %d, but got %d devices", tt.expectedDevNum, devNum)
561675
}
562676

677+
if unhealtyNum := countUnhealthyDevices(fN.tree); unhealtyNum != tt.expectedUnhealthyNum {
678+
t.Errorf("expected %d, but got %d unhealthy devices", tt.expectedUnhealthyNum, unhealtyNum)
679+
}
680+
563681
if err = os.RemoveAll(tmpdir); err != nil {
564682
t.Fatal(err)
565683
}

demo/qat-init.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
NODE_NAME="${NODE_NAME:-}"
44
ENABLED_QAT_PF_PCIIDS=${ENABLED_QAT_PF_PCIIDS:-37c8 4940 4942 4944}
55
DEVS=$(for pf in $ENABLED_QAT_PF_PCIIDS; do lspci -n | grep -e "$pf" | grep -o -e "^\\S*"; done)
6-
SERVICES_LIST="sym;asym dc"
6+
SERVICES_LIST="sym asym sym;asym dc sym;dc asym;dc"
77
QAT_4XXX_DEVICE_PCI_ID="0x4940"
88
QAT_401XX_DEVICE_PCI_ID="0x4942"
99
QAT_402XX_DEVICE_PCI_ID="0x4944"

0 commit comments

Comments
 (0)