Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions assets/performanceprofile/scripts/dynamic-memory-enforcement.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python3
# Dynamic Memory Enforcement Service
# This service is used to protect the cluster from memory exhaustion.
# It is used to set the memory.max value for the kubepods-burstable.slice and kubepods-besteffort.slice.

from datetime import datetime
import os
import os.path

# Root cgroup directory of the kubepods slice; every path used by this script lives below it.
ROOT="/sys/fs/cgroup/kubepods.slice"
# memory.max file of the root slice; its value is the starting point of the calculation.
ROOT_MAX=os.path.join(ROOT, "memory.max")
Copy link
Contributor

@Tal-or Tal-or Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The baseline (the entry point) from which we perform all of the calculations is /sys/fs/cgroup/kubepods.slice/memory.max

Has the system-reserved memory already been subtracted from this value?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

msivak@msivak-thinkpadx1carbongen12 ~> oc describe node testnode | grep -i 'memory:'
memory: 130895396Ki # capacity
memory: 129768996Ki # allocatable

sh-5.1# cat /sys/fs/cgroup/kubepods.slice/memory.max
132988309504
msivak@msivak-thinkpadx1carbongen12 ~> expr 132988309504 / 1024
129871396

I would say: Yes, it already does.


# memory.max file of the best-effort slice; written by this script.
BESTEFFORT_MAX=os.path.join(ROOT, "kubepods-besteffort.slice", "memory.max")

# memory.current file of the burstable slice; read by this script.
BURSTABLE_CURRENT=os.path.join(ROOT, "kubepods-burstable.slice", "memory.current")
# memory.max file of the burstable slice; written by this script.
BURSTABLE_MAX=os.path.join(ROOT, "kubepods-burstable.slice", "memory.max")

def guaranteed_max(root=None):
    """Sum the memory.max limits of the pod slices directly below the kubepods root.

    Only directory entries whose name starts with "kubepods-pod" are
    considered; the burstable and best-effort slices are therefore skipped.

    Args:
        root: Directory to scan. Defaults to the module-level ROOT.

    Returns:
        The sum, as an int, of every readable numeric memory.max value.
        Entries whose memory.max cannot be read (for example because the pod
        disappeared between listing and reading) or does not hold a plain
        integer (cgroup v2 uses the literal "max" for "no limit") contribute
        nothing to the sum.
    """
    if root is None:
        root = ROOT

    total = 0
    for entry in os.listdir(root):
        if not entry.startswith("kubepods-pod"):
            continue
        try:
            # The context manager closes the file deterministically instead
            # of relying on garbage collection.
            with open(os.path.join(root, entry, "memory.max")) as limit_file:
                total += int(limit_file.read().strip())
        except (OSError, ValueError):
            # Best effort: a vanished pod or an unlimited/unparsable value
            # must not abort the whole enforcement run.
            continue

    return total

# Script entry point: read the current cgroup values and derive the new burstable limit.
if __name__ == "__main__":
# Limit of the whole kubepods slice, read from ROOT_MAX.
# NOTE(review): int() raises ValueError if the file holds a non-numeric value such as "max" - confirm it is always numeric here.
root_max = int(open(ROOT_MAX).read().strip())
# Value of the burstable slice's memory.current file.
burstable_current = int(open(BURSTABLE_CURRENT).read().strip())
# Despite the "_current" name, this is the sum of memory.max limits returned by guaranteed_max().
guaranteed_current = guaranteed_max()

# The burstable slice gets whatever the summed kubepods-pod* limits leave of the root limit.
burstable_max = root_max - guaranteed_current
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to double check that memory reserved for workload partitioning is subtracted here as well.

# The best-effort slice gets what remains of the burstable limit after the burstable slice's current value.
# NOTE(review): nothing guards against this difference (or burstable_max) being negative - confirm the intended behaviour in that case.
besteffort_max = burstable_max - burstable_current

# Print a timestamp followed by every input and computed value to stdout.
now = datetime.now().isoformat(' ', timespec='microseconds')
print(f"\n{now}\n")
print(f"root: {root_max}\nburstable: {burstable_current}\nguaranteed: {guaranteed_current}\n\nburstable max: {burstable_max}\nbest-effort max: {besteffort_max}\n")

# Apply the new limits; the burstable limit is written before the best-effort one.
with open(BURSTABLE_MAX, "w") as f:
f.write(str(burstable_max))

with open(BESTEFFORT_MAX, "w") as f:
f.write(str(besteffort_max))

4 changes: 4 additions & 0 deletions pkg/apis/performanceprofile/v2/performanceprofile_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ const PerformanceProfileEnablePhysicalRpsAnnotation = "performance.openshift.io/
// that ignores the removal of all RPS settings when realtime workload hint is explicitly set to false.
const PerformanceProfileEnableRpsAnnotation = "performance.openshift.io/enable-rps"

// PerformanceProfileEnforceReservedMemoryAnnotation is the annotation key that enables the
// dynamic memory enforcement service, which manages memory limits for the kubepods slices.
// The feature is treated as enabled only when the annotation value is exactly "enable".
// NOTE(review): unlike the neighbouring annotation keys, this key has no
// "performance.openshift.io/" prefix - confirm that this is intentional.
const PerformanceProfileEnforceReservedMemoryAnnotation = "enforce-reserved-memory.experimental"

// PerformanceProfileSpec defines the desired state of PerformanceProfile.
type PerformanceProfileSpec struct {
// CPU defines a set of CPU related parameters.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,8 @@ const (
ovsDynamicPinningTriggerFile = "ovs-enable-dynamic-cpu-affinity"
ovsDynamicPinningTriggerHostFile = "/var/lib/ovn-ic/etc/enable_dynamic_cpu_affinity"

cpusetConfigure = "cpuset-configure"
cpusetConfigure = "cpuset-configure"
dynamicMemoryEnforcement = "dynamic-memory-enforcement"
)

const (
Expand All @@ -98,6 +99,7 @@ const (
systemdServiceCrio = "crio.service"
systemdServiceTunedOneShot = "ocp-tuned-one-shot.service"
systemdServiceTypeOneshot = "oneshot"
systemdServiceTypeSimple = "simple"
systemdTargetMultiUser = "multi-user.target"
systemdTrue = "true"
)
Expand Down Expand Up @@ -314,6 +316,30 @@ func getIgnitionConfig(profile *performancev2.PerformanceProfile, opts *componen
addContent(ignitionConfig, content, dst, &mode)
}

// Add dynamic memory enforcement service only if annotation is enabled and workload partitioning is enabled
clusterHasWorkloadPartitioning := opts.PinningMode != nil && *opts.PinningMode == apiconfigv1.CPUPartitioningAllNodes
if profilecomponent.IsEnforceReservedMemoryEnabled(profile) && clusterHasWorkloadPartitioning {
Copy link
Contributor

@Tal-or Tal-or Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should also check that the MemoryManager is configured to Static, which is supposed to be implied when the topologyManager is configured to single-numa-node or restricted, but we're not checking that in this flow.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why? This has nothing to do with NUMA or logical memory tracking. The code enforces the runtime values only.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the whole point was to avoid consuming the reserved memory used for the housekeeping tasks, right?

If the topologyManager is not configured to single-numa-node/restricted, the memory manager will not be enabled, and it is the memory manager that requires reserved memory to be configured for the system.

Copy link
Contributor

@Tal-or Tal-or Aug 13, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On second thought, in the worst case the script will run but will do nothing useful.

dynamicMemoryEnforcementService, err := getSystemdContent(getDynamicMemoryEnforcementUnitOptions())
if err != nil {
return nil, err
}

ignitionConfig.Systemd.Units = append(ignitionConfig.Systemd.Units, igntypes.Unit{
Contents: &dynamicMemoryEnforcementService,
Enabled: ptr.To(true),
Name: getSystemdService(dynamicMemoryEnforcement),
})

// Add the Python script for dynamic memory enforcement
pythonScriptContent, err := getDynamicMemoryEnforcementPythonScript()
if err != nil {
return nil, err
}
pythonScriptPath := getPythonScriptPath(dynamicMemoryEnforcement)
pythonMode := 0755 // Make it executable
addContent(ignitionConfig, pythonScriptContent, pythonScriptPath, &pythonMode)
}

if profile.Spec.CPU.Offlined != nil {
offlinedCPUSList, err := cpuset.Parse(string(*profile.Spec.CPU.Offlined))
if err != nil {
Expand Down Expand Up @@ -412,6 +438,14 @@ func getBashScriptPath(scriptName string) string {
return fmt.Sprintf("%s/%s.sh", bashScriptsDir, scriptName)
}

// getPythonScriptPath returns the path of the Python script called scriptName:
// the script's file name (scriptName plus the ".py" extension) placed directly
// under bashScriptsDir.
func getPythonScriptPath(scriptName string) string {
	fileName := scriptName + ".py"
	return fmt.Sprintf("%s/%s", bashScriptsDir, fileName)
}

func getDynamicMemoryEnforcementPythonScript() ([]byte, error) {
return assets.Scripts.ReadFile(fmt.Sprintf("scripts/%s.py", dynamicMemoryEnforcement))
}

// getSystemdEnvironment renders a single environment assignment in the
// "key=value" form.
func getSystemdEnvironment(key string, value string) string {
	// fmt.Sprint inserts no separator between string operands, so the
	// result is exactly key, "=", value concatenated.
	return fmt.Sprint(key, "=", value)
}
Expand Down Expand Up @@ -565,6 +599,26 @@ func getRPSUnitOptions(rpsMask string) []*unit.UnitOption {
}
}

// getDynamicMemoryEnforcementUnitOptions returns the systemd unit options of
// the dynamic memory enforcement service. The service is of type "simple",
// is ordered relative to the kubelet service through the systemdAfter option,
// and its ExecStart is a shell loop that re-runs the enforcement Python script
// forever with a two second sleep between runs.
func getDynamicMemoryEnforcementUnitOptions() []*unit.UnitOption {
	// Endless loop: run the script, sleep two seconds, repeat.
	execStart := fmt.Sprintf(
		"bash -c 'while true; do /usr/bin/python3 %s; sleep 2; done'",
		getPythonScriptPath(dynamicMemoryEnforcement),
	)

	options := []*unit.UnitOption{
		// [Unit]
		// Description
		unit.NewUnitOption(systemdSectionUnit, systemdDescription, "Dynamic memory enforcement service"),
		// After
		unit.NewUnitOption(systemdSectionUnit, systemdAfter, systemdServiceKubelet),
		// [Service]
		// Type
		unit.NewUnitOption(systemdSectionService, systemdType, systemdServiceTypeSimple),
		// RemainAfterExit
		unit.NewUnitOption(systemdSectionService, systemdRemainAfterExit, systemdTrue),
		// ExecStart
		unit.NewUnitOption(systemdSectionService, systemdExecStart, execStart),
		// [Install]
		// WantedBy
		unit.NewUnitOption(systemdSectionInstall, systemdWantedBy, systemdTargetMultiUser),
	}

	return options
}

func addContent(ignitionConfig *igntypes.Config, content []byte, dst string, mode *int) {
contentBase64 := base64.StdEncoding.EncodeToString(content)
ignitionConfig.Storage.Files = append(ignitionConfig.Storage.Files, igntypes.File{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,17 @@ func IsMixedCPUsEnabled(profile *performancev2.PerformanceProfile) bool {
}
return *profile.Spec.WorkloadHints.MixedCpus
}

// IsEnforceReservedMemoryEnabled reports whether the profile opts in to
// dynamic memory enforcement. It returns true only when the profile carries
// the PerformanceProfileEnforceReservedMemoryAnnotation annotation with the
// exact value "enable"; a missing annotation, a different value, or a profile
// without any annotations yields false.
func IsEnforceReservedMemoryEnabled(profile *performancev2.PerformanceProfile) bool {
	// Indexing a nil or key-less map yields the empty string, which can
	// never equal "enable", so no separate nil or presence check is needed.
	value := profile.Annotations[performancev2.PerformanceProfileEnforceReservedMemoryAnnotation]
	return value == "enable"
}