-
Notifications
You must be signed in to change notification settings - Fork 116
WIP: CNF-18922: Dynamic memory enforcement #1369
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
cd3b755
71b7b3e
714292b
6dd716e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,48 @@ | ||
#!/usr/bin/env python3
# Dynamic Memory Enforcement Service
# This service is used to protect the cluster from memory exhaustion.
# It is used to set the memory.max value for the kubepods-burstable.slice and kubepods-besteffort.slice.

from datetime import datetime
import os
import os.path

# Root of the kubelet-managed cgroup (v2) hierarchy; pod slices live below it.
ROOT="/sys/fs/cgroup/kubepods.slice"
# Hard memory limit for everything managed by kubelet.
ROOT_MAX=os.path.join(ROOT, "memory.max")

# Hard memory limit knob for the best-effort QoS class slice.
BESTEFFORT_MAX=os.path.join(ROOT, "kubepods-besteffort.slice", "memory.max")

# Current usage and hard memory limit knob for the burstable QoS class slice.
BURSTABLE_CURRENT=os.path.join(ROOT, "kubepods-burstable.slice", "memory.current")
BURSTABLE_MAX=os.path.join(ROOT, "kubepods-burstable.slice", "memory.max")
def guaranteed_max(root=None):
    """Sum the memory.max limits of all guaranteed pod slices.

    Guaranteed pods are placed directly under the kubepods root as
    ``kubepods-pod*.slice`` directories; burstable/best-effort pods live in
    their own QoS sub-slices and are therefore not counted here.

    :param root: directory to scan; defaults to the module-level ROOT
                 (kept optional for backward compatibility and testing).
    :return: total of all readable, numeric memory.max values (bytes).
    """
    if root is None:
        root = ROOT
    total = 0  # renamed from `sum` to avoid shadowing the builtin
    for entry in os.listdir(root):
        if not entry.startswith("kubepods-pod"):
            continue
        try:
            # `with` ensures the file handle is closed (the original leaked it).
            with open(os.path.join(root, entry, "memory.max")) as f:
                total += int(f.read().strip())
        except (OSError, ValueError):
            # OSError: slice vanished between listdir and open (pod deleted).
            # ValueError: memory.max contains the literal "max" (no limit);
            # the original only caught IOError and would crash on that.
            continue
    return total
|
|
||
if __name__ == "__main__":
    def read_int(path):
        # Read a cgroup interface file and parse it as an integer, closing
        # the handle (the original left the file objects open).
        # NOTE(review): memory.max may contain the literal "max" (no limit);
        # that would raise ValueError here — confirm whether the root/QoS
        # slices are always configured with a numeric limit.
        with open(path) as f:
            return int(f.read().strip())

    # Total memory available to all kubelet-managed pods.
    root_max = read_int(ROOT_MAX)
    # Memory currently consumed by the burstable QoS class.
    burstable_current = read_int(BURSTABLE_CURRENT)
    # Sum of the hard limits of all guaranteed pods.
    guaranteed_current = guaranteed_max()

    # Burstable pods may use whatever the guaranteed pods cannot claim.
    # NOTE(review): memory reserved for workload partitioning may need to be
    # subtracted here as well — see review discussion; confirm.
    # Clamp at 0: the kernel rejects negative memory.max values, and an
    # unclamped write would make this run fail with OSError.
    burstable_max = max(0, root_max - guaranteed_current)

    # Best-effort pods get what is left after current burstable usage.
    besteffort_max = max(0, burstable_max - burstable_current)

    now = datetime.now().isoformat(' ', timespec='microseconds')
    print(f"\n{now}\n")
    print(f"root: {root_max}\nburstable: {burstable_current}\nguaranteed: {guaranteed_current}\n\nburstable max: {burstable_max}\nbest-effort max: {besteffort_max}\n")

    with open(BURSTABLE_MAX, "w") as f:
        f.write(str(burstable_max))

    with open(BESTEFFORT_MAX, "w") as f:
        f.write(str(besteffort_max))
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,7 +75,8 @@ const ( | |
| ovsDynamicPinningTriggerFile = "ovs-enable-dynamic-cpu-affinity" | ||
| ovsDynamicPinningTriggerHostFile = "/var/lib/ovn-ic/etc/enable_dynamic_cpu_affinity" | ||
|
|
||
| cpusetConfigure = "cpuset-configure" | ||
| cpusetConfigure = "cpuset-configure" | ||
| dynamicMemoryEnforcement = "dynamic-memory-enforcement" | ||
| ) | ||
|
|
||
| const ( | ||
|
|
@@ -98,6 +99,7 @@ const ( | |
| systemdServiceCrio = "crio.service" | ||
| systemdServiceTunedOneShot = "ocp-tuned-one-shot.service" | ||
| systemdServiceTypeOneshot = "oneshot" | ||
| systemdServiceTypeSimple = "simple" | ||
| systemdTargetMultiUser = "multi-user.target" | ||
| systemdTrue = "true" | ||
| ) | ||
|
|
@@ -314,6 +316,30 @@ func getIgnitionConfig(profile *performancev2.PerformanceProfile, opts *componen | |
| addContent(ignitionConfig, content, dst, &mode) | ||
| } | ||
|
|
||
| // Add dynamic memory enforcement service only if annotation is enabled and workload partitioning is enabled | ||
| clusterHasWorkloadPartitioning := opts.PinningMode != nil && *opts.PinningMode == apiconfigv1.CPUPartitioningAllNodes | ||
| if profilecomponent.IsEnforceReservedMemoryEnabled(profile) && clusterHasWorkloadPartitioning { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should also check that MemoryManager configure to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why? This has nothing to do with NUMA or logical memory tracking. The code enforces the runtime values only. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But the whole point was to avoid consuming the reserved memory used for the house keeping tasks, right? Then if we won't have topologyManager configured to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On second thought, in the worse case the script will work but will do nothing useful |
||
| dynamicMemoryEnforcementService, err := getSystemdContent(getDynamicMemoryEnforcementUnitOptions()) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| ignitionConfig.Systemd.Units = append(ignitionConfig.Systemd.Units, igntypes.Unit{ | ||
| Contents: &dynamicMemoryEnforcementService, | ||
| Enabled: ptr.To(true), | ||
| Name: getSystemdService(dynamicMemoryEnforcement), | ||
| }) | ||
|
|
||
| // Add the Python script for dynamic memory enforcement | ||
| pythonScriptContent, err := getDynamicMemoryEnforcementPythonScript() | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
| pythonScriptPath := getPythonScriptPath(dynamicMemoryEnforcement) | ||
| pythonMode := 0755 // Make it executable | ||
| addContent(ignitionConfig, pythonScriptContent, pythonScriptPath, &pythonMode) | ||
| } | ||
|
|
||
| if profile.Spec.CPU.Offlined != nil { | ||
| offlinedCPUSList, err := cpuset.Parse(string(*profile.Spec.CPU.Offlined)) | ||
| if err != nil { | ||
|
|
@@ -412,6 +438,14 @@ func getBashScriptPath(scriptName string) string { | |
| return fmt.Sprintf("%s/%s.sh", bashScriptsDir, scriptName) | ||
| } | ||
|
|
||
| func getPythonScriptPath(scriptName string) string { | ||
| return fmt.Sprintf("%s/%s.py", bashScriptsDir, scriptName) | ||
| } | ||
|
|
||
| func getDynamicMemoryEnforcementPythonScript() ([]byte, error) { | ||
| return assets.Scripts.ReadFile(fmt.Sprintf("scripts/%s.py", dynamicMemoryEnforcement)) | ||
| } | ||
|
|
||
// getSystemdEnvironment renders a systemd Environment entry of the form
// "KEY=VALUE" from the given key and value.
func getSystemdEnvironment(key string, value string) string {
	// fmt.Sprint concatenates adjacent string operands without separators,
	// producing exactly key + "=" + value.
	return fmt.Sprint(key, "=", value)
}
|
|
@@ -565,6 +599,26 @@ func getRPSUnitOptions(rpsMask string) []*unit.UnitOption { | |
| } | ||
| } | ||
|
|
||
// getDynamicMemoryEnforcementUnitOptions builds the systemd unit options for
// the dynamic-memory-enforcement service, whose ExecStart loops forever,
// re-running the enforcement Python script every 2 seconds.
func getDynamicMemoryEnforcementUnitOptions() []*unit.UnitOption {
	return []*unit.UnitOption{
		// [Unit]
		// Description
		unit.NewUnitOption(systemdSectionUnit, systemdDescription, "Dynamic memory enforcement service"),
		// After — order the service after kubelet so the kubepods cgroup
		// hierarchy exists before the script runs. (The previous comment said
		// "Before", but the option emitted here is systemdAfter.)
		unit.NewUnitOption(systemdSectionUnit, systemdAfter, systemdServiceKubelet),
		// [Service]
		// Type
		unit.NewUnitOption(systemdSectionService, systemdType, systemdServiceTypeSimple),
		// RemainAfterExit
		// NOTE(review): RemainAfterExit only matters for oneshot units; for a
		// simple unit whose ExecStart never exits it has no effect — confirm
		// whether it can be dropped.
		unit.NewUnitOption(systemdSectionService, systemdRemainAfterExit, systemdTrue),
		// ExecStart — endless bash loop; NOTE(review): a systemd timer or
		// Restart=always may be cleaner than an in-unit sleep loop.
		unit.NewUnitOption(systemdSectionService, systemdExecStart, fmt.Sprintf("bash -c 'while true; do /usr/bin/python3 %s; sleep 2; done'", getPythonScriptPath(dynamicMemoryEnforcement))),
		// [Install]
		// WantedBy
		unit.NewUnitOption(systemdSectionInstall, systemdWantedBy, systemdTargetMultiUser),
	}
}
|
|
||
| func addContent(ignitionConfig *igntypes.Config, content []byte, dst string, mode *int) { | ||
| contentBase64 := base64.StdEncoding.EncodeToString(content) | ||
| ignitionConfig.Storage.Files = append(ignitionConfig.Storage.Files, igntypes.File{ | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The baseline (the entry point from which we perform all the calculations) is
/sys/fs/cgroup/kubepods.slice/memory.max — does this value already have the system-reserved memory subtracted from it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
msivak@msivak-thinkpadx1carbongen12 ~> oc describe node testnode | grep -i 'memory:'
memory: 130895396Ki # capacity
memory: 129768996Ki # allocatable
sh-5.1# cat /sys/fs/cgroup/kubepods.slice/memory.max
132988309504
msivak@msivak-thinkpadx1carbongen12 ~> expr 132988309504 / 1024
129871396
I would say: Yes, it already does.