
Commit b8c9ee8

Authored by Kubernetes Submit Queue
Merge pull request #46456 from jingxu97/May/allocatable
Automatic merge from submit-queue

Add local storage (scratch space) allocatable support

This PR adds support for allocatable local storage (scratch space). The feature applies only to the root file system, which is shared by Kubernetes components, users' containers, and/or images. Users can set the --kube-reserved flag to reserve storage for kube system components. If the allocatable storage for users' pods is used up, some pods are evicted to free the storage resource. This feature is part of local storage capacity isolation, described in the proposal kubernetes/community#306.

**Release note**:

```release-note
This feature exposes local storage capacity for the primary partitions, and supports & enforces storage reservation in Node Allocatable
```
2 parents 822e29d + 943fc53 commit b8c9ee8

File tree

18 files changed: +696, -69 lines changed


cmd/kubelet/app/options/options.go

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ func (c *kubeletConfiguration) addFlags(fs *pflag.FlagSet) {

 	// Node Allocatable Flags
 	fs.Var(&c.SystemReserved, "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
-	fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi) pairs that describe resources reserved for kubernetes system components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
+	fs.Var(&c.KubeReserved, "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi, storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local storage for root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
 	fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptible options are 'pods', 'system-reserved' & 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' & '--kube-reserved-cgroup' must also be set respectively. See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md for more details.")
 	fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
 	fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")

cmd/kubelet/app/server.go

Lines changed: 2 additions & 2 deletions
@@ -1025,8 +1025,8 @@ func parseResourceList(m componentconfig.ConfigurationMap) (v1.ResourceList, err
 	rl := make(v1.ResourceList)
 	for k, v := range m {
 		switch v1.ResourceName(k) {
-		// Only CPU and memory resources are supported.
-		case v1.ResourceCPU, v1.ResourceMemory:
+		// CPU, memory and local storage resources are supported.
+		case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceStorage:
 			q, err := resource.ParseQuantity(v)
 			if err != nil {
 				return nil, err
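
For readers following along, here is a minimal, self-contained sketch of what the widened switch enables. `parseReserved` is an illustrative stand-in for `parseResourceList` (which takes a `componentconfig.ConfigurationMap` and returns a `v1.ResourceList`), not the kubelet's actual code:

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// parseReserved mirrors the shape of parseResourceList above: it accepts
// cpu, memory, and now storage keys, and rejects anything else.
func parseReserved(m map[string]string) (map[string]resource.Quantity, error) {
	rl := make(map[string]resource.Quantity)
	for k, v := range m {
		switch k {
		case "cpu", "memory", "storage":
			q, err := resource.ParseQuantity(v)
			if err != nil {
				return nil, err
			}
			rl[k] = q
		default:
			return nil, fmt.Errorf("cannot reserve %q resource", k)
		}
	}
	return rl, nil
}

func main() {
	// The value a user would pass via --kube-reserved=cpu=200m,memory=500Mi,storage=1Gi.
	reserved, err := parseReserved(map[string]string{"cpu": "200m", "memory": "500Mi", "storage": "1Gi"})
	if err != nil {
		panic(err)
	}
	fmt.Println(reserved)
}
```

Before this change, the "storage" key would have fallen through to the default branch and been rejected.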

pkg/apis/componentconfig/v1alpha1/types.go

Lines changed: 1 addition & 1 deletion
@@ -551,7 +551,7 @@ type KubeletConfiguration struct {
 	SystemReserved map[string]string `json:"systemReserved"`
 	// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
 	// that describe resources reserved for kubernetes system components.
-	// Currently only cpu and memory are supported. [default=none]
+	// Currently cpu, memory and local storage for root file system are supported. [default=none]
 	// See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
 	KubeReserved map[string]string `json:"kubeReserved"`

pkg/kubelet/cadvisor/util.go

Lines changed: 19 additions & 0 deletions
@@ -18,6 +18,7 @@ package cadvisor

 import (
 	cadvisorapi "github.com/google/cadvisor/info/v1"
+	cadvisorapi2 "github.com/google/cadvisor/info/v2"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/kubernetes/pkg/api/v1"
 )
@@ -33,3 +34,21 @@ func CapacityFromMachineInfo(info *cadvisorapi.MachineInfo) v1.ResourceList {
 	}
 	return c
 }
+
+func StorageScratchCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
+	c := v1.ResourceList{
+		v1.ResourceStorage: *resource.NewQuantity(
+			int64(info.Capacity),
+			resource.BinarySI),
+	}
+	return c
+}
+
+func StorageOverlayCapacityFromFsInfo(info cadvisorapi2.FsInfo) v1.ResourceList {
+	c := v1.ResourceList{
+		v1.ResourceStorageOverlay: *resource.NewQuantity(
+			int64(info.Capacity),
+			resource.BinarySI),
+	}
+	return c
+}
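
A quick usage sketch of the new helpers; the capacity figure is invented, and the program only demonstrates the FsInfo-to-ResourceList conversion:

```go
package main

import (
	"fmt"

	cadvisorapi2 "github.com/google/cadvisor/info/v2"
	"k8s.io/kubernetes/pkg/api/v1"
	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
)

func main() {
	// Pretend cadvisor reported a 100Gi root partition (value is made up).
	rootfs := cadvisorapi2.FsInfo{Capacity: 100 * 1024 * 1024 * 1024}

	scratch := cadvisor.StorageScratchCapacityFromFsInfo(rootfs)
	q := scratch[v1.ResourceStorage]
	fmt.Println(q.String()) // a BinarySI quantity, e.g. "100Gi"
}
```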

pkg/kubelet/cm/node_container_manager.go

Lines changed: 13 additions & 1 deletion
@@ -29,6 +29,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	clientv1 "k8s.io/client-go/pkg/api/v1"
 	"k8s.io/kubernetes/pkg/api/v1"
+	"k8s.io/kubernetes/pkg/kubelet/cadvisor"
 	"k8s.io/kubernetes/pkg/kubelet/events"
 	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
 )
@@ -180,9 +181,20 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {

 }

-// GetNodeAllocatable returns amount of compute resource that have to be reserved on this node from scheduling.
+// GetNodeAllocatable returns the amount of compute or storage resource that has to be reserved on this node from scheduling.
 func (cm *containerManagerImpl) GetNodeAllocatableReservation() v1.ResourceList {
 	evictionReservation := hardEvictionReservation(cm.HardEvictionThresholds, cm.capacity)
+	if _, ok := cm.capacity[v1.ResourceStorage]; !ok {
+		if cm.cadvisorInterface != nil {
+			if rootfs, err := cm.cadvisorInterface.RootFsInfo(); err == nil {
+				for rName, rCap := range cadvisor.StorageScratchCapacityFromFsInfo(rootfs) {
+					cm.capacity[rName] = rCap
+				}
+			} else {
+				glog.Warningf("Error getting rootfs info: %v", err)
+			}
+		}
+	}
 	result := make(v1.ResourceList)
 	for k := range cm.capacity {
 		value := resource.NewQuantity(0, resource.DecimalSI)
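
The net effect on scheduling: per resource, the reservation returned here is roughly kube-reserved plus system-reserved plus the hard eviction threshold, and allocatable is capacity minus that reservation. A back-of-the-envelope sketch with invented numbers (this is not the containerManagerImpl code):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// All figures are invented for illustration.
	capacity := resource.MustParse("100Gi")    // rootfs capacity from cadvisor's RootFsInfo
	kubeReserved := resource.MustParse("1Gi")  // --kube-reserved=storage=1Gi
	evictionHard := resource.MustParse("10Gi") // hard eviction threshold for the storage signal

	// Reservation withheld from scheduling for the storage resource.
	reservation := kubeReserved.Copy()
	reservation.Add(evictionHard)

	// Allocatable = capacity - reservation.
	allocatable := capacity.Copy()
	allocatable.Sub(*reservation)

	fmt.Printf("capacity=%s reservation=%s allocatable=%s\n",
		capacity.String(), reservation.String(), allocatable.String())
}
```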

pkg/kubelet/eviction/api/types.go

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,8 @@ const (
 	SignalImageFsInodesFree Signal = "imagefs.inodesFree"
 	// SignalAllocatableMemoryAvailable is amount of memory available for pod allocation (i.e. allocatable - workingSet (of pods), in bytes.
 	SignalAllocatableMemoryAvailable Signal = "allocatableMemory.available"
+	// SignalAllocatableNodeFsAvailable is amount of local storage available for pod allocation
+	SignalAllocatableNodeFsAvailable Signal = "allocatableNodeFs.available"
 )

 // ThresholdOperator is the operator used to express a Threshold.

pkg/kubelet/eviction/eviction_manager.go

Lines changed: 17 additions & 9 deletions
@@ -82,6 +82,8 @@ type managerImpl struct {
 	lastObservations signalObservations
 	// notifiersInitialized indicates if the threshold notifiers have been initialized (i.e. synchronize() has been called once)
 	notifiersInitialized bool
+	// dedicatedImageFs indicates if imagefs is on a separate device from the rootfs
+	dedicatedImageFs *bool
 }

 // ensure it implements the required interface
@@ -106,6 +108,7 @@ func NewManager(
 		nodeRef:                      nodeRef,
 		nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
 		thresholdsFirstObservedAt:    thresholdsObservedAt{},
+		dedicatedImageFs:             nil,
 	}
 	return manager, manager
 }
@@ -211,21 +214,22 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	}

 	glog.V(3).Infof("eviction manager: synchronize housekeeping")
-
 	// build the ranking functions (if not yet known)
 	// TODO: have a function in cadvisor that lets us know if global housekeeping has completed
-	if len(m.resourceToRankFunc) == 0 || len(m.resourceToNodeReclaimFuncs) == 0 {
-		// this may error if cadvisor has yet to complete housekeeping, so we will just try again in next pass.
-		hasDedicatedImageFs, err := diskInfoProvider.HasDedicatedImageFs()
-		if err != nil {
+	if m.dedicatedImageFs == nil {
+		hasImageFs, ok := diskInfoProvider.HasDedicatedImageFs()
+		if ok != nil {
 			return nil
 		}
-		m.resourceToRankFunc = buildResourceToRankFunc(hasDedicatedImageFs)
-		m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasDedicatedImageFs)
+		m.dedicatedImageFs = &hasImageFs
+		m.resourceToRankFunc = buildResourceToRankFunc(hasImageFs)
+		m.resourceToNodeReclaimFuncs = buildResourceToNodeReclaimFuncs(m.imageGC, hasImageFs)
+
 	}

+	activePods := podFunc()
 	// make observations and get a function to derive pod usage stats relative to those observations.
-	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider)
+	observations, statsFunc, err := makeSignalObservations(m.summaryProvider, nodeProvider, activePods, *m.dedicatedImageFs)
 	if err != nil {
 		glog.Errorf("eviction manager: unexpected err: %v", err)
 		return nil
@@ -336,7 +340,11 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
 	}

 	// the only candidates viable for eviction are those pods that had anything running.
-	activePods := podFunc()
+	if len(activePods) == 0 {
+		glog.Errorf("eviction manager: eviction thresholds have been met, but no pods are active to evict")
+		return nil
+	}
+
 	// rank the running pods for eviction for the specified resource
 	rank(activePods, statsFunc)
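
The new dedicatedImageFs field is a *bool so that nil can mean "not probed yet": the cadvisor-backed HasDedicatedImageFs call can fail until housekeeping completes, so synchronize() retries it each pass and caches the answer on first success. A minimal sketch of that pattern (the probe here is a stand-in, not the real DiskInfoProvider):

```go
package main

import (
	"errors"
	"fmt"
)

// probe stands in for diskInfoProvider.HasDedicatedImageFs(), which can fail
// until cadvisor finishes its initial housekeeping.
var calls int

func probe() (bool, error) {
	calls++
	if calls < 3 {
		return false, errors.New("cadvisor housekeeping not done")
	}
	return true, nil
}

type manager struct {
	dedicatedImageFs *bool // nil means "not determined yet"
}

func (m *manager) synchronize() {
	if m.dedicatedImageFs == nil {
		v, err := probe()
		if err != nil {
			return // try again on the next sync pass
		}
		m.dedicatedImageFs = &v
	}
	fmt.Println("imagefs on dedicated device:", *m.dedicatedImageFs)
}

func main() {
	m := &manager{}
	for i := 0; i < 4; i++ {
		m.synchronize() // first two passes fail silently, then the answer is cached
	}
}
```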

pkg/kubelet/eviction/helpers.go

Lines changed: 55 additions & 3 deletions
@@ -54,6 +54,8 @@ const (
 	resourceNodeFs v1.ResourceName = "nodefs"
 	// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
 	resourceNodeFsInodes v1.ResourceName = "nodefsInodes"
+	// container overlay storage, in bytes. internal to this module, used to account for local disk usage for container overlay.
+	resourceOverlay v1.ResourceName = "overlay"
 )

 var (
@@ -74,19 +76,25 @@ func init() {
 	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
 	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
 	signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
+	signalToNodeCondition[evictionapi.SignalAllocatableNodeFsAvailable] = v1.NodeDiskPressure

 	// map signals to resources (and vice-versa)
 	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
 	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
 	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
+	signalToResource[evictionapi.SignalAllocatableNodeFsAvailable] = resourceNodeFs
 	signalToResource[evictionapi.SignalImageFsAvailable] = resourceImageFs
 	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceImageFsInodes
 	signalToResource[evictionapi.SignalNodeFsAvailable] = resourceNodeFs
 	signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceNodeFsInodes
+
 	resourceToSignal = map[v1.ResourceName]evictionapi.Signal{}
 	for key, value := range signalToResource {
 		resourceToSignal[value] = key
 	}
+	// Hard-code here to make sure resourceNodeFs maps to evictionapi.SignalNodeFsAvailable
+	// (TODO) resourceToSignal is a map from resource name to a list of signals
+	resourceToSignal[resourceNodeFs] = evictionapi.SignalNodeFsAvailable
 }

 // validSignal returns true if the signal is supported.
@@ -234,6 +242,16 @@ func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold
 					Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
 				},
 			},
+			{
+				Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
+				Operator: evictionapi.OpLessThan,
+				Value: evictionapi.ThresholdValue{
+					Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
+				},
+				MinReclaim: &evictionapi.ThresholdValue{
+					Quantity: resource.NewQuantity(int64(0), resource.BinarySI),
+				},
+			},
 		}
 	}
 }
@@ -382,10 +400,12 @@ func localVolumeNames(pod *v1.Pod) []string {
 func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
 	disk := resource.Quantity{Format: resource.BinarySI}
 	inodes := resource.Quantity{Format: resource.BinarySI}
+	overlay := resource.Quantity{Format: resource.BinarySI}
 	for _, container := range podStats.Containers {
 		if hasFsStatsType(statsToMeasure, fsStatsRoot) {
 			disk.Add(*diskUsage(container.Rootfs))
 			inodes.Add(*inodeUsage(container.Rootfs))
+			overlay.Add(*diskUsage(container.Rootfs))
 		}
 		if hasFsStatsType(statsToMeasure, fsStatsLogs) {
 			disk.Add(*diskUsage(container.Logs))
@@ -405,8 +425,9 @@ func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsSt
 		}
 	}
 	return v1.ResourceList{
-		resourceDisk:   disk,
-		resourceInodes: inodes,
+		resourceDisk:    disk,
+		resourceInodes:  inodes,
+		resourceOverlay: overlay,
 	}, nil
 }

@@ -637,7 +658,7 @@ func (a byEvictionPriority) Less(i, j int) bool {
 }

 // makeSignalObservations derives observations using the specified summary provider.
-func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider) (signalObservations, statsFunc, error) {
+func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider NodeProvider, pods []*v1.Pod, withImageFs bool) (signalObservations, statsFunc, error) {
 	summary, err := summaryProvider.Get()
 	if err != nil {
 		return nil, nil, err
@@ -706,6 +727,37 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider, nodeProvider
 			capacity:  memoryAllocatableCapacity.Copy(),
 		}
 	}
+
+	if storageScratchAllocatableCapacity, ok := node.Status.Allocatable[v1.ResourceStorage]; ok {
+		storageScratchAllocatable := storageScratchAllocatableCapacity.Copy()
+		for _, pod := range pods {
+			podStat, ok := statsFunc(pod)
+			if !ok {
+				continue
+			}
+
+			usage, err := podDiskUsage(podStat, pod, []fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource, fsStatsRoot})
+			if err != nil {
+				glog.Warningf("eviction manager: error getting pod disk usage %v", err)
+				continue
+			}
+			// If there is a separate imagefs set up for container runtimes, the scratch disk usage from nodefs should exclude the overlay usage
+			if withImageFs {
+				diskUsage := usage[resourceDisk]
+				diskUsageP := &diskUsage
+				diskUsagep := diskUsageP.Copy()
+				diskUsagep.Sub(usage[resourceOverlay])
+				storageScratchAllocatable.Sub(*diskUsagep)
+			} else {
+				storageScratchAllocatable.Sub(usage[resourceDisk])
+			}
+		}
+		result[evictionapi.SignalAllocatableNodeFsAvailable] = signalObservation{
+			available: storageScratchAllocatable,
+			capacity:  storageScratchAllocatableCapacity.Copy(),
+		}
+	}
+
 	return result, statsFunc, nil
 }
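
The accounting rule in the new loop is worth spelling out: a pod's nodefs charge is its logs plus local volumes plus writable layers, but when a dedicated imagefs exists the writable-layer (overlay) bytes live on that device and are subtracted back out. A simplified sketch with invented numbers (scratchCharge is a hypothetical helper, not kubelet code):

```go
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// scratchCharge mirrors the withImageFs branch above: it returns how much of
// the node's scratch (rootfs) space a pod is charged for.
func scratchCharge(disk, overlay resource.Quantity, withImageFs bool) resource.Quantity {
	if withImageFs {
		// Writable-layer bytes live on the dedicated image filesystem,
		// so they are excluded from the nodefs charge.
		u := disk.Copy()
		u.Sub(overlay)
		return *u
	}
	return disk
}

func main() {
	// Invented figures: 5Gi total pod disk usage, 2Gi of it in writable layers.
	disk := resource.MustParse("5Gi")
	overlay := resource.MustParse("2Gi")

	withImageFs := scratchCharge(disk, overlay, true)
	withoutImageFs := scratchCharge(disk, overlay, false)
	fmt.Println(withImageFs.String())    // 3Gi charged against nodefs allocatable
	fmt.Println(withoutImageFs.String()) // 5Gi charged against nodefs allocatable
}
```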

pkg/kubelet/eviction/helpers_test.go

Lines changed: 11 additions & 2 deletions
@@ -76,6 +76,16 @@ func TestParseThresholdConfig(t *testing.T) {
 				Quantity: quantityMustParse("0"),
 			},
 		},
+		{
+			Signal:   evictionapi.SignalAllocatableNodeFsAvailable,
+			Operator: evictionapi.OpLessThan,
+			Value: evictionapi.ThresholdValue{
+				Quantity: quantityMustParse("0"),
+			},
+			MinReclaim: &evictionapi.ThresholdValue{
+				Quantity: quantityMustParse("0"),
+			},
+		},
 		{
 			Signal:   evictionapi.SignalMemoryAvailable,
 			Operator: evictionapi.OpLessThan,
@@ -777,8 +787,7 @@ func TestMakeSignalObservations(t *testing.T) {
 	if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 {
 		t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity)
 	}
-	actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider)
-
+	actualObservations, statsFunc, err := makeSignalObservations(provider, nodeProvider, pods, false)
 	if err != nil {
 		t.Errorf("Unexpected err: %v", err)
 	}

pkg/kubelet/kubelet.go

Lines changed: 4 additions & 0 deletions
@@ -37,6 +37,7 @@ import (
 	clientgoclientset "k8s.io/client-go/kubernetes"

 	cadvisorapi "github.com/google/cadvisor/info/v1"
+	cadvisorapiv2 "github.com/google/cadvisor/info/v2"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
@@ -927,6 +928,9 @@ type Kubelet struct {
 	// Cached MachineInfo returned by cadvisor.
 	machineInfo *cadvisorapi.MachineInfo

+	// Cached RootFsInfo returned by cadvisor
+	rootfsInfo *cadvisorapiv2.FsInfo
+
 	// Handles certificate rotations.
 	serverCertificateManager certificate.Manager
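
The new rootfsInfo field mirrors the machineInfo pattern: cache cadvisor's answer the first time it is available rather than re-querying on every call. A minimal sketch of such a lazy cache (getRootFsInfo and the probe closure are hypothetical, not actual Kubelet methods):

```go
package main

import (
	"fmt"

	cadvisorapiv2 "github.com/google/cadvisor/info/v2"
)

type kubelet struct {
	rootfsInfo *cadvisorapiv2.FsInfo // nil until the first successful probe
}

// getRootFsInfo probes once and then serves the cached value on later calls.
func (kl *kubelet) getRootFsInfo(probe func() (cadvisorapiv2.FsInfo, error)) (cadvisorapiv2.FsInfo, error) {
	if kl.rootfsInfo == nil {
		info, err := probe()
		if err != nil {
			return cadvisorapiv2.FsInfo{}, err
		}
		kl.rootfsInfo = &info
	}
	return *kl.rootfsInfo, nil
}

func main() {
	kl := &kubelet{}
	info, err := kl.getRootFsInfo(func() (cadvisorapiv2.FsInfo, error) {
		// Invented figure: a 50Gi root partition.
		return cadvisorapiv2.FsInfo{Capacity: 50 << 30}, nil
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(info.Capacity) // 53687091200
}
```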
