@@ -37,6 +37,7 @@ import (
3737 info "github.com/google/cadvisor/info/v1"
3838 v2 "github.com/google/cadvisor/info/v2"
3939 "github.com/google/cadvisor/machine"
40+ "github.com/google/cadvisor/metrics"
4041 "github.com/google/cadvisor/nvm"
4142 "github.com/google/cadvisor/perf"
4243 "github.com/google/cadvisor/resctrl"
@@ -144,6 +145,8 @@ type Manager interface {
144145 AllPodmanContainers (c * info.ContainerInfoRequest ) (map [string ]info.ContainerInfo , error )
145146
146147 PodmanContainer (containerName string , query * info.ContainerInfoRequest ) (info.ContainerInfo , error )
148+
149+ GetOOMInfos () map [string ]* oomparser.ContainerOomInfo
147150}
148151
149152// Housekeeping configuration for the manager
@@ -153,7 +156,9 @@ type HouskeepingConfig = struct {
153156}
154157
155158// New takes a memory storage and returns a new manager.
156- func New (memoryCache * memory.InMemoryCache , sysfs sysfs.SysFs , houskeepingConfig HouskeepingConfig , includedMetricsSet container.MetricSet , collectorHTTPClient * http.Client , rawContainerCgroupPathPrefixWhiteList , containerEnvMetadataWhiteList []string , perfEventsFile string , resctrlInterval time.Duration ) (Manager , error ) {
159+ func New (memoryCache * memory.InMemoryCache , sysfs sysfs.SysFs , houskeepingConfig HouskeepingConfig , includedMetricsSet container.MetricSet ,
160+ collectorHTTPClient * http.Client , rawContainerCgroupPathPrefixWhiteList , containerEnvMetadataWhiteList []string ,
161+ perfEventsFile string , resctrlInterval time.Duration , f metrics.ContainerLabelsFunc , oomRetainDuration * time.Duration ) (Manager , error ) {
157162 if memoryCache == nil {
158163 return nil , fmt .Errorf ("manager requires memory storage" )
159164 }
@@ -208,6 +213,9 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
208213 collectorHTTPClient : collectorHTTPClient ,
209214 rawContainerCgroupPathPrefixWhiteList : rawContainerCgroupPathPrefixWhiteList ,
210215 containerEnvMetadataWhiteList : containerEnvMetadataWhiteList ,
216+ oomInfos : map [string ]* oomparser.ContainerOomInfo {},
217+ containerLabelFunc : f ,
218+ oomRetainDuration : oomRetainDuration ,
211219 }
212220
213221 machineInfo , err := machine .Info (sysfs , fsInfo , inHostNamespace )
@@ -247,6 +255,7 @@ type namespacedContainerName struct {
247255}
248256
249257type manager struct {
258+ oomInfos map [string ]* oomparser.ContainerOomInfo
250259 containers map [namespacedContainerName ]* containerData
251260 containersLock sync.RWMutex
252261 memoryCache * memory.InMemoryCache
@@ -271,6 +280,8 @@ type manager struct {
271280 rawContainerCgroupPathPrefixWhiteList []string
272281 // List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
273282 containerEnvMetadataWhiteList []string
283+ containerLabelFunc metrics.ContainerLabelsFunc
284+ oomRetainDuration * time.Duration
274285}
275286
276287func (m * manager ) PodmanContainer (containerName string , query * info.ContainerInfoRequest ) (info.ContainerInfo , error ) {
@@ -318,7 +329,7 @@ func (m *manager) Start() error {
318329 return err
319330 }
320331 klog .V (2 ).Infof ("Starting recovery of all containers" )
321- err = m .detectSubcontainers ("/" )
332+ err = m .detectSubContainers ("/" )
322333 if err != nil {
323334 return err
324335 }
@@ -340,6 +351,7 @@ func (m *manager) Start() error {
340351 quitUpdateMachineInfo := make (chan error )
341352 m .quitChannels = append (m .quitChannels , quitUpdateMachineInfo )
342353 go m .updateMachineInfo (quitUpdateMachineInfo )
354+ go m .cleanUpOomInfos ()
343355
344356 return nil
345357}
@@ -363,6 +375,61 @@ func (m *manager) Stop() error {
363375 return nil
364376}
365377
378+ func (m * manager ) GetOOMInfos () map [string ]* oomparser.ContainerOomInfo {
379+ m .containersLock .RLock ()
380+ defer m .containersLock .RUnlock ()
381+ oomInfos := make (map [string ]* oomparser.ContainerOomInfo )
382+ for k , v := range m .oomInfos {
383+ if time .Since (v .TimeOfDeath ) > * m .oomRetainDuration {
384+ continue
385+ }
386+ oomInfos [k ] = v
387+ }
388+ return oomInfos
389+ }
390+
391+ func (m * manager ) cleanUpOomInfos () {
392+ ticker := time .NewTicker (time .Minute )
393+ defer ticker .Stop ()
394+
395+ for {
396+ select {
397+ case <- ticker .C :
398+ m .containersLock .Lock ()
399+ for k , v := range m .oomInfos {
400+ if time .Since (v .TimeOfDeath ) > * m .oomRetainDuration {
401+ delete (m .oomInfos , k )
402+ }
403+ }
404+ m .containersLock .Unlock ()
405+ }
406+ }
407+ }
408+
409+ func (m * manager ) addOrUpdateOomInfo (cont * containerData , timeOfDeath time.Time ) error {
410+ m .containersLock .Lock ()
411+ defer m .containersLock .Unlock ()
412+
413+ contInfo , err := m .containerDataToContainerInfo (cont , & info.ContainerInfoRequest {
414+ NumStats : 60 ,
415+ })
416+ if err != nil {
417+ return err
418+ }
419+ if oomInfo , ok := m .oomInfos [contInfo .Id ]; ok {
420+ atomic .AddUint64 (& oomInfo .OomEvents , 1 )
421+ return nil
422+ }
423+ containerLabels := m .containerLabelFunc (contInfo )
424+ newOomInfo := & oomparser.ContainerOomInfo {
425+ MetricLabels : containerLabels ,
426+ TimeOfDeath : timeOfDeath ,
427+ }
428+ atomic .AddUint64 (& newOomInfo .OomEvents , 1 )
429+ m .oomInfos [contInfo .Id ] = newOomInfo
430+ return nil
431+ }
432+
366433func (m * manager ) destroyCollectors () {
367434 for _ , container := range m .containers {
368435 container .perfCollector .Destroy ()
@@ -406,7 +473,7 @@ func (m *manager) globalHousekeeping(quit chan error) {
406473 start := time .Now ()
407474
408475 // Check for new containers.
409- err := m .detectSubcontainers ("/" )
476+ err := m .detectSubContainers ("/" )
410477 if err != nil {
411478 klog .Errorf ("Failed to detect containers: %s" , err )
412479 }
@@ -1056,7 +1123,7 @@ func (m *manager) destroyContainerLocked(containerName string) error {
10561123
10571124// Detect all containers that have been added or deleted from the specified container.
10581125func (m * manager ) getContainersDiff (containerName string ) (added []info.ContainerReference , removed []info.ContainerReference , err error ) {
1059- // Get all subcontainers recursively.
1126+ // Get all subContainers recursively.
10601127 m .containersLock .RLock ()
10611128 cont , ok := m .containers [namespacedContainerName {
10621129 Name : containerName ,
@@ -1103,8 +1170,8 @@ func (m *manager) getContainersDiff(containerName string) (added []info.Containe
11031170 return
11041171}
11051172
1106- // Detect the existing subcontainers and reflect the setup here.
1107- func (m * manager ) detectSubcontainers (containerName string ) error {
1173+ // Detect the existing subContainers and reflect the setup here.
1174+ func (m * manager ) detectSubContainers (containerName string ) error {
11081175 added , removed , err := m .getContainersDiff (containerName )
11091176 if err != nil {
11101177 return err
@@ -1147,7 +1214,7 @@ func (m *manager) watchForNewContainers(quit chan error) error {
11471214 }
11481215
11491216 // There is a race between starting the watch and new container creation so we do a detection before we read new containers.
1150- err := m .detectSubcontainers ("/" )
1217+ err := m .detectSubContainers ("/" )
11511218 if err != nil {
11521219 return err
11531220 }
@@ -1247,7 +1314,9 @@ func (m *manager) watchForNewOoms() error {
12471314 continue
12481315 }
12491316 for _ , cont := range conts {
1250- atomic .AddUint64 (& cont .oomEvents , 1 )
1317+ if err := m .addOrUpdateOomInfo (cont , oomInstance .TimeOfDeath ); err != nil {
1318+ klog .Errorf ("failed to add OOM info for %q: %v" , oomInstance .ContainerName , err )
1319+ }
12511320 }
12521321 }
12531322 }()
0 commit comments