Skip to content

Commit 48f95ca

Browse files
Add container.cpu.time metric (#5806)
1 parent 403882c commit 48f95ca

File tree

9 files changed

+192
-54
lines changed

9 files changed

+192
-54
lines changed

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationParserCgroupV1.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ public long GetHostCpuUsageInNanoseconds()
151151
$"'{_procStat}' should contain whitespace separated values according to POSIX. We've failed trying to get {i}th value. File content: '{new string(stat)}'.");
152152
}
153153

154-
stat = stat.Slice(next, stat.Length - next);
154+
stat = stat.Slice(next);
155155
}
156156

157157
return (long)(total / (double)_userHz * NanosecondsInSecond);

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationParserCgroupV2.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ public string GetCgroupPath(string filename)
131131
}
132132

133133
// Extract the part after the last colon and cache it for future use
134-
ReadOnlySpan<char> trimmedPath = fileContent.Slice(colonIndex + 1);
134+
ReadOnlySpan<char> trimmedPath = fileContent[(colonIndex + 1)..];
135135
_cachedCgroupPath = "/sys/fs/cgroup" + trimmedPath.ToString().TrimEnd('/') + "/";
136136

137137
return $"{_cachedCgroupPath}{filename}";
@@ -195,7 +195,7 @@ public long GetHostCpuUsageInNanoseconds()
195195
$"'{_procStat}' should contain whitespace separated values according to POSIX. We've failed trying to get {i}th value. File content: '{new string(stat)}'.");
196196
}
197197

198-
stat = stat.Slice(next, stat.Length - next);
198+
stat = stat.Slice(next);
199199
}
200200

201201
return (long)(total / (double)_userHz * NanosecondsInSecond);

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Linux/LinuxUtilizationProvider.cs

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
using System;
55
using System.Collections.Generic;
66
using System.Diagnostics.Metrics;
7-
using System.Linq;
87
using System.Threading;
98
using Microsoft.Extensions.Logging;
109
using Microsoft.Extensions.Logging.Abstractions;
@@ -17,6 +16,7 @@ internal sealed class LinuxUtilizationProvider : ISnapshotProvider
1716
{
1817
private const double One = 1.0;
1918
private const long Hundred = 100L;
19+
private const double NanosecondsInSecond = 1_000_000_000;
2020

2121
private readonly object _cpuLocker = new();
2222
private readonly object _memoryLocker = new();
@@ -82,14 +82,19 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
8282
(_previousCgroupCpuTime, _previousCgroupCpuPeriodCounter) = _parser.GetCgroupCpuUsageInNanosecondsAndCpuPeriodsV2();
8383

8484
_ = meter.CreateObservableGauge(
85-
ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
86-
() => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
87-
"1");
85+
name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization,
86+
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationLimit(cpuLimit)),
87+
unit: "1");
8888

8989
_ = meter.CreateObservableGauge(
9090
name: ResourceUtilizationInstruments.ContainerCpuRequestUtilization,
9191
observeValues: () => GetMeasurementWithRetry(() => CpuUtilizationRequest(cpuRequest)),
9292
unit: "1");
93+
94+
// container.cpu.time is documented as an ObservableCounter measured in seconds,
// so register it that way (same instrument kind, unit and description as the Windows provider).
_ = meter.CreateObservableCounter(
95+
name: ResourceUtilizationInstruments.ContainerCpuTime,
96+
observeValues: GetCpuTime,
97+
unit: "s", description: "CPU time used by the container.");
9398
}
9499
else
95100
{
@@ -111,12 +116,12 @@ public LinuxUtilizationProvider(IOptions<ResourceMonitoringOptions> options, ILi
111116

112117
_ = meter.CreateObservableGauge(
113118
name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization,
114-
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
119+
observeValues: () => GetMeasurementWithRetry(MemoryUtilization),
115120
unit: "1");
116121

117122
_ = meter.CreateObservableGauge(
118123
name: ResourceUtilizationInstruments.ProcessMemoryUtilization,
119-
observeValues: () => GetMeasurementWithRetry(() => MemoryUtilization()),
124+
observeValues: () => GetMeasurementWithRetry(MemoryUtilization),
120125
unit: "1");
121126

122127
// cpuRequest is a CPU request (aka guaranteed number of CPU units) for a pod; for the host it's 1 core
@@ -259,23 +264,32 @@ public Snapshot GetSnapshot()
259264
memoryUsageInBytes: memoryUsed);
260265
}
261266

262-
private IEnumerable<Measurement<double>> GetMeasurementWithRetry(Func<double> func)
267+
/// <summary>
/// Evaluates <paramref name="func"/> through <see cref="TryGetValueWithRetry{T}"/> and wraps the result for an observable instrument callback.
/// </summary>
/// <param name="func">Delegate that produces the value to report.</param>
/// <returns>
/// A single-element array holding the measurement when a value was obtained; otherwise an empty array.
/// </returns>
private Measurement<double>[] GetMeasurementWithRetry(Func<double> func)
268+
{
269+
// No value available: report nothing rather than a placeholder.
if (!TryGetValueWithRetry(func, out double value))
270+
{
271+
return Array.Empty<Measurement<double>>();
272+
}
273+
274+
return new[] { new Measurement<double>(value) };
275+
}
276+
277+
private bool TryGetValueWithRetry<T>(Func<T> func, out T value)
278+
where T : struct
263279
{
280+
value = default;
264281
if (Volatile.Read(ref _measurementsUnavailable) == 1 &&
265282
_timeProvider.GetUtcNow() - _lastFailure < _retryInterval)
266283
{
267-
return Enumerable.Empty<Measurement<double>>();
284+
return false;
268285
}
269286

270287
try
271288
{
272-
double result = func();
273-
if (Volatile.Read(ref _measurementsUnavailable) == 1)
274-
{
275-
_ = Interlocked.Exchange(ref _measurementsUnavailable, 0);
276-
}
289+
value = func();
290+
_ = Interlocked.CompareExchange(ref _measurementsUnavailable, 0, 1);
277291

278-
return new[] { new Measurement<double>(result) };
292+
return true;
279293
}
280294
catch (Exception ex) when (
281295
ex is System.IO.FileNotFoundException ||
@@ -285,12 +299,25 @@ ex is System.IO.DirectoryNotFoundException ||
285299
_lastFailure = _timeProvider.GetUtcNow();
286300
_ = Interlocked.Exchange(ref _measurementsUnavailable, 1);
287301

288-
return Enumerable.Empty<Measurement<double>>();
302+
return false;
289303
}
290304
}
291305

292306
// Math.Min() is used below to mitigate margin errors and various kinds of precisions losses
293307
// due to the fact that the calculation itself is not an atomic operation:
294308
private double CpuUtilizationRequest(double cpuRequest) => Math.Min(One, CpuUtilizationV2() / cpuRequest);
295309
private double CpuUtilizationLimit(double cpuLimit) => Math.Min(One, CpuUtilizationV2() / cpuLimit);
310+
311+
/// <summary>
/// Produces the measurements for the <c>container.cpu.time</c> instrument, each tagged with a <c>cpu.mode</c> attribute.
/// </summary>
/// <remarks>
/// Each value is read through <see cref="TryGetValueWithRetry{T}"/>; a measurement is yielded only when that read succeeds,
/// so the sequence may contain zero, one or two items.
/// </remarks>
/// <returns>A lazily evaluated sequence with up to one "system" and one "user" measurement.</returns>
private IEnumerable<Measurement<double>> GetCpuTime()
312+
{
313+
// "system" series: nanoseconds from the parser divided by NanosecondsInSecond to get seconds.
// NOTE(review): the value comes from the parser's *host* CPU usage reader - confirm this is the
// intended source for a container-scoped metric.
if (TryGetValueWithRetry(_parser.GetHostCpuUsageInNanoseconds, out long systemCpuTime))
314+
{
315+
yield return new Measurement<double>(systemCpuTime / NanosecondsInSecond, [new KeyValuePair<string, object?>("cpu.mode", "system")]);
316+
}
317+
318+
// "user" series: reported as returned by CpuUtilizationV2, with no unit conversion.
// NOTE(review): CpuUtilizationRequest/CpuUtilizationLimit in this class divide the same value by a
// CPU quota and clamp it to 1.0, which suggests it is a utilization figure rather than seconds - confirm.
if (TryGetValueWithRetry(CpuUtilizationV2, out double userCpuTime))
319+
{
320+
yield return new Measurement<double>(userCpuTime, [new KeyValuePair<string, object?>("cpu.mode", "user")]);
321+
}
322+
}
296323
}

src/Libraries/Microsoft.Extensions.Diagnostics.ResourceMonitoring/Windows/WindowsContainerSnapshotProvider.cs

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// The .NET Foundation licenses this file to you under the MIT license.
33

44
using System;
5+
using System.Collections.Generic;
56
using System.Diagnostics.CodeAnalysis;
67
using System.Diagnostics.Metrics;
78
using System.Threading;
@@ -17,6 +18,7 @@ internal sealed class WindowsContainerSnapshotProvider : ISnapshotProvider
1718
{
1819
private const double One = 1.0d;
1920
private const double Hundred = 100.0d;
21+
private const double TicksPerSecondDouble = TimeSpan.TicksPerSecond;
2022

2123
private readonly Lazy<MEMORYSTATUSEX> _memoryStatus;
2224

@@ -85,16 +87,16 @@ internal WindowsContainerSnapshotProvider(
8587

8688
_timeProvider = timeProvider;
8789

88-
using var jobHandle = _createJobHandleObject();
90+
using IJobHandle jobHandle = _createJobHandleObject();
8991

90-
var memoryLimitLong = GetMemoryLimit(jobHandle);
92+
ulong memoryLimitLong = GetMemoryLimit(jobHandle);
9193
_memoryLimit = memoryLimitLong;
9294
_cpuLimit = GetCpuLimit(jobHandle, systemInfo);
9395

9496
// CPU request (aka guaranteed CPU units) is not supported on Windows, so we set it to the same value as CPU limit (aka maximum CPU units).
9597
// Memory request (aka guaranteed memory) is not supported on Windows, so we set it to the same value as memory limit (aka maximum memory).
96-
var cpuRequest = _cpuLimit;
97-
var memoryRequest = memoryLimitLong;
98+
double cpuRequest = _cpuLimit;
99+
ulong memoryRequest = memoryLimitLong;
98100
Resources = new SystemResources(cpuRequest, _cpuLimit, memoryRequest, memoryLimitLong);
99101
_logger.SystemResourcesInfo(_cpuLimit, cpuRequest, memoryLimitLong, memoryRequest);
100102

@@ -110,10 +112,11 @@ internal WindowsContainerSnapshotProvider(
110112
// We don't dispose the meter because IMeterFactory handles that
111113
// An issue on analyzer side: https://github.com/dotnet/roslyn-analyzers/issues/6912
112114
// Related documentation: https://github.com/dotnet/docs/pull/37170
113-
var meter = meterFactory.Create(ResourceUtilizationInstruments.MeterName);
115+
Meter meter = meterFactory.Create(ResourceUtilizationInstruments.MeterName);
114116
#pragma warning restore CA2000 // Dispose objects before losing scope
115117

116118
// Container based metrics:
119+
_ = meter.CreateObservableCounter(name: ResourceUtilizationInstruments.ContainerCpuTime, observeValues: GetCpuTime, unit: "s", description: "CPU time used by the container.");
117120
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerCpuLimitUtilization, observeValue: CpuPercentage);
118121
_ = meter.CreateObservableGauge(name: ResourceUtilizationInstruments.ContainerMemoryLimitUtilization, observeValue: () => MemoryPercentage(() => _processInfo.GetMemoryUsage()));
119122

@@ -155,7 +158,7 @@ private static double GetCpuLimit(IJobHandle jobHandle, ISystemInfo systemInfo)
155158
cpuRatio = cpuLimit.CpuRate / CpuCycles;
156159
}
157160

158-
var systemInfoValue = systemInfo.GetSystemInfo();
161+
SYSTEM_INFO systemInfoValue = systemInfo.GetSystemInfo();
159162

160163
// Multiply the cpu ratio by the number of processors to get you the portion
161164
// of processors used from the system.
@@ -172,7 +175,7 @@ private ulong GetMemoryLimit(IJobHandle jobHandle)
172175

173176
if (memoryLimitInBytes <= 0)
174177
{
175-
var memoryStatus = _memoryStatus.Value;
178+
MEMORYSTATUSEX memoryStatus = _memoryStatus.Value;
176179

177180
// Technically, the unconstrained limit is memoryStatus.TotalPageFile.
178181
// Leaving this at physical as it is more understandable to consumers.
@@ -184,7 +187,7 @@ private ulong GetMemoryLimit(IJobHandle jobHandle)
184187

185188
private double MemoryPercentage(Func<ulong> getMemoryUsage)
186189
{
187-
var now = _timeProvider.GetUtcNow();
190+
DateTimeOffset now = _timeProvider.GetUtcNow();
188191

189192
lock (_memoryLocker)
190193
{
@@ -194,7 +197,7 @@ private double MemoryPercentage(Func<ulong> getMemoryUsage)
194197
}
195198
}
196199

197-
var memoryUsage = getMemoryUsage();
200+
ulong memoryUsage = getMemoryUsage();
198201

199202
lock (_memoryLocker)
200203
{
@@ -211,6 +214,17 @@ private double MemoryPercentage(Func<ulong> getMemoryUsage)
211214
}
212215
}
213216

217+
/// <summary>
/// Produces the measurements for the <c>container.cpu.time</c> instrument from the job handle's basic accounting information.
/// </summary>
/// <returns>
/// Two measurements, in this order: user time tagged <c>cpu.mode=user</c> and kernel time tagged <c>cpu.mode=system</c>,
/// each divided by <see cref="TimeSpan.TicksPerSecond"/>.
/// </returns>
private IEnumerable<Measurement<double>> GetCpuTime()
218+
{
219+
// A job handle is created for each enumeration; because this is an iterator, the using declaration
// disposes it when enumeration completes or the enumerator is disposed.
using IJobHandle jobHandle = _createJobHandleObject();
220+
var basicAccountingInfo = jobHandle.GetBasicAccountingInfo();
221+
222+
// Both values come from the same accounting snapshot taken above.
yield return new Measurement<double>(basicAccountingInfo.TotalUserTime / TicksPerSecondDouble,
223+
[new KeyValuePair<string, object?>("cpu.mode", "user")]);
224+
yield return new Measurement<double>(basicAccountingInfo.TotalKernelTime / TicksPerSecondDouble,
225+
[new KeyValuePair<string, object?>("cpu.mode", "system")]);
226+
}
227+
214228
private double CpuPercentage()
215229
{
216230
var now = _timeProvider.GetUtcNow();

src/Shared/Instruments/ResourceUtilizationInstruments.cs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@ internal static class ResourceUtilizationInstruments
1818
/// </summary>
1919
public const string MeterName = "Microsoft.Extensions.Diagnostics.ResourceMonitoring";
2020

21+
/// <summary>
22+
/// The name of an instrument to retrieve CPU time consumed by the specific container on all available CPU cores, measured in seconds.
23+
/// </summary>
24+
/// <remarks>
25+
/// The type of an instrument is <see cref="System.Diagnostics.Metrics.ObservableCounter{T}"/>.
26+
/// </remarks>
27+
public const string ContainerCpuTime = "container.cpu.time";
28+
2129
/// <summary>
2230
/// The name of an instrument to retrieve CPU limit consumption of all processes running inside a container or control group in range <c>[0, 1]</c>.
2331
/// </summary>

test/Libraries/Microsoft.Extensions.Diagnostics.HealthChecks.ResourceUtilization.Tests/ResourceHealthCheckExtensionsTests.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,7 @@ public async Task TestCpuAndMemoryChecks_WithMetrics(
500500
accountingInfoAfter1Ms.TotalUserTime = (long)(utilization * 100);
501501
jobHandleMock.SetupSequence(j => j.GetBasicAccountingInfo())
502502
.Returns(() => initialAccountingInfo) // this is called from the WindowsContainerSnapshotProvider's constructor
503+
.Returns(() => initialAccountingInfo) // this is called from the WindowsContainerSnapshotProvider's GetCpuTime method
503504
.Returns(() => accountingInfoAfter1Ms); // this is called from the WindowsContainerSnapshotProvider's CpuPercentage method
504505

505506
using var meter = new Meter("Microsoft.Extensions.Diagnostics.ResourceMonitoring");

0 commit comments

Comments
 (0)