
Commit 27a6508

sled agent should archive log files on startup and zone shutdown (#9226)
1 parent a3bf7ea commit 27a6508

File tree

12 files changed

+654
-117
lines changed


Cargo.lock

Lines changed: 1 addition & 1 deletion

common/src/zpool_name.rs

Lines changed: 8 additions & 0 deletions
@@ -120,7 +120,15 @@ impl ZpoolName {
 
     /// Returns a path to a dataset's mountpoint within the zpool.
     ///
+    /// `dataset` is relative to the root of the pool. For example, if your
+    /// pool is `oxp_123`, then your full dataset name would be something like
+    /// `oxp_123/foo/bar`. You'd pass `foo/bar` to this function and it would
+    /// return something like `/pool/ext/123/foo/bar`.
+    ///
     /// For example: oxp_(UUID) -> /pool/ext/(UUID)/(dataset)
+    ///
+    /// In this example, the value of UUID here is implied by `self`. `dataset`
+    /// is the argument to this function.
     pub fn dataset_mountpoint(
         &self,
         root: &Utf8Path,
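
To illustrate the new doc comment, a call might look roughly like the sketch below; the parameters after `root` and the return type are assumptions based on the comment, not taken from this diff.

    // Hypothetical sketch. `pool` is a ZpoolName for the doc comment's example
    // pool `oxp_123`; the second argument is assumed to be the dataset name
    // relative to the pool root.
    let mountpoint = pool.dataset_mountpoint(Utf8Path::new("/"), "foo/bar");
    // Per the doc comment, this yields something like:
    assert_eq!(mountpoint.as_str(), "/pool/ext/123/foo/bar");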

illumos-utils/src/zfs.rs

Lines changed: 1 addition & 1 deletion
@@ -1287,7 +1287,7 @@ impl Zfs {
     /// Calls "zfs get" to acquire multiple values
     ///
     /// - `names`: The properties being acquired
-    /// - `source`: The optioanl property source (origin of the property)
+    /// - `source`: The optional property source (origin of the property)
     ///   Defaults to "all sources" when unspecified.
     pub async fn get_values<const N: usize>(
         filesystem_name: &str,
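
For context on the doc comment being fixed here, a call might look roughly like the sketch below; the parameters after `filesystem_name` and the return shape are assumptions inferred from the comment, not shown in this diff.

    // Hypothetical sketch: fetch two properties from one filesystem, passing
    // no explicit source (i.e. "all sources"). The dataset name is illustrative.
    let [available, used] = Zfs::get_values(
        "oxp_example/crypt/debug",
        &["available", "used"],
        None,
    )
    .await?;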

sled-agent/config-reconciler/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -29,6 +29,7 @@ nexus-sled-agent-shared.workspace = true
 ntp-admin-client.workspace = true
 omicron-common.workspace = true
 omicron-uuid-kinds.workspace = true
+rand.workspace = true
 serde.workspace = true
 sha2.workspace = true
 sled-agent-api.workspace = true

sled-agent/config-reconciler/src/dump_setup.rs

Lines changed: 171 additions & 19 deletions
@@ -108,6 +108,7 @@ use slog::info;
 use slog::o;
 use slog::trace;
 use slog::warn;
+use slog_error_chain::InlineErrorChain;
 use std::collections::HashSet;
 use std::ffi::OsString;
 use std::path::{Path, PathBuf};
@@ -186,6 +187,11 @@ trait GetMountpoint: AsRef<ZpoolName> {
 
 #[derive(Debug)]
 enum DumpSetupCmd {
+    ArchiveFormerZoneRoot {
+        zone_root: Utf8PathBuf,
+        zone_name: String,
+        completion_tx: oneshot::Sender<()>,
+    },
     UpdateDumpdevSetup {
         dump_slices: Vec<DumpSlicePath>,
         debug_datasets: Vec<DebugZpool>,
@@ -254,19 +260,21 @@ impl DumpSetup {
         let mut m2_core_datasets = Vec::new();
         let mount_config = self.mount_config.clone();
         for disk in disks {
-            if disk.is_synthetic() {
-                // We only setup dump devices on real disks
-                continue;
-            }
             match disk.variant() {
                 DiskVariant::M2 => {
-                    match disk.dump_device_devfs_path(false) {
-                        Ok(path) => m2_dump_slices.push(DumpSlicePath(path)),
-                        Err(err) => {
-                            warn!(
-                                log,
-                                "Error getting dump device devfs path: {err:?}"
-                            );
+                    // We only setup dump devices on real disks
+                    if !disk.is_synthetic() {
+                        match disk.dump_device_devfs_path(false) {
+                            Ok(path) => {
+                                m2_dump_slices.push(DumpSlicePath(path))
+                            }
+                            Err(err) => {
+                                warn!(
+                                    log,
+                                    "Error getting dump device devfs path: \
+                                     {err:?}"
+                                );
+                            }
                         }
                     }
                     let name = disk.zpool_name();
@@ -327,6 +335,65 @@ impl DumpSetup {
             error!(log, "DumpSetup failed to await update"; "err" => ?err);
         }
     }
+
+    /// Request archive of logs from the specified directory, which is assumed
+    /// to correspond to the root filesystem of a zone that is no longer
+    /// running.
+    ///
+    /// Unlike typical log file archival, this includes non-rotated log files.
+    ///
+    /// This makes a best-effort and logs failures rather than reporting them to
+    /// the caller.
+    ///
+    /// When this future completes, the request has only been enqueued. To know
+    /// when archival has completed, you must wait on the receive side of
+    /// `completion_tx`.
+    pub async fn archive_former_zone_root(
+        &self,
+        zone_root: &Utf8Path,
+        completion_tx: oneshot::Sender<()>,
+    ) {
+        let log = self.log.new(o!("zone_root" => zone_root.to_string()));
+
+        // Validate the path that we were given. We're only ever given zone
+        // root filesystems, whose basename is always a zonename, and we always
+        // prefix our zone names with `oxz_`. If that's not what we find here,
+        // log an error and bail out. These error cases should be impossible to
+        // hit in practice.
+        let Some(file_name) = zone_root.file_name() else {
+            error!(
+                log,
+                "cannot archive former zone root";
+                "error" => "path has no filename part",
+            );
+            return;
+        };
+
+        if !file_name.starts_with("oxz_") {
+            error!(
+                log,
+                "cannot archive former zone root";
+                "error" => "filename does not start with \"oxz_\"",
+            );
+            return;
+        }
+
+        info!(log, "requesting archive of former zone root");
+        let zone_root = zone_root.to_owned();
+        let zone_name = file_name.to_string();
+        let cmd = DumpSetupCmd::ArchiveFormerZoneRoot {
+            zone_root,
+            zone_name,
+            completion_tx,
+        };
+        if let Err(_) = self.tx.send(cmd).await {
+            error!(
+                log,
+                "failed to request archive of former zone root";
+                "error" => "DumpSetup channel closed"
+            );
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
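
The doc comment above describes a request/completion pattern: the call only enqueues work on the DumpSetup channel, and the worker later signals completion over the oneshot sender. A hedged caller sketch, assuming tokio's oneshot channel and a hypothetical `dump_setup` handle and zone path:

    // Hypothetical caller sketch; names and the zone path are illustrative.
    let (completion_tx, completion_rx) = tokio::sync::oneshot::channel();
    dump_setup
        .archive_former_zone_root(
            Utf8Path::new("/some/zonepath/oxz_example"),
            completion_tx,
        )
        .await;
    // The call above only enqueues the request. Wait for the worker to finish;
    // an Err here just means the sender was dropped, which is already logged.
    let _ = completion_rx.await;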
@@ -586,6 +653,36 @@ impl DumpSetupWorker {
                         core_datasets,
                     );
                 }
+                Ok(Some(DumpSetupCmd::ArchiveFormerZoneRoot {
+                    zone_root,
+                    zone_name,
+                    completion_tx,
+                })) => {
+                    match self
+                        .do_archive_former_zone_root(
+                            &zone_root,
+                            &zone_name,
+                            completion_tx,
+                        )
+                        .await
+                    {
+                        Ok(()) => {
+                            info!(
+                                self.log,
+                                "Archived logs from former zone root";
+                                "zone_root" => %zone_root
+                            );
+                        }
+                        Err(error) => {
+                            error!(
+                                self.log,
+                                "Failed to archive former zone root";
+                                "zone_root" => %zone_root,
+                                InlineErrorChain::new(&error),
+                            );
+                        }
+                    }
+                }
                 Ok(None) => {
                     warn!(
                         self.log,
@@ -962,7 +1059,7 @@ impl DumpSetupWorker {
            );
        }
 
-        if let Err(err) = self.archive_logs().await {
+        if let Err(err) = self.archive_logs_from_running_zones().await {
            if !matches!(err, ArchiveLogsError::NoDebugDirYet) {
                error!(
                    self.log,
@@ -994,7 +1091,9 @@ impl DumpSetupWorker {
         Ok(())
     }
 
-    async fn archive_logs(&self) -> Result<(), ArchiveLogsError> {
+    async fn archive_logs_from_running_zones(
+        &self,
+    ) -> Result<(), ArchiveLogsError> {
         let debug_dir = self
             .chosen_debug_dir
             .as_ref()
@@ -1007,27 +1106,74 @@ impl DumpSetupWorker {
                 zone.path().join("root/var/svc/log")
             };
             let zone_name = zone.name();
-            self.archive_logs_inner(debug_dir, logdir, zone_name).await?;
+            self.archive_logs_from_zone_path(
+                debug_dir, logdir, zone_name, false,
+            )
+            .await?;
         }
         Ok(())
     }
 
-    async fn archive_logs_inner(
+    async fn do_archive_former_zone_root(
+        &self,
+        zone_root: &Utf8Path,
+        zone_name: &str,
+        completion_tx: oneshot::Sender<()>,
+    ) -> Result<(), ArchiveLogsError> {
+        let debug_dir = self
+            .chosen_debug_dir
+            .as_ref()
+            .ok_or(ArchiveLogsError::NoDebugDirYet)?;
+        let logdir = zone_root.join("root/var/svc/log");
+        let rv = self
+            .archive_logs_from_zone_path(
+                debug_dir,
+                logdir.into(),
+                zone_name,
+                true,
+            )
+            .await;
+        if let Err(()) = completion_tx.send(()) {
+            // In practice, it would be surprising for our caller to have
+            // dropped this channel. Make a note.
+            warn!(
+                self.log,
+                "Failed to report completion of former zone root archival";
+                "error" => "completion channel closed",
+            );
+        }
+        rv
+    }
+
+    async fn archive_logs_from_zone_path(
         &self,
         debug_dir: &DebugDataset,
         logdir: PathBuf,
         zone_name: &str,
+        include_live: bool,
     ) -> Result<(), ArchiveLogsError> {
         let mut rotated_log_files = Vec::new();
-        // patterns matching archived logs, e.g. foo.log.3
-        // keep checking for greater numbers of digits until we don't find any
-        for n in 1..9 {
+        if include_live {
             let pattern = logdir
-                .join(format!("*.log.{}", "[0-9]".repeat(n)))
+                .join("*.log*")
                 .to_str()
                 .ok_or_else(|| ArchiveLogsError::Utf8(zone_name.to_string()))?
                 .to_string();
             rotated_log_files.extend(glob::glob(&pattern)?.flatten());
+        } else {
+            // patterns matching archived logs, e.g. foo.log.3
+            // keep checking for greater numbers of digits until we don't find
+            // any
+            for n in 1..9 {
+                let pattern = logdir
+                    .join(format!("*.log.{}", "[0-9]".repeat(n)))
+                    .to_str()
+                    .ok_or_else(|| {
+                        ArchiveLogsError::Utf8(zone_name.to_string())
+                    })?
+                    .to_string();
+                rotated_log_files.extend(glob::glob(&pattern)?.flatten());
+            }
         }
         let dest_dir = debug_dir.as_ref().join(zone_name).into_std_path_buf();
         if !rotated_log_files.is_empty() {
@@ -1037,6 +1183,12 @@ impl DumpSetupWorker {
                 self.log,
                 "Archiving {count} log files from {zone_name} zone"
             );
+        } else if include_live {
+            warn!(
+                self.log,
+                "Found no log files from {zone_name} zone, including live \
+                 log files"
+            );
         }
         for entry in rotated_log_files {
             let src_name = entry.file_name().unwrap();