Skip to content

Commit 0cbb775

Browse files
committed
Auto merge of #4645 - ferrous-systems:pa-tcp-user-timeout, r=Turbo87
Set `TCP_USER_TIMEOUT` to 15 seconds for database connections In the December 22, 2021 crates.io outage, we experienced full packet loss between the database server and the application server(s). While the crates.io application supports running without a database during outages, those mitigations kicked in only after a bit more than 15 minutes of the packet loss starting. The default parameters of the Linux network stack result in a TCP connection being marked as broken after roughly 15 minutes of no acknowledgements being received, which is what I believe happened during that outage. Broken database mitigations kicking in after 15 minutes is too long for crates.io, as we ideally want those mitigations to kick in as soon as possible (ensuring crates.io users can continue downloading crates). This PR tells libpq (the underlying PostgreSQL client used by diesel) to set the Linux network stack timeout to a configurable value, which we set as **15 seconds**. With this change, if an outage like the one on Dec 22 2021 one happens again, crates.io will be fully unavailable only for 15 seconds rather than 15 minutes. Note that the "15 seconds" value is a hunch I had, I'm open on feedback on different values we could use. This PR intentionally doesn't have any tests: while I have a local commit changing ChaosProxy to simulate packet loss, that's done by shelling out to `iptables`, which is both linux-only and root-only. I am not aware of any cross-platform, unprivileged tool we could use to simulate this in our test suite unfortunately. Huge thanks to `@weiznich` in diesel-rs/diesel#3016 for helping debug this issue and for trying possible fixes on the diesel side.
2 parents ae50554 + 399c8bb commit 0cbb775

File tree

15 files changed

+90
-42
lines changed

15 files changed

+90
-42
lines changed

src/admin/delete_crate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub struct Opts {
1414
}
1515

1616
pub fn run(opts: Opts) {
17-
let conn = db::connect_now().unwrap();
17+
let conn = db::oneoff_connection().unwrap();
1818
conn.transaction::<_, diesel::result::Error, _>(|| {
1919
delete(opts, &conn);
2020
Ok(())

src/admin/delete_version.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ pub struct Opts {
2121
}
2222

2323
pub fn run(opts: Opts) {
24-
let conn = db::connect_now().unwrap();
24+
let conn = db::oneoff_connection().unwrap();
2525
conn.transaction::<_, diesel::result::Error, _>(|| {
2626
delete(opts, &conn);
2727
Ok(())

src/admin/migrate.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,15 @@ diesel_migrations::embed_migrations!("./migrations");
1111
pub struct Opts;
1212

1313
pub fn run(_opts: Opts) -> Result<(), Error> {
14-
let db_config = crate::config::DatabasePools::full_from_environment();
14+
let config = crate::config::DatabasePools::full_from_environment(
15+
&crate::config::Base::from_environment(),
16+
);
1517

1618
// TODO: Refactor logic so that we can also check things from App::new() here.
1719
// If the app will panic due to bad configuration, it is better to error in the release phase
1820
// to avoid launching dynos that will fail.
1921

20-
if db_config.are_all_read_only() {
22+
if config.are_all_read_only() {
2123
// TODO: Check `any_pending_migrations()` with a read-only connection and error if true.
2224
// It looks like this requires changes upstream to make this pub in `migration_macros`.
2325

@@ -29,13 +31,14 @@ pub fn run(_opts: Opts) -> Result<(), Error> {
2931
return Ok(());
3032
}
3133

32-
println!("==> migrating the database");
3334
// The primary is online, access directly via `DATABASE_URL`.
34-
let conn = crate::db::connect_now()?;
35+
let conn = crate::db::oneoff_connection_with_config(&config)?;
36+
37+
println!("==> migrating the database");
3538
embedded_migrations::run_with_output(&conn, &mut std::io::stdout())?;
3639

3740
println!("==> synchronizing crate categories");
38-
crate::boot::categories::sync(CATEGORIES_TOML).unwrap();
41+
crate::boot::categories::sync_with_connection(CATEGORIES_TOML, &conn).unwrap();
3942

4043
Ok(())
4144
}

src/admin/populate.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ pub struct Opts {
1414
}
1515

1616
pub fn run(opts: Opts) {
17-
let conn = db::connect_now().unwrap();
17+
let conn = db::oneoff_connection().unwrap();
1818
conn.transaction(|| update(opts, &conn)).unwrap();
1919
}
2020

src/admin/render_readmes.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ pub struct Opts {
4040

4141
pub fn run(opts: Opts) -> anyhow::Result<()> {
4242
let base_config = Arc::new(config::Base::from_environment());
43-
let conn = db::connect_now().unwrap();
43+
let conn = db::oneoff_connection().unwrap();
4444

4545
let start_time = Utc::now();
4646

src/admin/transfer_crates.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ pub struct Opts {
2121
}
2222

2323
pub fn run(opts: Opts) {
24-
let conn = db::connect_now().unwrap();
24+
let conn = db::oneoff_connection().unwrap();
2525
conn.transaction::<_, diesel::result::Error, _>(|| {
2626
transfer(opts, &conn);
2727
Ok(())

src/admin/verify_token.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ pub struct Opts {
1313
}
1414

1515
pub fn run(opts: Opts) -> AppResult<()> {
16-
let conn = db::connect_now()?;
16+
let conn = db::oneoff_connection()?;
1717
let user = User::find_by_api_token(&conn, &opts.api_token)?;
1818
println!("The token belongs to user {}", user.gh_login);
1919
Ok(())

src/app.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ impl App {
100100
let thread_pool = Arc::new(ScheduledThreadPool::new(db_helper_threads));
101101

102102
let primary_database = if config.use_test_database_pool {
103-
DieselPool::new_test(&config.db.primary.url)
103+
DieselPool::new_test(&config.db, &config.db.primary.url)
104104
} else {
105105
let primary_db_connection_config = ConnectionConfig {
106106
statement_timeout: db_connection_timeout,
@@ -116,6 +116,7 @@ impl App {
116116

117117
DieselPool::new(
118118
&config.db.primary.url,
119+
&config.db,
119120
primary_db_config,
120121
instance_metrics
121122
.database_time_to_obtain_connection
@@ -126,7 +127,7 @@ impl App {
126127

127128
let replica_database = if let Some(pool_config) = config.db.replica.as_ref() {
128129
if config.use_test_database_pool {
129-
Some(DieselPool::new_test(&pool_config.url))
130+
Some(DieselPool::new_test(&config.db, &pool_config.url))
130131
} else {
131132
let replica_db_connection_config = ConnectionConfig {
132133
statement_timeout: db_connection_timeout,
@@ -143,6 +144,7 @@ impl App {
143144
Some(
144145
DieselPool::new(
145146
&pool_config.url,
147+
&config.db,
146148
replica_db_config,
147149
instance_metrics
148150
.database_time_to_obtain_connection

src/bin/background-worker.rs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,10 @@ use std::time::Duration;
2424
fn main() {
2525
println!("Booting runner");
2626

27-
let db_config = config::DatabasePools::full_from_environment();
28-
let base_config = config::Base::from_environment();
29-
let uploader = base_config.uploader();
27+
let config = config::Server::default();
28+
let uploader = config.base.uploader();
3029

31-
if db_config.are_all_read_only() {
30+
if config.db.are_all_read_only() {
3231
loop {
3332
println!(
3433
"Cannot run background jobs with a read-only pool. Please scale background_worker \
@@ -38,7 +37,7 @@ fn main() {
3837
}
3938
}
4039

41-
let db_url = db::connection_url(&db_config.primary.url);
40+
let db_url = db::connection_url(&config.db, &config.db.primary.url);
4241

4342
let job_start_timeout = dotenv::var("BACKGROUND_JOB_TIMEOUT")
4443
.unwrap_or_else(|_| "30".into())

src/bin/enqueue-job.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use swirl::schema::background_jobs::dsl::*;
77
use swirl::Job;
88

99
fn main() -> Result<()> {
10-
let conn = db::connect_now()?;
10+
let conn = db::oneoff_connection()?;
1111
let mut args = std::env::args().skip(1);
1212

1313
let job = args.next().unwrap_or_default();

0 commit comments

Comments
 (0)