Skip to content

Commit 35f5197

Browse files
author
Kent Overstreet
committed
bcachefs: Improve journal pin flushing
Running the preempt tiering tests with a lower than normal journal reclaim delay turned up a shutdown hang - a lost wakeup, caused because flushing a journal pin (e.g. key cache/write buffer) can generate a new journal pin. The "simple" fix of adding the correct wakeup didn't work because of ordering issues; if we flush btree node pins too aggressively before other pins have completed, we end up spinning where each flush iteration generates new work. So to fix this correctly: - The list of flushed journal pins is now broken out by type, so that we can wait for key cache/write buffer pin flushing to complete before flushing dirty btree nodes - A new closure_waitlist is added for bch2_journal_flush_pins; this one is only used under or when we're taking the journal lock, so it's pretty cheap to add rigorously correct wakeups to journal_pin_set() and journal_pin_drop(). Additionally, bch2_journal_seq_pins_to_text() is moved to journal_reclaim.c, where it belongs, along with a bit of other small renaming and refactoring. Besides fixing the hang, the better ordering between key cache/write buffer flushing and btree node flushing should help or fix the "unmount taking excessively long" a few users have been noticing. Signed-off-by: Kent Overstreet <[email protected]>
1 parent 0c74c85 commit 35f5197

File tree

6 files changed

+136
-85
lines changed

6 files changed

+136
-85
lines changed

fs/bcachefs/debug.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "extents.h"
2121
#include "fsck.h"
2222
#include "inode.h"
23+
#include "journal_reclaim.h"
2324
#include "super.h"
2425

2526
#include <linux/console.h>

fs/bcachefs/journal.c

Lines changed: 4 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq)
113113

114114
static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
115115
{
116-
unsigned i;
117-
118-
for (i = 0; i < ARRAY_SIZE(p->list); i++)
119-
INIT_LIST_HEAD(&p->list[i]);
120-
INIT_LIST_HEAD(&p->flushed);
116+
for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++)
117+
INIT_LIST_HEAD(&p->unflushed[i]);
118+
for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
119+
INIT_LIST_HEAD(&p->flushed[i]);
121120
atomic_set(&p->count, count);
122121
p->devs.nr = 0;
123122
}
@@ -1626,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
16261625
__bch2_journal_debug_to_text(out, j);
16271626
spin_unlock(&j->lock);
16281627
}
1629-
1630-
bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
1631-
{
1632-
struct journal_entry_pin_list *pin_list;
1633-
struct journal_entry_pin *pin;
1634-
1635-
spin_lock(&j->lock);
1636-
if (!test_bit(JOURNAL_running, &j->flags)) {
1637-
spin_unlock(&j->lock);
1638-
return true;
1639-
}
1640-
1641-
*seq = max(*seq, j->pin.front);
1642-
1643-
if (*seq >= j->pin.back) {
1644-
spin_unlock(&j->lock);
1645-
return true;
1646-
}
1647-
1648-
out->atomic++;
1649-
1650-
pin_list = journal_seq_pin(j, *seq);
1651-
1652-
prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
1653-
printbuf_indent_add(out, 2);
1654-
1655-
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
1656-
list_for_each_entry(pin, &pin_list->list[i], list)
1657-
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
1658-
1659-
if (!list_empty(&pin_list->flushed))
1660-
prt_printf(out, "flushed:\n");
1661-
1662-
list_for_each_entry(pin, &pin_list->flushed, list)
1663-
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
1664-
1665-
printbuf_indent_sub(out, 2);
1666-
1667-
--out->atomic;
1668-
spin_unlock(&j->lock);
1669-
1670-
return false;
1671-
}
1672-
1673-
void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
1674-
{
1675-
u64 seq = 0;
1676-
1677-
while (!bch2_journal_seq_pins_to_text(out, j, &seq))
1678-
seq++;
1679-
}

fs/bcachefs/journal.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,8 +430,6 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u
430430

431431
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
432432
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
433-
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
434-
bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
435433

436434
int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
437435
unsigned nr);

fs/bcachefs/journal_reclaim.c

Lines changed: 121 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -327,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j)
327327
popped = true;
328328
}
329329

330-
if (popped)
330+
if (popped) {
331331
bch2_journal_space_available(j);
332+
__closure_wake_up(&j->reclaim_flush_wait);
333+
}
332334
}
333335

334336
bool __bch2_journal_pin_put(struct journal *j, u64 seq)
@@ -362,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j,
362364
pin->seq = 0;
363365
list_del_init(&pin->list);
364366

367+
if (j->reclaim_flush_wait.list.first)
368+
__closure_wake_up(&j->reclaim_flush_wait);
369+
365370
/*
366371
* Unpinning a journal entry may make journal_next_bucket() succeed, if
367372
* writing a new last_seq will now make another bucket available:
@@ -383,11 +388,11 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
383388
{
384389
if (fn == bch2_btree_node_flush0 ||
385390
fn == bch2_btree_node_flush1)
386-
return JOURNAL_PIN_btree;
391+
return JOURNAL_PIN_TYPE_btree;
387392
else if (fn == bch2_btree_key_cache_journal_flush)
388-
return JOURNAL_PIN_key_cache;
393+
return JOURNAL_PIN_TYPE_key_cache;
389394
else
390-
return JOURNAL_PIN_other;
395+
return JOURNAL_PIN_TYPE_other;
391396
}
392397

393398
static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
@@ -406,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
406411
atomic_inc(&pin_list->count);
407412
pin->seq = seq;
408413
pin->flush = flush_fn;
409-
list_add(&pin->list, &pin_list->list[type]);
414+
415+
if (list_empty(&pin_list->unflushed[type]) &&
416+
j->reclaim_flush_wait.list.first)
417+
__closure_wake_up(&j->reclaim_flush_wait);
418+
419+
list_add(&pin->list, &pin_list->unflushed[type]);
410420
}
411421

412422
void bch2_journal_pin_copy(struct journal *j,
@@ -499,16 +509,15 @@ journal_get_next_pin(struct journal *j,
499509
{
500510
struct journal_entry_pin_list *pin_list;
501511
struct journal_entry_pin *ret = NULL;
502-
unsigned i;
503512

504513
fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
505514
if (*seq > seq_to_flush && !allowed_above_seq)
506515
break;
507516

508-
for (i = 0; i < JOURNAL_PIN_NR; i++)
509-
if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) ||
510-
((1U << i) & allowed_above_seq)) {
511-
ret = list_first_entry_or_null(&pin_list->list[i],
517+
for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
518+
if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) ||
519+
(BIT(i) & allowed_above_seq)) {
520+
ret = list_first_entry_or_null(&pin_list->unflushed[i],
512521
struct journal_entry_pin, list);
513522
if (ret)
514523
return ret;
@@ -544,16 +553,18 @@ static size_t journal_flush_pins(struct journal *j,
544553
}
545554

546555
if (min_key_cache) {
547-
allowed_above |= 1U << JOURNAL_PIN_key_cache;
548-
allowed_below |= 1U << JOURNAL_PIN_key_cache;
556+
allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache);
557+
allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache);
549558
}
550559

551560
cond_resched();
552561

553562
j->last_flushed = jiffies;
554563

555564
spin_lock(&j->lock);
556-
pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq);
565+
pin = journal_get_next_pin(j, seq_to_flush,
566+
allowed_below,
567+
allowed_above, &seq);
557568
if (pin) {
558569
BUG_ON(j->flush_in_progress);
559570
j->flush_in_progress = pin;
@@ -576,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j,
576587
spin_lock(&j->lock);
577588
/* Pin might have been dropped or rearmed: */
578589
if (likely(!err && !j->flush_in_progress_dropped))
579-
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
590+
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
580591
j->flush_in_progress = NULL;
581592
j->flush_in_progress_dropped = false;
582593
spin_unlock(&j->lock);
@@ -816,23 +827,60 @@ int bch2_journal_reclaim_start(struct journal *j)
816827
return 0;
817828
}
818829

830+
static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
831+
unsigned types)
832+
{
833+
struct journal_entry_pin_list *pin_list;
834+
u64 seq;
835+
836+
spin_lock(&j->lock);
837+
fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
838+
if (seq > seq_to_flush)
839+
break;
840+
841+
for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
842+
if ((BIT(i) & types) &&
843+
(!list_empty(&pin_list->unflushed[i]) ||
844+
!list_empty(&pin_list->flushed[i]))) {
845+
spin_unlock(&j->lock);
846+
return true;
847+
}
848+
}
849+
spin_unlock(&j->lock);
850+
851+
return false;
852+
}
853+
854+
static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush,
855+
unsigned types)
856+
{
857+
return journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) ||
858+
journal_pins_still_flushing(j, seq_to_flush, types);
859+
}
860+
819861
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
820862
bool *did_work)
821863
{
822-
int ret;
864+
int ret = 0;
823865

824866
ret = bch2_journal_error(j);
825867
if (ret)
826868
return ret;
827869

828870
mutex_lock(&j->reclaim_lock);
829871

830-
if (journal_flush_pins(j, seq_to_flush,
831-
(1U << JOURNAL_PIN_key_cache)|
832-
(1U << JOURNAL_PIN_other), 0, 0, 0) ||
833-
journal_flush_pins(j, seq_to_flush,
834-
(1U << JOURNAL_PIN_btree), 0, 0, 0))
872+
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
873+
BIT(JOURNAL_PIN_TYPE_key_cache)|
874+
BIT(JOURNAL_PIN_TYPE_other))) {
875+
*did_work = true;
876+
goto unlock;
877+
}
878+
879+
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
880+
BIT(JOURNAL_PIN_TYPE_btree))) {
835881
*did_work = true;
882+
goto unlock;
883+
}
836884

837885
if (seq_to_flush > journal_cur_seq(j))
838886
bch2_journal_entry_close(j);
@@ -847,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
847895
!fifo_used(&j->pin);
848896

849897
spin_unlock(&j->lock);
898+
unlock:
850899
mutex_unlock(&j->reclaim_lock);
851900

852901
return ret;
@@ -860,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
860909
if (!test_bit(JOURNAL_running, &j->flags))
861910
return false;
862911

863-
closure_wait_event(&j->async_wait,
912+
closure_wait_event(&j->reclaim_flush_wait,
864913
journal_flush_done(j, seq_to_flush, &did_work));
865914

866915
return did_work;
@@ -926,3 +975,54 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
926975

927976
return ret;
928977
}
978+
979+
bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
980+
{
981+
struct journal_entry_pin_list *pin_list;
982+
struct journal_entry_pin *pin;
983+
984+
spin_lock(&j->lock);
985+
if (!test_bit(JOURNAL_running, &j->flags)) {
986+
spin_unlock(&j->lock);
987+
return true;
988+
}
989+
990+
*seq = max(*seq, j->pin.front);
991+
992+
if (*seq >= j->pin.back) {
993+
spin_unlock(&j->lock);
994+
return true;
995+
}
996+
997+
out->atomic++;
998+
999+
pin_list = journal_seq_pin(j, *seq);
1000+
1001+
prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
1002+
printbuf_indent_add(out, 2);
1003+
1004+
prt_printf(out, "unflushed:\n");
1005+
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
1006+
list_for_each_entry(pin, &pin_list->unflushed[i], list)
1007+
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
1008+
1009+
prt_printf(out, "flushed:\n");
1010+
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++)
1011+
list_for_each_entry(pin, &pin_list->flushed[i], list)
1012+
prt_printf(out, "\t%px %ps\n", pin, pin->flush);
1013+
1014+
printbuf_indent_sub(out, 2);
1015+
1016+
--out->atomic;
1017+
spin_unlock(&j->lock);
1018+
1019+
return false;
1020+
}
1021+
1022+
void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
1023+
{
1024+
u64 seq = 0;
1025+
1026+
while (!bch2_journal_seq_pins_to_text(out, j, &seq))
1027+
seq++;
1028+
}

fs/bcachefs/journal_reclaim.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j)
7878

7979
int bch2_journal_flush_device_pins(struct journal *, int);
8080

81+
void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
82+
bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
83+
8184
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */

fs/bcachefs/journal_types.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,15 @@ struct journal_buf {
5353
*/
5454

5555
enum journal_pin_type {
56-
JOURNAL_PIN_btree,
57-
JOURNAL_PIN_key_cache,
58-
JOURNAL_PIN_other,
59-
JOURNAL_PIN_NR,
56+
JOURNAL_PIN_TYPE_btree,
57+
JOURNAL_PIN_TYPE_key_cache,
58+
JOURNAL_PIN_TYPE_other,
59+
JOURNAL_PIN_TYPE_NR,
6060
};
6161

6262
struct journal_entry_pin_list {
63-
struct list_head list[JOURNAL_PIN_NR];
64-
struct list_head flushed;
63+
struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
64+
struct list_head flushed[JOURNAL_PIN_TYPE_NR];
6565
atomic_t count;
6666
struct bch_devs_list devs;
6767
};
@@ -226,6 +226,7 @@ struct journal {
226226
/* Used when waiting because the journal was full */
227227
wait_queue_head_t wait;
228228
struct closure_waitlist async_wait;
229+
struct closure_waitlist reclaim_flush_wait;
229230

230231
struct delayed_work write_work;
231232
struct workqueue_struct *wq;

0 commit comments

Comments
 (0)