Commit 839a8e8

writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread, which forks off worker threads for other bdis.

There's no reason for writeback to implement its own worker pool when using an unbound workqueue instead is much simpler and more efficient. This patch replaces the custom worker pool implementation in writeback with an unbound workqueue.

The conversion isn't too complicated, but the following points are worth mentioning:

* bdi_writeback->last_active, task and wakeup_timer are removed; delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary - everything works by queueing, modding, flushing or canceling the delayed_work item.

* bdi_writeback_thread() becomes bdi_writeback_workfn(), which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there is more to do. The function also handles the low-memory condition, which used to be handled by the forker thread: if the function is running off a rescuer thread, it writes out only a limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher-creation-failure behavior of the forker thread.

* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about an on-going bdi unregistration, so that it always drains work_list even when running off the rescuer. Note that the original code was broken in this regard: under memory pressure, a bdi could finish unregistration with a non-empty work_list.

* The default bdi is no longer special. It is now treated the same as any other bdi, and bdi_cap_flush_forker() is removed.

* BDI_pending is no longer used and is removed.

* Some tracepoints become non-applicable. The following TPs are removed: writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start and writeback_thread_stop.

Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup.

Signed-off-by: Tejun Heo <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Fengguang Wu <[email protected]>
Cc: Jeff Moyer <[email protected]>
1 parent 181387d commit 839a8e8
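
Editor's note: to make the shape of the conversion concrete, here is a minimal, self-contained sketch (not part of the patch) of the pattern the commit adopts: an unbound workqueue with a rescuer, a delayed_work embedded in a state structure, mod_delayed_work() for immediate kicks, and a work function that recovers its container and reschedules itself. All demo_* names are hypothetical; only the workqueue API calls are real.

/*
 * Sketch only -- not part of the commit.  demo_wq stands in for bdi_wq
 * and demo_state::dwork for bdi_writeback::dwork.
 */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static struct workqueue_struct *demo_wq;

struct demo_state {
	struct delayed_work dwork;	/* replaces a kthread + wakeup timer */
};

static struct demo_state demo;

static void demo_workfn(struct work_struct *work)
{
	struct demo_state *s = container_of(to_delayed_work(work),
					    struct demo_state, dwork);

	/* ... one pass of actual work on @s would go here ... */

	/* Reschedule ourselves, as bdi_writeback_workfn() does. */
	queue_delayed_work(demo_wq, &s->dwork, msecs_to_jiffies(5000));
}

static int __init demo_init(void)
{
	/* WQ_UNBOUND: not bound to a CPU; WQ_MEM_RECLAIM: has a rescuer. */
	demo_wq = alloc_workqueue("demo", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!demo_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&demo.dwork, demo_workfn);

	/* Kick an immediate run, the way bdi_queue_work() does below. */
	mod_delayed_work(demo_wq, &demo.dwork, 0);
	return 0;
}

static void __exit demo_exit(void)
{
	/* Cancellation replaces explicit kthread shutdown. */
	cancel_delayed_work_sync(&demo.dwork);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");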

File tree

4 files changed: +65, -312 lines changed

fs/fs-writeback.c

Lines changed: 32 additions & 70 deletions
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -88,31 +87,16 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-	if (bdi->wb.task) {
-		wake_up_process(bdi->wb.task);
-	} else {
-		/*
-		 * The bdi thread isn't there, wake up the forker thread which
-		 * will create and run it.
-		 */
-		wake_up_process(default_backing_dev_info.wb.task);
-	}
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
 			   struct wb_writeback_work *work)
 {
 	trace_writeback_queue(bdi, work);
 
 	spin_lock_bh(&bdi->wb_lock);
 	list_add_tail(&work->list, &bdi->work_list);
-	if (!bdi->wb.task)
-		trace_writeback_nothread(bdi, work);
-	bdi_wakeup_flusher(bdi);
 	spin_unlock_bh(&bdi->wb_lock);
+
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	 */
 	work = kzalloc(sizeof(*work), GFP_ATOMIC);
 	if (!work) {
-		if (bdi->wb.task) {
-			trace_writeback_nowork(bdi);
-			wake_up_process(bdi->wb.task);
-		}
+		trace_writeback_nowork(bdi);
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 		return;
 	}
 
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 	 * writeback as soon as there is no other work to do.
 	 */
 	trace_writeback_wake_background(bdi);
-	spin_lock_bh(&bdi->wb_lock);
-	bdi_wakeup_flusher(bdi);
-	spin_unlock_bh(&bdi->wb_lock);
+	mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-	struct bdi_writeback *wb = data;
+	struct bdi_writeback *wb = container_of(to_delayed_work(work),
+						struct bdi_writeback, dwork);
 	struct backing_dev_info *bdi = wb->bdi;
 	long pages_written;
 
 	current->flags |= PF_SWAPWRITE;
-	set_freezable();
-	wb->last_active = jiffies;
-
-	/*
-	 * Our parent may run at a different priority, just set us to normal
-	 */
-	set_user_nice(current, 0);
-
-	trace_writeback_thread_start(bdi);
 
-	while (!kthread_freezable_should_stop(NULL)) {
+	if (likely(!current_is_workqueue_rescuer() ||
+		   list_empty(&bdi->bdi_list))) {
 		/*
-		 * Remove own delayed wake-up timer, since we are already awake
-		 * and we'll take care of the periodic write-back.
+		 * The normal path.  Keep writing back @bdi until its
+		 * work_list is empty.  Note that this path is also taken
+		 * if @bdi is shutting down even when we're running off the
+		 * rescuer as work_list needs to be drained.
 		 */
-		del_timer(&wb->wakeup_timer);
-
-		pages_written = wb_do_writeback(wb, 0);
-
+		do {
+			pages_written = wb_do_writeback(wb, 0);
+			trace_writeback_pages_written(pages_written);
+		} while (!list_empty(&bdi->work_list));
+	} else {
+		/*
+		 * bdi_wq can't get enough workers and we're running off
+		 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+		 * enough for efficient IO.
+		 */
+		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
 		trace_writeback_pages_written(pages_written);
-
-		if (pages_written)
-			wb->last_active = jiffies;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-			__set_current_state(TASK_RUNNING);
-			continue;
-		}
-
-		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-		else {
-			/*
-			 * We have nothing to do, so can go sleep without any
-			 * timeout and save power. When a work is queued or
-			 * something is made dirty - we will be woken up.
-			 */
-			schedule();
-		}
 	}
 
-	/* Flush any work that raced with us exiting */
-	if (!list_empty(&bdi->work_list))
-		wb_do_writeback(wb, 1);
+	if (!list_empty(&bdi->work_list) ||
+	    (wb_has_dirty_io(wb) && dirty_writeback_interval))
+		queue_delayed_work(bdi_wq, &wb->dwork,
+				   msecs_to_jiffies(dirty_writeback_interval * 10));
 
-	trace_writeback_thread_stop(bdi);
-	return 0;
+	current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
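
Editor's note: the hunks above rely on two queueing calls with different semantics. mod_delayed_work() (re)arms the work item even if it is already pending, so bdi_queue_work() and the wakeup paths call it with a 0 delay to force an immediate run; queue_delayed_work() at the tail of bdi_writeback_workfn() is a no-op when the item is already pending, which suits the periodic kupdated-style rescheduling. A small sketch extending the hypothetical demo_* example above:

/* Extends the demo_* sketch after the commit message; both helpers are
 * hypothetical illustrations of the two calls used in the diff above. */

/* Run @s's work as soon as possible, even if a delayed run is pending. */
static void demo_kick_now(struct demo_state *s)
{
	mod_delayed_work(demo_wq, &s->dwork, 0);
}

/* Schedule the next periodic pass; does nothing if one is already pending. */
static void demo_kick_later(struct demo_state *s, unsigned int msecs)
{
	queue_delayed_work(demo_wq, &s->dwork, msecs_to_jiffies(msecs));
}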

include/linux/backing-dev.h

Lines changed: 5 additions & 10 deletions
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/workqueue.h>
 
 struct page;
 struct device;
@@ -27,7 +28,6 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-	BDI_pending,		/* On its way to being activated */
 	BDI_wb_alloc,		/* Default embedded wb allocated */
 	BDI_async_congested,	/* The async (write) queue is getting full */
 	BDI_sync_congested,	/* The sync queue is getting full */
@@ -53,10 +53,8 @@ struct bdi_writeback {
 	unsigned int nr;
 
 	unsigned long last_old_flush;	/* last old data flush */
-	unsigned long last_active;	/* last time bdi thread was active */
 
-	struct task_struct *task;	/* writeback thread */
-	struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
+	struct delayed_work dwork;	/* work item used for writeback */
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
@@ -123,14 +121,16 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-int bdi_writeback_thread(void *data);
+void bdi_writeback_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
+extern struct workqueue_struct *bdi_wq;
+
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
 	return !list_empty(&wb->b_dirty) ||
@@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
 	return bdi->capabilities & BDI_CAP_SWAP_BACKED;
 }
 
-static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
-{
-	return bdi == &default_backing_dev_info;
-}
-
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
 	return bdi_cap_writeback_dirty(mapping->backing_dev_info);
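
Editor's note: with struct delayed_work dwork now embedded directly in bdi_writeback, a work function receives only a struct work_struct pointer and has to map it back to the embedding object; bdi_writeback_workfn() in the previous file does this in a single expression. A sketch with the two steps spelled out (example_workfn is hypothetical):

#include <linux/backing-dev.h>
#include <linux/workqueue.h>

static void example_workfn(struct work_struct *work)
{
	/* Step 1: work_struct -> delayed_work.  to_delayed_work() is a
	 * container_of() over struct delayed_work's ->work member. */
	struct delayed_work *dwork = to_delayed_work(work);

	/* Step 2: delayed_work -> bdi_writeback via the ->dwork member
	 * declared in the hunk above. */
	struct bdi_writeback *wb = container_of(dwork,
						struct bdi_writeback, dwork);

	/* ... operate on wb ... */
	(void)wb;
}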

include/trace/events/writeback.h

Lines changed: 0 additions & 5 deletions
@@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 DEFINE_EVENT(writeback_work_class, name, \
 	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
 	TP_ARGS(bdi, work))
-DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
-DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
-DEFINE_WRITEBACK_EVENT(writeback_thread_start);
-DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
 
 DECLARE_EVENT_CLASS(wbc_class,
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),