Skip to content

Commit cecf5d8

Browse files
Ming Lei authored and axboe committed
block: split .sysfs_lock into two locks
The kernfs built-in lock of 'kn->count' is held in the sysfs .show/.store path. Meanwhile, inside block's .show/.store callbacks, q->sysfs_lock is required. However, when the mq & iosched kobjects are removed via blk_mq_unregister_dev() & elv_unregister_queue(), q->sysfs_lock is held too. This causes an AB-BA deadlock because the kernfs built-in lock of 'kn->count' is required inside kobject_del() too; see the lockdep warning[1]. On the other hand, it isn't necessary to acquire q->sysfs_lock for either blk_mq_unregister_dev() or elv_unregister_queue(), because clearing the REGISTERED flag prevents a store to 'queue/scheduler' from happening. Also, sysfs write (store) is exclusive, so it is not necessary to hold the lock for elv_unregister_queue() when it is called in the elevator switching path. So split .sysfs_lock into two: one is still named .sysfs_lock and covers synchronizing .store; the other is named .sysfs_dir_lock and covers kobjects and related status changes. sysfs itself can handle the race between adding/removing kobjects and showing/storing attributes under those kobjects. For switching the scheduler via a store to 'queue/scheduler', we use the QUEUE_FLAG_REGISTERED queue flag together with .sysfs_lock to avoid the race, so we can avoid holding .sysfs_lock while removing/adding kobjects. [1] lockdep warning ====================================================== WARNING: possible circular locking dependency detected 5.3.0-rc3-00044-g73277fc75ea0 #1380 Not tainted ------------------------------------------------------ rmmod/777 is trying to acquire lock: 00000000ac50e981 (kn->count#202){++++}, at: kernfs_remove_by_name_ns+0x59/0x72 but task is already holding lock: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #1 (&q->sysfs_lock){+.+.}: __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __mutex_lock+0x14a/0xa9b blk_mq_hw_sysfs_show+0x63/0xb6 sysfs_kf_seq_show+0x11f/0x196 seq_read+0x2cd/0x5f2 vfs_read+0xc7/0x18c ksys_read+0xc4/0x13e do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (kn->count#202){++++}: check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 __kernfs_remove+0x237/0x40b kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->sysfs_lock); lock(kn->count#202); lock(&q->sysfs_lock); lock(kn->count#202); *** DEADLOCK *** 2 locks held by rmmod/777: #0: 00000000e69bd9de (&lock){+.+.}, at: null_exit+0x2e/0x95 [null_blk] #1: 00000000fb16ae21 (&q->sysfs_lock){+.+.}, at: blk_unregister_queue+0x78/0x10b stack backtrace: CPU: 0 PID: 777 Comm: rmmod Not tainted 5.3.0-rc3-00044-g73277fc75ea0 #1380 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS ?-20180724_192412-buildhw-07.phx4 Call Trace: dump_stack+0x9a/0xe6 check_noncircular+0x207/0x251 ? print_circular_bug+0x32a/0x32a ? find_usage_backwards+0x84/0xb0 check_prev_add+0x5d2/0xc45 validate_chain+0xed3/0xf94 ? check_prev_add+0xc45/0xc45 ? mark_lock+0x11b/0x804 ? check_usage_forwards+0x1ca/0x1ca __lock_acquire+0x95f/0xa2f lock_acquire+0x1b4/0x1e8 ? kernfs_remove_by_name_ns+0x59/0x72 __kernfs_remove+0x237/0x40b ? kernfs_remove_by_name_ns+0x59/0x72 ? kernfs_next_descendant_post+0x7d/0x7d ? strlen+0x10/0x23 ? 
strcmp+0x22/0x44 kernfs_remove_by_name_ns+0x59/0x72 remove_files+0x61/0x96 sysfs_remove_group+0x81/0xa4 sysfs_remove_groups+0x3b/0x44 kobject_del+0x44/0x94 blk_mq_unregister_dev+0x83/0xdd blk_unregister_queue+0xa0/0x10b del_gendisk+0x259/0x3fa ? disk_events_poll_msecs_store+0x12b/0x12b ? check_flags+0x1ea/0x204 ? mark_held_locks+0x1f/0x7a null_del_dev+0x8b/0x1c3 [null_blk] null_exit+0x5c/0x95 [null_blk] __se_sys_delete_module+0x204/0x337 ? free_module+0x39f/0x39f ? blkcg_maybe_throttle_current+0x8a/0x718 ? rwlock_bug+0x62/0x62 ? __blkcg_punt_bio_submit+0xd0/0xd0 ? trace_hardirqs_on_thunk+0x1a/0x20 ? mark_held_locks+0x1f/0x7a ? do_syscall_64+0x4c/0x295 do_syscall_64+0xa7/0x295 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fb696cdbe6b Code: 73 01 c3 48 8b 0d 1d 20 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 008 RSP: 002b:00007ffec9588788 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 0000559e589137c0 RCX: 00007fb696cdbe6b RDX: 000000000000000a RSI: 0000000000000800 RDI: 0000559e58913828 RBP: 0000000000000000 R08: 00007ffec9587701 R09: 0000000000000000 R10: 00007fb696d4eae0 R11: 0000000000000206 R12: 00007ffec95889b0 R13: 00007ffec95896b3 R14: 0000559e58913260 R15: 0000559e589137c0 Cc: Christoph Hellwig <[email protected]> Cc: Hannes Reinecke <[email protected]> Cc: Greg KH <[email protected]> Cc: Mike Snitzer <[email protected]> Reviewed-by: Bart Van Assche <[email protected]> Signed-off-by: Ming Lei <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent 58c898b commit cecf5d8

File tree

6 files changed

+84
-33
lines changed

6 files changed

+84
-33
lines changed

block/blk-core.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -520,6 +520,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
520520
mutex_init(&q->blk_trace_mutex);
521521
#endif
522522
mutex_init(&q->sysfs_lock);
523+
mutex_init(&q->sysfs_dir_lock);
523524
spin_lock_init(&q->queue_lock);
524525

525526
init_waitqueue_head(&q->mq_freeze_wq);

block/blk-mq-sysfs.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -270,7 +270,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
270270
struct blk_mq_hw_ctx *hctx;
271271
int i;
272272

273-
lockdep_assert_held(&q->sysfs_lock);
273+
lockdep_assert_held(&q->sysfs_dir_lock);
274274

275275
queue_for_each_hw_ctx(q, hctx, i)
276276
blk_mq_unregister_hctx(hctx);
@@ -320,7 +320,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
320320
int ret, i;
321321

322322
WARN_ON_ONCE(!q->kobj.parent);
323-
lockdep_assert_held(&q->sysfs_lock);
323+
lockdep_assert_held(&q->sysfs_dir_lock);
324324

325325
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
326326
if (ret < 0)
@@ -354,23 +354,23 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
354354
struct blk_mq_hw_ctx *hctx;
355355
int i;
356356

357-
mutex_lock(&q->sysfs_lock);
357+
mutex_lock(&q->sysfs_dir_lock);
358358
if (!q->mq_sysfs_init_done)
359359
goto unlock;
360360

361361
queue_for_each_hw_ctx(q, hctx, i)
362362
blk_mq_unregister_hctx(hctx);
363363

364364
unlock:
365-
mutex_unlock(&q->sysfs_lock);
365+
mutex_unlock(&q->sysfs_dir_lock);
366366
}
367367

368368
int blk_mq_sysfs_register(struct request_queue *q)
369369
{
370370
struct blk_mq_hw_ctx *hctx;
371371
int i, ret = 0;
372372

373-
mutex_lock(&q->sysfs_lock);
373+
mutex_lock(&q->sysfs_dir_lock);
374374
if (!q->mq_sysfs_init_done)
375375
goto unlock;
376376

@@ -381,7 +381,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
381381
}
382382

383383
unlock:
384-
mutex_unlock(&q->sysfs_lock);
384+
mutex_unlock(&q->sysfs_dir_lock);
385385

386386
return ret;
387387
}

block/blk-sysfs.c

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -938,14 +938,14 @@ int blk_register_queue(struct gendisk *disk)
938938
int ret;
939939
struct device *dev = disk_to_dev(disk);
940940
struct request_queue *q = disk->queue;
941+
bool has_elevator = false;
941942

942943
if (WARN_ON(!q))
943944
return -ENXIO;
944945

945946
WARN_ONCE(blk_queue_registered(q),
946947
"%s is registering an already registered queue\n",
947948
kobject_name(&dev->kobj));
948-
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
949949

950950
/*
951951
* SCSI probing may synchronously create and destroy a lot of
@@ -965,8 +965,7 @@ int blk_register_queue(struct gendisk *disk)
965965
if (ret)
966966
return ret;
967967

968-
/* Prevent changes through sysfs until registration is completed. */
969-
mutex_lock(&q->sysfs_lock);
968+
mutex_lock(&q->sysfs_dir_lock);
970969

971970
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
972971
if (ret < 0) {
@@ -987,26 +986,36 @@ int blk_register_queue(struct gendisk *disk)
987986
blk_mq_debugfs_register(q);
988987
}
989988

990-
kobject_uevent(&q->kobj, KOBJ_ADD);
991-
992-
wbt_enable_default(q);
993-
994-
blk_throtl_register_queue(q);
995-
989+
/*
990+
* The flag of QUEUE_FLAG_REGISTERED isn't set yet, so elevator
991+
* switch won't happen at all.
992+
*/
996993
if (q->elevator) {
997-
ret = elv_register_queue(q);
994+
ret = elv_register_queue(q, false);
998995
if (ret) {
999-
mutex_unlock(&q->sysfs_lock);
1000-
kobject_uevent(&q->kobj, KOBJ_REMOVE);
996+
mutex_unlock(&q->sysfs_dir_lock);
1001997
kobject_del(&q->kobj);
1002998
blk_trace_remove_sysfs(dev);
1003999
kobject_put(&dev->kobj);
10041000
return ret;
10051001
}
1002+
has_elevator = true;
10061003
}
1004+
1005+
mutex_lock(&q->sysfs_lock);
1006+
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
1007+
wbt_enable_default(q);
1008+
blk_throtl_register_queue(q);
1009+
1010+
/* Now everything is ready and send out KOBJ_ADD uevent */
1011+
kobject_uevent(&q->kobj, KOBJ_ADD);
1012+
if (has_elevator)
1013+
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
1014+
mutex_unlock(&q->sysfs_lock);
1015+
10071016
ret = 0;
10081017
unlock:
1009-
mutex_unlock(&q->sysfs_lock);
1018+
mutex_unlock(&q->sysfs_dir_lock);
10101019
return ret;
10111020
}
10121021
EXPORT_SYMBOL_GPL(blk_register_queue);
@@ -1021,6 +1030,7 @@ EXPORT_SYMBOL_GPL(blk_register_queue);
10211030
void blk_unregister_queue(struct gendisk *disk)
10221031
{
10231032
struct request_queue *q = disk->queue;
1033+
bool has_elevator;
10241034

10251035
if (WARN_ON(!q))
10261036
return;
@@ -1035,25 +1045,25 @@ void blk_unregister_queue(struct gendisk *disk)
10351045
* concurrent elv_iosched_store() calls.
10361046
*/
10371047
mutex_lock(&q->sysfs_lock);
1038-
10391048
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
1049+
has_elevator = !!q->elevator;
1050+
mutex_unlock(&q->sysfs_lock);
10401051

1052+
mutex_lock(&q->sysfs_dir_lock);
10411053
/*
10421054
* Remove the sysfs attributes before unregistering the queue data
10431055
* structures that can be modified through sysfs.
10441056
*/
10451057
if (queue_is_mq(q))
10461058
blk_mq_unregister_dev(disk_to_dev(disk), q);
1047-
mutex_unlock(&q->sysfs_lock);
10481059

10491060
kobject_uevent(&q->kobj, KOBJ_REMOVE);
10501061
kobject_del(&q->kobj);
10511062
blk_trace_remove_sysfs(disk_to_dev(disk));
10521063

1053-
mutex_lock(&q->sysfs_lock);
1054-
if (q->elevator)
1064+
if (has_elevator)
10551065
elv_unregister_queue(q);
1056-
mutex_unlock(&q->sysfs_lock);
1066+
mutex_unlock(&q->sysfs_dir_lock);
10571067

10581068
kobject_put(&disk_to_dev(disk)->kobj);
10591069
}

block/blk.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ int elevator_init_mq(struct request_queue *q);
188188
int elevator_switch_mq(struct request_queue *q,
189189
struct elevator_type *new_e);
190190
void __elevator_exit(struct request_queue *, struct elevator_queue *);
191-
int elv_register_queue(struct request_queue *q);
191+
int elv_register_queue(struct request_queue *q, bool uevent);
192192
void elv_unregister_queue(struct request_queue *q);
193193

194194
static inline void elevator_exit(struct request_queue *q,

block/elevator.c

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -470,13 +470,16 @@ static struct kobj_type elv_ktype = {
470470
.release = elevator_release,
471471
};
472472

473-
int elv_register_queue(struct request_queue *q)
473+
/*
474+
* elv_register_queue is called from either blk_register_queue or
475+
* elevator_switch, elevator switch is prevented from happening
476+
* in the two paths, so it is safe to not hold q->sysfs_lock.
477+
*/
478+
int elv_register_queue(struct request_queue *q, bool uevent)
474479
{
475480
struct elevator_queue *e = q->elevator;
476481
int error;
477482

478-
lockdep_assert_held(&q->sysfs_lock);
479-
480483
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
481484
if (!error) {
482485
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -487,24 +490,34 @@ int elv_register_queue(struct request_queue *q)
487490
attr++;
488491
}
489492
}
490-
kobject_uevent(&e->kobj, KOBJ_ADD);
493+
if (uevent)
494+
kobject_uevent(&e->kobj, KOBJ_ADD);
495+
496+
mutex_lock(&q->sysfs_lock);
491497
e->registered = 1;
498+
mutex_unlock(&q->sysfs_lock);
492499
}
493500
return error;
494501
}
495502

503+
/*
504+
* elv_unregister_queue is called from either blk_unregister_queue or
505+
* elevator_switch, elevator switch is prevented from happening
506+
* in the two paths, so it is safe to not hold q->sysfs_lock.
507+
*/
496508
void elv_unregister_queue(struct request_queue *q)
497509
{
498-
lockdep_assert_held(&q->sysfs_lock);
499-
500510
if (q) {
501511
struct elevator_queue *e = q->elevator;
502512

503513
kobject_uevent(&e->kobj, KOBJ_REMOVE);
504514
kobject_del(&e->kobj);
515+
516+
mutex_lock(&q->sysfs_lock);
505517
e->registered = 0;
506518
/* Re-enable throttling in case elevator disabled it */
507519
wbt_enable_default(q);
520+
mutex_unlock(&q->sysfs_lock);
508521
}
509522
}
510523

@@ -567,18 +580,44 @@ int elevator_switch_mq(struct request_queue *q,
567580
lockdep_assert_held(&q->sysfs_lock);
568581

569582
if (q->elevator) {
570-
if (q->elevator->registered)
583+
if (q->elevator->registered) {
584+
mutex_unlock(&q->sysfs_lock);
585+
586+
/*
587+
* Concurrent elevator switch can't happen because
588+
* sysfs write is always exclusively on same file.
589+
*
590+
* Also the elevator queue won't be freed after
591+
* sysfs_lock is released because kobject_del() in
592+
* blk_unregister_queue() waits for completion of
593+
* .store & .show on its attributes.
594+
*/
571595
elv_unregister_queue(q);
596+
597+
mutex_lock(&q->sysfs_lock);
598+
}
572599
ioc_clear_queue(q);
573600
elevator_exit(q, q->elevator);
601+
602+
/*
603+
* sysfs_lock may be dropped, so re-check if queue is
604+
* unregistered. If yes, don't switch to new elevator
605+
* any more
606+
*/
607+
if (!blk_queue_registered(q))
608+
return 0;
574609
}
575610

576611
ret = blk_mq_init_sched(q, new_e);
577612
if (ret)
578613
goto out;
579614

580615
if (new_e) {
581-
ret = elv_register_queue(q);
616+
mutex_unlock(&q->sysfs_lock);
617+
618+
ret = elv_register_queue(q, true);
619+
620+
mutex_lock(&q->sysfs_lock);
582621
if (ret) {
583622
elevator_exit(q, q->elevator);
584623
goto out;

include/linux/blkdev.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,7 @@ struct request_queue {
535535
struct delayed_work requeue_work;
536536

537537
struct mutex sysfs_lock;
538+
struct mutex sysfs_dir_lock;
538539

539540
/*
540541
* for reusing dead hctx instance in case of updating

0 commit comments

Comments
 (0)