|
| 1 | +block: Fix lockdep warning in blk_mq_mark_tag_wait |
| 2 | + |
| 3 | +jira LE-4034 |
| 4 | +Rebuild_History Non-Buildable kernel-4.18.0-553.72.1.el8_10 |
| 5 | +commit-author Li Lingfeng < [email protected]> |
| 6 | +commit b313a8c835516bdda85025500be866ac8a74e022 |
| 7 | +Empty-Commit: Cherry-Pick Conflicts during history rebuild. |
| 8 | +Will be included in final tarball splat. Ref for failed cherry-pick at: |
| 9 | +ciq/ciq_backports/kernel-4.18.0-553.72.1.el8_10/b313a8c8.failed |
| 10 | + |
| 11 | +Lockdep reported a warning in Linux version 6.6: |
| 12 | + |
| 13 | +[ 414.344659] ================================ |
| 14 | +[ 414.345155] WARNING: inconsistent lock state |
| 15 | +[ 414.345658] 6.6.0-07439-gba2303cacfda #6 Not tainted |
| 16 | +[ 414.346221] -------------------------------- |
| 17 | +[ 414.346712] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. |
| 18 | +[ 414.347545] kworker/u10:3/1152 [HC0[0]:SC0[0]:HE0:SE1] takes: |
| 19 | +[ 414.349245] ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 |
| 20 | +[ 414.351204] {IN-SOFTIRQ-W} state was registered at: |
| 21 | +[ 414.351751] lock_acquire+0x18d/0x460 |
| 22 | +[ 414.352218] _raw_spin_lock_irqsave+0x39/0x60 |
| 23 | +[ 414.352769] __wake_up_common_lock+0x22/0x60 |
| 24 | +[ 414.353289] sbitmap_queue_wake_up+0x375/0x4f0 |
| 25 | +[ 414.353829] sbitmap_queue_clear+0xdd/0x270 |
| 26 | +[ 414.354338] blk_mq_put_tag+0xdf/0x170 |
| 27 | +[ 414.354807] __blk_mq_free_request+0x381/0x4d0 |
| 28 | +[ 414.355335] blk_mq_free_request+0x28b/0x3e0 |
| 29 | +[ 414.355847] __blk_mq_end_request+0x242/0xc30 |
| 30 | +[ 414.356367] scsi_end_request+0x2c1/0x830 |
| 31 | +[ 414.345155] WARNING: inconsistent lock state |
| 32 | +[ 414.345658] 6.6.0-07439-gba2303cacfda #6 Not tainted |
| 33 | +[ 414.346221] -------------------------------- |
| 34 | +[ 414.346712] inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. |
| 35 | +[ 414.347545] kworker/u10:3/1152 [HC0[0]:SC0[0]:HE0:SE1] takes: |
| 36 | +[ 414.349245] ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 |
| 37 | +[ 414.351204] {IN-SOFTIRQ-W} state was registered at: |
| 38 | +[ 414.351751] lock_acquire+0x18d/0x460 |
| 39 | +[ 414.352218] _raw_spin_lock_irqsave+0x39/0x60 |
| 40 | +[ 414.352769] __wake_up_common_lock+0x22/0x60 |
| 41 | +[ 414.353289] sbitmap_queue_wake_up+0x375/0x4f0 |
| 42 | +[ 414.353829] sbitmap_queue_clear+0xdd/0x270 |
| 43 | +[ 414.354338] blk_mq_put_tag+0xdf/0x170 |
| 44 | +[ 414.354807] __blk_mq_free_request+0x381/0x4d0 |
| 45 | +[ 414.355335] blk_mq_free_request+0x28b/0x3e0 |
| 46 | +[ 414.355847] __blk_mq_end_request+0x242/0xc30 |
| 47 | +[ 414.356367] scsi_end_request+0x2c1/0x830 |
| 48 | +[ 414.356863] scsi_io_completion+0x177/0x1610 |
| 49 | +[ 414.357379] scsi_complete+0x12f/0x260 |
| 50 | +[ 414.357856] blk_complete_reqs+0xba/0xf0 |
| 51 | +[ 414.358338] __do_softirq+0x1b0/0x7a2 |
| 52 | +[ 414.358796] irq_exit_rcu+0x14b/0x1a0 |
| 53 | +[ 414.359262] sysvec_call_function_single+0xaf/0xc0 |
| 54 | +[ 414.359828] asm_sysvec_call_function_single+0x1a/0x20 |
| 55 | +[ 414.360426] default_idle+0x1e/0x30 |
| 56 | +[ 414.360873] default_idle_call+0x9b/0x1f0 |
| 57 | +[ 414.361390] do_idle+0x2d2/0x3e0 |
| 58 | +[ 414.361819] cpu_startup_entry+0x55/0x60 |
| 59 | +[ 414.362314] start_secondary+0x235/0x2b0 |
| 60 | +[ 414.362809] secondary_startup_64_no_verify+0x18f/0x19b |
| 61 | +[ 414.363413] irq event stamp: 428794 |
| 62 | +[ 414.363825] hardirqs last enabled at (428793): [<ffffffff816bfd1c>] ktime_get+0x1dc/0x200 |
| 63 | +[ 414.364694] hardirqs last disabled at (428794): [<ffffffff85470177>] _raw_spin_lock_irq+0x47/0x50 |
| 64 | +[ 414.365629] softirqs last enabled at (428444): [<ffffffff85474780>] __do_softirq+0x540/0x7a2 |
| 65 | +[ 414.366522] softirqs last disabled at (428419): [<ffffffff813f65ab>] irq_exit_rcu+0x14b/0x1a0 |
| 66 | +[ 414.367425] |
| 67 | + other info that might help us debug this: |
| 68 | +[ 414.368194] Possible unsafe locking scenario: |
| 69 | +[ 414.368900] CPU0 |
| 70 | +[ 414.369225] ---- |
| 71 | +[ 414.369548] lock(&sbq->ws[i].wait); |
| 72 | +[ 414.370000] <Interrupt> |
| 73 | +[ 414.370342] lock(&sbq->ws[i].wait); |
| 74 | +[ 414.370802] |
| 75 | + *** DEADLOCK *** |
| 76 | +[ 414.371569] 5 locks held by kworker/u10:3/1152: |
| 77 | +[ 414.372088] #0: ffff88810130e938 ((wq_completion)writeback){+.+.}-{0:0}, at: process_scheduled_works+0x357/0x13f0 |
| 78 | +[ 414.373180] #1: ffff88810201fdb8 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: process_scheduled_works+0x3a3/0x13f0 |
| 79 | +[ 414.374384] #2: ffffffff86ffbdc0 (rcu_read_lock){....}-{1:2}, at: blk_mq_run_hw_queue+0x637/0xa00 |
| 80 | +[ 414.375342] #3: ffff88810edd1098 (&sbq->ws[i].wait){+.?.}-{2:2}, at: blk_mq_dispatch_rq_list+0x131c/0x1ee0 |
| 81 | +[ 414.376377] #4: ffff888106205a08 (&hctx->dispatch_wait_lock){+.-.}-{2:2}, at: blk_mq_dispatch_rq_list+0x1337/0x1ee0 |
| 82 | +[ 414.378607] |
| 83 | + stack backtrace: |
| 84 | +[ 414.379177] CPU: 0 PID: 1152 Comm: kworker/u10:3 Not tainted 6.6.0-07439-gba2303cacfda #6 |
| 85 | +[ 414.380032] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 |
| 86 | +[ 414.381177] Workqueue: writeback wb_workfn (flush-253:0) |
| 87 | +[ 414.381805] Call Trace: |
| 88 | +[ 414.382136] <TASK> |
| 89 | +[ 414.382429] dump_stack_lvl+0x91/0xf0 |
| 90 | +[ 414.382884] mark_lock_irq+0xb3b/0x1260 |
| 91 | +[ 414.383367] ? __pfx_mark_lock_irq+0x10/0x10 |
| 92 | +[ 414.383889] ? stack_trace_save+0x8e/0xc0 |
| 93 | +[ 414.384373] ? __pfx_stack_trace_save+0x10/0x10 |
| 94 | +[ 414.384903] ? graph_lock+0xcf/0x410 |
| 95 | +[ 414.385350] ? save_trace+0x3d/0xc70 |
| 96 | +[ 414.385808] mark_lock.part.20+0x56d/0xa90 |
| 97 | +[ 414.386317] mark_held_locks+0xb0/0x110 |
| 98 | +[ 414.386791] ? __pfx_do_raw_spin_lock+0x10/0x10 |
| 99 | +[ 414.387320] lockdep_hardirqs_on_prepare+0x297/0x3f0 |
| 100 | +[ 414.387901] ? _raw_spin_unlock_irq+0x28/0x50 |
| 101 | +[ 414.388422] trace_hardirqs_on+0x58/0x100 |
| 102 | +[ 414.388917] _raw_spin_unlock_irq+0x28/0x50 |
| 103 | +[ 414.389422] __blk_mq_tag_busy+0x1d6/0x2a0 |
| 104 | +[ 414.389920] __blk_mq_get_driver_tag+0x761/0x9f0 |
| 105 | +[ 414.390899] blk_mq_dispatch_rq_list+0x1780/0x1ee0 |
| 106 | +[ 414.391473] ? __pfx_blk_mq_dispatch_rq_list+0x10/0x10 |
| 107 | +[ 414.392070] ? sbitmap_get+0x2b8/0x450 |
| 108 | +[ 414.392533] ? __blk_mq_get_driver_tag+0x210/0x9f0 |
| 109 | +[ 414.393095] __blk_mq_sched_dispatch_requests+0xd99/0x1690 |
| 110 | +[ 414.393730] ? elv_attempt_insert_merge+0x1b1/0x420 |
| 111 | +[ 414.394302] ? __pfx___blk_mq_sched_dispatch_requests+0x10/0x10 |
| 112 | +[ 414.394970] ? lock_acquire+0x18d/0x460 |
| 113 | +[ 414.395456] ? blk_mq_run_hw_queue+0x637/0xa00 |
| 114 | +[ 414.395986] ? __pfx_lock_acquire+0x10/0x10 |
| 115 | +[ 414.396499] blk_mq_sched_dispatch_requests+0x109/0x190 |
| 116 | +[ 414.397100] blk_mq_run_hw_queue+0x66e/0xa00 |
| 117 | +[ 414.397616] blk_mq_flush_plug_list.part.17+0x614/0x2030 |
| 118 | +[ 414.398244] ? __pfx_blk_mq_flush_plug_list.part.17+0x10/0x10 |
| 119 | +[ 414.398897] ? writeback_sb_inodes+0x241/0xcc0 |
| 120 | +[ 414.399429] blk_mq_flush_plug_list+0x65/0x80 |
| 121 | +[ 414.399957] __blk_flush_plug+0x2f1/0x530 |
| 122 | +[ 414.400458] ? __pfx___blk_flush_plug+0x10/0x10 |
| 123 | +[ 414.400999] blk_finish_plug+0x59/0xa0 |
| 124 | +[ 414.401467] wb_writeback+0x7cc/0x920 |
| 125 | +[ 414.401935] ? __pfx_wb_writeback+0x10/0x10 |
| 126 | +[ 414.402442] ? mark_held_locks+0xb0/0x110 |
| 127 | +[ 414.402931] ? __pfx_do_raw_spin_lock+0x10/0x10 |
| 128 | +[ 414.403462] ? lockdep_hardirqs_on_prepare+0x297/0x3f0 |
| 129 | +[ 414.404062] wb_workfn+0x2b3/0xcf0 |
| 130 | +[ 414.404500] ? __pfx_wb_workfn+0x10/0x10 |
| 131 | +[ 414.404989] process_scheduled_works+0x432/0x13f0 |
| 132 | +[ 414.405546] ? __pfx_process_scheduled_works+0x10/0x10 |
| 133 | +[ 414.406139] ? do_raw_spin_lock+0x101/0x2a0 |
| 134 | +[ 414.406641] ? assign_work+0x19b/0x240 |
| 135 | +[ 414.407106] ? lock_is_held_type+0x9d/0x110 |
| 136 | +[ 414.407604] worker_thread+0x6f2/0x1160 |
| 137 | +[ 414.408075] ? __kthread_parkme+0x62/0x210 |
| 138 | +[ 414.408572] ? lockdep_hardirqs_on_prepare+0x297/0x3f0 |
| 139 | +[ 414.409168] ? __kthread_parkme+0x13c/0x210 |
| 140 | +[ 414.409678] ? __pfx_worker_thread+0x10/0x10 |
| 141 | +[ 414.410191] kthread+0x33c/0x440 |
| 142 | +[ 414.410602] ? __pfx_kthread+0x10/0x10 |
| 143 | +[ 414.411068] ret_from_fork+0x4d/0x80 |
| 144 | +[ 414.411526] ? __pfx_kthread+0x10/0x10 |
| 145 | +[ 414.411993] ret_from_fork_asm+0x1b/0x30 |
| 146 | +[ 414.412489] </TASK> |
| 147 | + |
| 148 | +When interrupt is turned on while a lock holding by spin_lock_irq it |
| 149 | +throws a warning because of potential deadlock. |
| 150 | + |
| 151 | +blk_mq_prep_dispatch_rq |
| 152 | + blk_mq_get_driver_tag |
| 153 | + __blk_mq_get_driver_tag |
| 154 | + __blk_mq_alloc_driver_tag |
| 155 | + blk_mq_tag_busy -> tag is already busy |
| 156 | + // failed to get driver tag |
| 157 | + blk_mq_mark_tag_wait |
| 158 | + spin_lock_irq(&wq->lock) -> lock A (&sbq->ws[i].wait) |
| 159 | + __add_wait_queue(wq, wait) -> wait queue active |
| 160 | + blk_mq_get_driver_tag |
| 161 | + __blk_mq_tag_busy |
| 162 | +-> 1) tag must be idle, which means there can't be inflight IO |
| 163 | + spin_lock_irq(&tags->lock) -> lock B (hctx->tags) |
| 164 | + spin_unlock_irq(&tags->lock) -> unlock B, turn on interrupt accidentally |
| 165 | +-> 2) context must be preempt by IO interrupt to trigger deadlock. |
| 166 | + |
| 167 | +As shown above, the deadlock is not possible in theory, but the warning |
| 168 | +still need to be fixed. |
| 169 | + |
| 170 | +Fix it by using spin_lock_irqsave to get lockB instead of spin_lock_irq. |
| 171 | + |
| 172 | +Fixes: 4f1731df60f9 ("blk-mq: fix potential io hang by wrong 'wake_batch'") |
| 173 | + Signed-off-by: Li Lingfeng < [email protected]> |
| 174 | + Reviewed-by: Ming Lei < [email protected]> |
| 175 | + Reviewed-by: Yu Kuai < [email protected]> |
| 176 | + Reviewed-by: Bart Van Assche < [email protected]> |
| 177 | +Link: https://lore.kernel.org/r/ [email protected] |
| 178 | + Signed-off-by: Jens Axboe < [email protected]> |
| 179 | +(cherry picked from commit b313a8c835516bdda85025500be866ac8a74e022) |
| 180 | + Signed-off-by: Jonathan Maple < [email protected]> |
| 181 | + |
| 182 | +# Conflicts: |
| 183 | +# block/blk-mq-tag.c |
| 184 | +diff --cc block/blk-mq-tag.c |
| 185 | +index 21f98a7c6b8a,2cafcf11ee8b..000000000000 |
| 186 | +--- a/block/blk-mq-tag.c |
| 187 | ++++ b/block/blk-mq-tag.c |
| 188 | +@@@ -21,22 -35,33 +21,42 @@@ |
| 189 | + * to get tag when first time, the other shared-tag users could reserve |
| 190 | + * budget for it. |
| 191 | + */ |
| 192 | + -void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) |
| 193 | + +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) |
| 194 | + { |
| 195 | +++<<<<<<< HEAD |
| 196 | + + if (blk_mq_is_sbitmap_shared(hctx->flags)) { |
| 197 | +++======= |
| 198 | ++ unsigned int users; |
| 199 | ++ unsigned long flags; |
| 200 | ++ struct blk_mq_tags *tags = hctx->tags; |
| 201 | ++ |
| 202 | ++ /* |
| 203 | ++ * calling test_bit() prior to test_and_set_bit() is intentional, |
| 204 | ++ * it avoids dirtying the cacheline if the queue is already active. |
| 205 | ++ */ |
| 206 | ++ if (blk_mq_is_shared_tags(hctx->flags)) { |
| 207 | +++>>>>>>> b313a8c83551 (block: Fix lockdep warning in blk_mq_mark_tag_wait) |
| 208 | + struct request_queue *q = hctx->queue; |
| 209 | + + struct blk_mq_tag_set *set = q->tag_set; |
| 210 | + |
| 211 | + - if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || |
| 212 | + - test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) |
| 213 | + - return; |
| 214 | + + if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && |
| 215 | + + !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) |
| 216 | + + atomic_inc(&set->active_queues_shared_sbitmap); |
| 217 | + } else { |
| 218 | + - if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || |
| 219 | + - test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) |
| 220 | + - return; |
| 221 | + + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && |
| 222 | + + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) |
| 223 | + + atomic_inc(&hctx->tags->active_queues); |
| 224 | + } |
| 225 | + |
| 226 | +++<<<<<<< HEAD |
| 227 | + + return true; |
| 228 | +++======= |
| 229 | ++ spin_lock_irqsave(&tags->lock, flags); |
| 230 | ++ users = tags->active_queues + 1; |
| 231 | ++ WRITE_ONCE(tags->active_queues, users); |
| 232 | ++ blk_mq_update_wake_batch(tags, users); |
| 233 | ++ spin_unlock_irqrestore(&tags->lock, flags); |
| 234 | +++>>>>>>> b313a8c83551 (block: Fix lockdep warning in blk_mq_mark_tag_wait) |
| 235 | + } |
| 236 | + |
| 237 | + /* |
| 238 | +* Unmerged path block/blk-mq-tag.c |
0 commit comments