Skip to content

Commit 726a9b6

Browse files
committed
Merge branch 'md-next-rcu-cleanup' into md-next
From Yu Kuai: md: remove rcu protection to access rdev from conf The lifetime of rdev: 1. md_import_device() generate a rdev based on underlying disk; mddev_lock() rdev = kzalloc(); rdev->bdev = blkdev_get_by_dev(); mddev_unlock() 2. bind_rdev_to_array() add this rdev to mddev->disks; mddev_lock() kobject_add(&rdev->kobj, &mddev->kobj, ...); list_add_rcu(&rdev->same_set, &mddev->disks); mddev_unlock() 3. remove_and_add_spares() add this rdev to conf; mddev_lock() rdev_addable(); pers->hot_add_disk(); rcu_assign_pointer(conf->rdev, rdev); mddev_unlock() 4. Use this array with rdev; 5. remove_and_add_spares() remove rdev from conf; // triggered by sysfs/ioctl mddev_lock() rdev_removeable(); pers->hot_remove_disk(); rcu_assign_pointer(conf->rdev, NULL); synchronize_rcu(); mddev_unlock() // triggered by daemon mddev_lock() rdev_removeable(); synchronize_rcu(); -> this can't protect accessing rdev from conf pers->hot_remove_disk(); rcu_assign_pointer(conf->rdev, NULL); mddev_unlock() 6. md_kick_rdev_from_array() remove rdev from mddev->disks; mddev_lock() list_del_rcu(&rdev->same_set); synchronize_rcu(); list_add(&rdev->same_set, &mddev->deleting) mddev_unlock() export_rdev There are two separate rcu protection for rdev, and this pathset remove the protection of conf(step 3 and 5), because it's safe to access rdev from conf in following cases: - If 'reconfig_mutex' is held, because rdev can't be added or rmoved to conf; - If there is normal IO inflight, because mddev_suspend() will wait for IO to be done and prevent rdev to be added or removed to conf; - If sync thread is running, because remove_and_add_spares() can only be called from daemon thread when sync thread is done, and 'MD_RECOVERY_RUNNING' is also checked for ioctl/sysfs; - if any spinlock or rcu_read_lock() is held, because synchronize_rcu() from step 6 prevent rdev to be freed until spinlock is released or rcu_read_unlock();
2 parents bed9e27 + 7ecab28 commit 726a9b6

File tree

9 files changed

+168
-421
lines changed

9 files changed

+168
-421
lines changed

drivers/md/md-multipath.c

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,15 @@ static int multipath_map (struct mpconf *conf)
3232
* now we use the first available disk.
3333
*/
3434

35-
rcu_read_lock();
3635
for (i = 0; i < disks; i++) {
37-
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
36+
struct md_rdev *rdev = conf->multipaths[i].rdev;
37+
3838
if (rdev && test_bit(In_sync, &rdev->flags) &&
3939
!test_bit(Faulty, &rdev->flags)) {
4040
atomic_inc(&rdev->nr_pending);
41-
rcu_read_unlock();
4241
return i;
4342
}
4443
}
45-
rcu_read_unlock();
4644

4745
pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
4846
return (-1);
@@ -137,14 +135,16 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
137135
struct mpconf *conf = mddev->private;
138136
int i;
139137

138+
lockdep_assert_held(&mddev->lock);
139+
140140
seq_printf (seq, " [%d/%d] [", conf->raid_disks,
141141
conf->raid_disks - mddev->degraded);
142-
rcu_read_lock();
143142
for (i = 0; i < conf->raid_disks; i++) {
144-
struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev);
145-
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
143+
struct md_rdev *rdev = READ_ONCE(conf->multipaths[i].rdev);
144+
145+
seq_printf(seq, "%s",
146+
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
146147
}
147-
rcu_read_unlock();
148148
seq_putc(seq, ']');
149149
}
150150

@@ -182,7 +182,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
182182
conf->raid_disks - mddev->degraded);
183183
}
184184

185-
static void print_multipath_conf (struct mpconf *conf)
185+
static void print_multipath_conf(struct mpconf *conf)
186186
{
187187
int i;
188188
struct multipath_info *tmp;
@@ -195,6 +195,7 @@ static void print_multipath_conf (struct mpconf *conf)
195195
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
196196
conf->raid_disks);
197197

198+
lockdep_assert_held(&conf->mddev->reconfig_mutex);
198199
for (i = 0; i < conf->raid_disks; i++) {
199200
tmp = conf->multipaths + i;
200201
if (tmp->rdev)
@@ -231,7 +232,7 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
231232
rdev->raid_disk = path;
232233
set_bit(In_sync, &rdev->flags);
233234
spin_unlock_irq(&conf->device_lock);
234-
rcu_assign_pointer(p->rdev, rdev);
235+
WRITE_ONCE(p->rdev, rdev);
235236
err = 0;
236237
break;
237238
}
@@ -257,16 +258,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
257258
err = -EBUSY;
258259
goto abort;
259260
}
260-
p->rdev = NULL;
261-
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
262-
synchronize_rcu();
263-
if (atomic_read(&rdev->nr_pending)) {
264-
/* lost the race, try later */
265-
err = -EBUSY;
266-
p->rdev = rdev;
267-
goto abort;
268-
}
269-
}
261+
WRITE_ONCE(p->rdev, NULL);
270262
err = md_integrity_register(mddev);
271263
}
272264
abort:

drivers/md/md.c

Lines changed: 6 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9244,44 +9244,19 @@ static int remove_and_add_spares(struct mddev *mddev,
92449244
struct md_rdev *rdev;
92459245
int spares = 0;
92469246
int removed = 0;
9247-
bool remove_some = false;
92489247

92499248
if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
92509249
/* Mustn't remove devices when resync thread is running */
92519250
return 0;
92529251

92539252
rdev_for_each(rdev, mddev) {
9254-
if ((this == NULL || rdev == this) &&
9255-
rdev->raid_disk >= 0 &&
9256-
!test_bit(Blocked, &rdev->flags) &&
9257-
test_bit(Faulty, &rdev->flags) &&
9258-
atomic_read(&rdev->nr_pending)==0) {
9259-
/* Faulty non-Blocked devices with nr_pending == 0
9260-
* never get nr_pending incremented,
9261-
* never get Faulty cleared, and never get Blocked set.
9262-
* So we can synchronize_rcu now rather than once per device
9263-
*/
9264-
remove_some = true;
9265-
set_bit(RemoveSynchronized, &rdev->flags);
9266-
}
9267-
}
9268-
9269-
if (remove_some)
9270-
synchronize_rcu();
9271-
rdev_for_each(rdev, mddev) {
9272-
if ((this == NULL || rdev == this) &&
9273-
(test_bit(RemoveSynchronized, &rdev->flags) ||
9274-
rdev_removeable(rdev))) {
9275-
if (mddev->pers->hot_remove_disk(
9276-
mddev, rdev) == 0) {
9277-
sysfs_unlink_rdev(mddev, rdev);
9278-
rdev->saved_raid_disk = rdev->raid_disk;
9279-
rdev->raid_disk = -1;
9280-
removed++;
9281-
}
9253+
if ((this == NULL || rdev == this) && rdev_removeable(rdev) &&
9254+
!mddev->pers->hot_remove_disk(mddev, rdev)) {
9255+
sysfs_unlink_rdev(mddev, rdev);
9256+
rdev->saved_raid_disk = rdev->raid_disk;
9257+
rdev->raid_disk = -1;
9258+
removed++;
92829259
}
9283-
if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
9284-
clear_bit(RemoveSynchronized, &rdev->flags);
92859260
}
92869261

92879262
if (removed && mddev->kobj.sd)

drivers/md/md.h

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -190,11 +190,6 @@ enum flag_bits {
190190
* than other devices in the array
191191
*/
192192
ClusterRemove,
193-
RemoveSynchronized, /* synchronize_rcu() was called after
194-
* this device was known to be faulty,
195-
* so it is safe to remove without
196-
* another synchronize_rcu() call.
197-
*/
198193
ExternalBbl, /* External metadata provides bad
199194
* block management for a disk
200195
*/

drivers/md/raid1.c

Lines changed: 23 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -609,7 +609,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
609609
int choose_first;
610610
int choose_next_idle;
611611

612-
rcu_read_lock();
613612
/*
614613
* Check if we can balance. We can balance on the whole
615614
* device if no resync is going on, or below the resync window.
@@ -642,7 +641,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
642641
unsigned int pending;
643642
bool nonrot;
644643

645-
rdev = rcu_dereference(conf->mirrors[disk].rdev);
644+
rdev = conf->mirrors[disk].rdev;
646645
if (r1_bio->bios[disk] == IO_BLOCKED
647646
|| rdev == NULL
648647
|| test_bit(Faulty, &rdev->flags))
@@ -773,7 +772,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
773772
}
774773

775774
if (best_disk >= 0) {
776-
rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
775+
rdev = conf->mirrors[best_disk].rdev;
777776
if (!rdev)
778777
goto retry;
779778
atomic_inc(&rdev->nr_pending);
@@ -784,7 +783,6 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
784783

785784
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
786785
}
787-
rcu_read_unlock();
788786
*max_sectors = sectors;
789787

790788
return best_disk;
@@ -1235,14 +1233,12 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
12351233

12361234
if (r1bio_existed) {
12371235
/* Need to get the block device name carefully */
1238-
struct md_rdev *rdev;
1239-
rcu_read_lock();
1240-
rdev = rcu_dereference(conf->mirrors[r1_bio->read_disk].rdev);
1236+
struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
1237+
12411238
if (rdev)
12421239
snprintf(b, sizeof(b), "%pg", rdev->bdev);
12431240
else
12441241
strcpy(b, "???");
1245-
rcu_read_unlock();
12461242
}
12471243

12481244
/*
@@ -1396,10 +1392,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
13961392

13971393
disks = conf->raid_disks * 2;
13981394
blocked_rdev = NULL;
1399-
rcu_read_lock();
14001395
max_sectors = r1_bio->sectors;
14011396
for (i = 0; i < disks; i++) {
1402-
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1397+
struct md_rdev *rdev = conf->mirrors[i].rdev;
14031398

14041399
/*
14051400
* The write-behind io is only attempted on drives marked as
@@ -1465,7 +1460,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
14651460
}
14661461
r1_bio->bios[i] = bio;
14671462
}
1468-
rcu_read_unlock();
14691463

14701464
if (unlikely(blocked_rdev)) {
14711465
/* Wait for this device to become unblocked */
@@ -1617,15 +1611,16 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
16171611
struct r1conf *conf = mddev->private;
16181612
int i;
16191613

1614+
lockdep_assert_held(&mddev->lock);
1615+
16201616
seq_printf(seq, " [%d/%d] [", conf->raid_disks,
16211617
conf->raid_disks - mddev->degraded);
1622-
rcu_read_lock();
16231618
for (i = 0; i < conf->raid_disks; i++) {
1624-
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1619+
struct md_rdev *rdev = READ_ONCE(conf->mirrors[i].rdev);
1620+
16251621
seq_printf(seq, "%s",
16261622
rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
16271623
}
1628-
rcu_read_unlock();
16291624
seq_printf(seq, "]");
16301625
}
16311626

@@ -1691,16 +1686,15 @@ static void print_conf(struct r1conf *conf)
16911686
pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
16921687
conf->raid_disks);
16931688

1694-
rcu_read_lock();
1689+
lockdep_assert_held(&conf->mddev->reconfig_mutex);
16951690
for (i = 0; i < conf->raid_disks; i++) {
1696-
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1691+
struct md_rdev *rdev = conf->mirrors[i].rdev;
16971692
if (rdev)
16981693
pr_debug(" disk %d, wo:%d, o:%d, dev:%pg\n",
16991694
i, !test_bit(In_sync, &rdev->flags),
17001695
!test_bit(Faulty, &rdev->flags),
17011696
rdev->bdev);
17021697
}
1703-
rcu_read_unlock();
17041698
}
17051699

17061700
static void close_sync(struct r1conf *conf)
@@ -1810,7 +1804,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
18101804
*/
18111805
if (rdev->saved_raid_disk < 0)
18121806
conf->fullsync = 1;
1813-
rcu_assign_pointer(p->rdev, rdev);
1807+
WRITE_ONCE(p->rdev, rdev);
18141808
break;
18151809
}
18161810
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1826,7 +1820,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
18261820
rdev->raid_disk = repl_slot;
18271821
err = 0;
18281822
conf->fullsync = 1;
1829-
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
1823+
WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
18301824
}
18311825

18321826
print_conf(conf);
@@ -1862,16 +1856,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
18621856
err = -EBUSY;
18631857
goto abort;
18641858
}
1865-
p->rdev = NULL;
1866-
if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1867-
synchronize_rcu();
1868-
if (atomic_read(&rdev->nr_pending)) {
1869-
/* lost the race, try later */
1870-
err = -EBUSY;
1871-
p->rdev = rdev;
1872-
goto abort;
1873-
}
1874-
}
1859+
WRITE_ONCE(p->rdev, NULL);
18751860
if (conf->mirrors[conf->raid_disks + number].rdev) {
18761861
/* We just removed a device that is being replaced.
18771862
* Move down the replacement. We drain all IO before
@@ -1892,7 +1877,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
18921877
goto abort;
18931878
}
18941879
clear_bit(Replacement, &repl->flags);
1895-
p->rdev = repl;
1880+
WRITE_ONCE(p->rdev, repl);
18961881
conf->mirrors[conf->raid_disks + number].rdev = NULL;
18971882
unfreeze_array(conf);
18981883
}
@@ -2290,24 +2275,22 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
22902275
sector_t first_bad;
22912276
int bad_sectors;
22922277

2293-
rcu_read_lock();
2294-
rdev = rcu_dereference(conf->mirrors[d].rdev);
2278+
rdev = conf->mirrors[d].rdev;
22952279
if (rdev &&
22962280
(test_bit(In_sync, &rdev->flags) ||
22972281
(!test_bit(Faulty, &rdev->flags) &&
22982282
rdev->recovery_offset >= sect + s)) &&
22992283
is_badblock(rdev, sect, s,
23002284
&first_bad, &bad_sectors) == 0) {
23012285
atomic_inc(&rdev->nr_pending);
2302-
rcu_read_unlock();
23032286
if (sync_page_io(rdev, sect, s<<9,
23042287
conf->tmppage, REQ_OP_READ, false))
23052288
success = 1;
23062289
rdev_dec_pending(rdev, mddev);
23072290
if (success)
23082291
break;
2309-
} else
2310-
rcu_read_unlock();
2292+
}
2293+
23112294
d++;
23122295
if (d == conf->raid_disks * 2)
23132296
d = 0;
@@ -2326,29 +2309,24 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
23262309
if (d==0)
23272310
d = conf->raid_disks * 2;
23282311
d--;
2329-
rcu_read_lock();
2330-
rdev = rcu_dereference(conf->mirrors[d].rdev);
2312+
rdev = conf->mirrors[d].rdev;
23312313
if (rdev &&
23322314
!test_bit(Faulty, &rdev->flags)) {
23332315
atomic_inc(&rdev->nr_pending);
2334-
rcu_read_unlock();
23352316
r1_sync_page_io(rdev, sect, s,
23362317
conf->tmppage, WRITE);
23372318
rdev_dec_pending(rdev, mddev);
2338-
} else
2339-
rcu_read_unlock();
2319+
}
23402320
}
23412321
d = start;
23422322
while (d != read_disk) {
23432323
if (d==0)
23442324
d = conf->raid_disks * 2;
23452325
d--;
2346-
rcu_read_lock();
2347-
rdev = rcu_dereference(conf->mirrors[d].rdev);
2326+
rdev = conf->mirrors[d].rdev;
23482327
if (rdev &&
23492328
!test_bit(Faulty, &rdev->flags)) {
23502329
atomic_inc(&rdev->nr_pending);
2351-
rcu_read_unlock();
23522330
if (r1_sync_page_io(rdev, sect, s,
23532331
conf->tmppage, READ)) {
23542332
atomic_add(s, &rdev->corrected_errors);
@@ -2359,8 +2337,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
23592337
rdev->bdev);
23602338
}
23612339
rdev_dec_pending(rdev, mddev);
2362-
} else
2363-
rcu_read_unlock();
2340+
}
23642341
}
23652342
sectors -= s;
23662343
sect += s;
@@ -2741,7 +2718,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
27412718

27422719
r1_bio = raid1_alloc_init_r1buf(conf);
27432720

2744-
rcu_read_lock();
27452721
/*
27462722
* If we get a correctably read error during resync or recovery,
27472723
* we might want to read from a different device. So we
@@ -2762,7 +2738,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
27622738
struct md_rdev *rdev;
27632739
bio = r1_bio->bios[i];
27642740

2765-
rdev = rcu_dereference(conf->mirrors[i].rdev);
2741+
rdev = conf->mirrors[i].rdev;
27662742
if (rdev == NULL ||
27672743
test_bit(Faulty, &rdev->flags)) {
27682744
if (i < conf->raid_disks)
@@ -2820,7 +2796,6 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
28202796
bio->bi_opf |= MD_FAILFAST;
28212797
}
28222798
}
2823-
rcu_read_unlock();
28242799
if (disk < 0)
28252800
disk = wonly;
28262801
r1_bio->read_disk = disk;

0 commit comments

Comments
 (0)