Skip to content

Commit 2c30417

Browse files
Hou TaoAlexei Starovoitov
authored andcommitted
bpf: Support atomic update for htab of maps
As reported by Cody Haas [1], when there is concurrent map lookup and map update operation in an existing element for htab of maps, the map lookup procedure may return -ENOENT unexpectedly. The root cause is twofold: 1) the update of existing element involves two separated list operation In htab_map_update_elem(), it first inserts the new element at the head of list, then it deletes the old element. Therefore, it is possible a lookup operation has already iterated to the middle of the list when a concurrent update operation begins, and the lookup operation will fail to find the target element. 2) the immediate reuse of htab element. It is more subtle. Even through the lookup operation finds the old element, it is possible that the target element has been removed by a concurrent update operation, and the element has been reused immediately by other update operation which runs on the same CPU as the previous update operation, and the element is inserted into the same bucket list. After these steps above, when the lookup operation tries to compare the key in the old element with the expected key, the match will fail because the key in the old element have been overwritten by other update operation. The two-step update process is relatively straightforward to address. The more challenging aspect is the immediate reuse. As Alexei pointed out: So since 2022 both prealloc and no_prealloc reuse elements. We can consider a new flag for the hash map like F_REUSE_AFTER_RCU_GP that will use _rcu() flavor of freeing into bpf_ma, but it has to have a strong reason. Given that htab of maps doesn't support special field in value and directly stores the inner map pointer in htab_element, just do in-place update for htab of maps instead of attempting to address the immediate reuse issue. [1]: https://lore.kernel.org/xdp-newbies/CAH7f-ULFTwKdoH_t2SFc5rWCVYLEg-14d1fBYWH2eekudsnTRg@mail.gmail.com/ Acked-by: Andrii Nakryiko <[email protected]> Signed-off-by: Hou Tao <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Alexei Starovoitov <[email protected]>
1 parent 5771e30 commit 2c30417

File tree

1 file changed

+21
-23
lines changed

1 file changed

+21
-23
lines changed

kernel/bpf/hashtab.c

Lines changed: 21 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,10 +1076,9 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
10761076
u64 map_flags)
10771077
{
10781078
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
1079-
struct htab_elem *l_new = NULL, *l_old;
1079+
struct htab_elem *l_new, *l_old;
10801080
struct hlist_nulls_head *head;
10811081
unsigned long flags;
1082-
void *old_map_ptr;
10831082
struct bucket *b;
10841083
u32 key_size, hash;
10851084
int ret;
@@ -1160,24 +1159,14 @@ static long htab_map_update_elem(struct bpf_map *map, void *key, void *value,
11601159
hlist_nulls_del_rcu(&l_old->hash_node);
11611160

11621161
/* l_old has already been stashed in htab->extra_elems, free
1163-
* its special fields before it is available for reuse. Also
1164-
* save the old map pointer in htab of maps before unlock
1165-
* and release it after unlock.
1162+
* its special fields before it is available for reuse.
11661163
*/
1167-
old_map_ptr = NULL;
1168-
if (htab_is_prealloc(htab)) {
1169-
if (map->ops->map_fd_put_ptr)
1170-
old_map_ptr = fd_htab_map_get_ptr(map, l_old);
1164+
if (htab_is_prealloc(htab))
11711165
check_and_free_fields(htab, l_old);
1172-
}
11731166
}
11741167
htab_unlock_bucket(b, flags);
1175-
if (l_old) {
1176-
if (old_map_ptr)
1177-
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
1178-
if (!htab_is_prealloc(htab))
1179-
free_htab_elem(htab, l_old);
1180-
}
1168+
if (l_old && !htab_is_prealloc(htab))
1169+
free_htab_elem(htab, l_old);
11811170
return 0;
11821171
err:
11831172
htab_unlock_bucket(b, flags);
@@ -1265,6 +1254,7 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
12651254
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
12661255
struct htab_elem *l_new, *l_old;
12671256
struct hlist_nulls_head *head;
1257+
void *old_map_ptr = NULL;
12681258
unsigned long flags;
12691259
struct bucket *b;
12701260
u32 key_size, hash;
@@ -1296,8 +1286,15 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
12961286

12971287
if (l_old) {
12981288
/* Update value in-place */
1299-
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1300-
value, onallcpus);
1289+
if (percpu) {
1290+
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
1291+
value, onallcpus);
1292+
} else {
1293+
void **inner_map_pptr = htab_elem_value(l_old, key_size);
1294+
1295+
old_map_ptr = *inner_map_pptr;
1296+
WRITE_ONCE(*inner_map_pptr, *(void **)value);
1297+
}
13011298
} else {
13021299
l_new = alloc_htab_elem(htab, key, value, key_size,
13031300
hash, percpu, onallcpus, NULL);
@@ -1309,6 +1306,8 @@ static long htab_map_update_elem_in_place(struct bpf_map *map, void *key,
13091306
}
13101307
err:
13111308
htab_unlock_bucket(b, flags);
1309+
if (old_map_ptr)
1310+
map->ops->map_fd_put_ptr(map, old_map_ptr, true);
13121311
return ret;
13131312
}
13141313

@@ -2531,24 +2530,23 @@ int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
25312530
return ret;
25322531
}
25332532

2534-
/* only called from syscall */
2533+
/* Only called from syscall */
25352534
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
25362535
void *key, void *value, u64 map_flags)
25372536
{
25382537
void *ptr;
25392538
int ret;
2540-
u32 ufd = *(u32 *)value;
25412539

2542-
ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
2540+
ptr = map->ops->map_fd_get_ptr(map, map_file, *(int *)value);
25432541
if (IS_ERR(ptr))
25442542
return PTR_ERR(ptr);
25452543

25462544
/* The htab bucket lock is always held during update operations in fd
25472545
* htab map, and the following rcu_read_lock() is only used to avoid
2548-
* the WARN_ON_ONCE in htab_map_update_elem().
2546+
* the WARN_ON_ONCE in htab_map_update_elem_in_place().
25492547
*/
25502548
rcu_read_lock();
2551-
ret = htab_map_update_elem(map, key, &ptr, map_flags);
2549+
ret = htab_map_update_elem_in_place(map, key, &ptr, map_flags, false, false);
25522550
rcu_read_unlock();
25532551
if (ret)
25542552
map->ops->map_fd_put_ptr(map, ptr, false);

0 commit comments

Comments
 (0)