Skip to content

Commit 4b7de80

Browse files
olsajiri authored and borkmann committed
bpf: Fix prog_array_map_poke_run map poke update
Lee pointed out issue found by syscaller [0] hitting BUG in prog array map poke update in prog_array_map_poke_run function due to error value returned from bpf_arch_text_poke function. There's race window where bpf_arch_text_poke can fail due to missing bpf program kallsym symbols, which is accounted for with check for -EINVAL in that BUG_ON call. The problem is that in such case we won't update the tail call jump and cause imbalance for the next tail call update check which will fail with -EBUSY in bpf_arch_text_poke. I'm hitting following race during the program load: CPU 0 CPU 1 bpf_prog_load bpf_check do_misc_fixups prog_array_map_poke_track map_update_elem bpf_fd_array_map_update_elem prog_array_map_poke_run bpf_arch_text_poke returns -EINVAL bpf_prog_kallsyms_add After bpf_arch_text_poke (CPU 1) fails to update the tail call jump, the next poke update fails on expected jump instruction check in bpf_arch_text_poke with -EBUSY and triggers the BUG_ON in prog_array_map_poke_run. Similar race exists on the program unload. Fixing this by moving the update to bpf_arch_poke_desc_update function which makes sure we call __bpf_arch_text_poke that skips the bpf address check. Each architecture has slightly different approach wrt looking up bpf address in bpf_arch_text_poke, so instead of splitting the function or adding new 'checkip' argument in previous version, it seems best to move the whole map_poke_run update as arch specific code. [0] https://syzkaller.appspot.com/bug?extid=97a4fe20470e9bc30810 Fixes: ebf7d1f ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") Reported-by: [email protected] Signed-off-by: Jiri Olsa <[email protected]> Signed-off-by: Daniel Borkmann <[email protected]> Acked-by: Yonghong Song <[email protected]> Cc: Lee Jones <[email protected]> Cc: Maciej Fijalkowski <[email protected]> Link: https://lore.kernel.org/bpf/[email protected]
1 parent e4d008d commit 4b7de80

File tree

3 files changed

+59
-48
lines changed

3 files changed

+59
-48
lines changed

arch/x86/net/bpf_jit_comp.c

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3025,3 +3025,49 @@ void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp
30253025
#endif
30263026
WARN(1, "verification of programs using bpf_throw should have failed\n");
30273027
}
3028+
3029+
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
3030+
struct bpf_prog *new, struct bpf_prog *old)
3031+
{
3032+
u8 *old_addr, *new_addr, *old_bypass_addr;
3033+
int ret;
3034+
3035+
old_bypass_addr = old ? NULL : poke->bypass_addr;
3036+
old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
3037+
new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
3038+
3039+
/*
3040+
* On program loading or teardown, the program's kallsym entry
3041+
* might not be in place, so we use __bpf_arch_text_poke to skip
3042+
* the kallsyms check.
3043+
*/
3044+
if (new) {
3045+
ret = __bpf_arch_text_poke(poke->tailcall_target,
3046+
BPF_MOD_JUMP,
3047+
old_addr, new_addr);
3048+
BUG_ON(ret < 0);
3049+
if (!old) {
3050+
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
3051+
BPF_MOD_JUMP,
3052+
poke->bypass_addr,
3053+
NULL);
3054+
BUG_ON(ret < 0);
3055+
}
3056+
} else {
3057+
ret = __bpf_arch_text_poke(poke->tailcall_bypass,
3058+
BPF_MOD_JUMP,
3059+
old_bypass_addr,
3060+
poke->bypass_addr);
3061+
BUG_ON(ret < 0);
3062+
/* let other CPUs finish the execution of program
3063+
* so that it will not possible to expose them
3064+
* to invalid nop, stack unwind, nop state
3065+
*/
3066+
if (!ret)
3067+
synchronize_rcu();
3068+
ret = __bpf_arch_text_poke(poke->tailcall_target,
3069+
BPF_MOD_JUMP,
3070+
old_addr, NULL);
3071+
BUG_ON(ret < 0);
3072+
}
3073+
}

include/linux/bpf.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3175,6 +3175,9 @@ enum bpf_text_poke_type {
31753175
int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t,
31763176
void *addr1, void *addr2);
31773177

3178+
void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
3179+
struct bpf_prog *new, struct bpf_prog *old);
3180+
31783181
void *bpf_arch_text_copy(void *dst, void *src, size_t len);
31793182
int bpf_arch_text_invalidate(void *dst, size_t len);
31803183

kernel/bpf/arraymap.c

Lines changed: 10 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1012,11 +1012,16 @@ static void prog_array_map_poke_untrack(struct bpf_map *map,
10121012
mutex_unlock(&aux->poke_mutex);
10131013
}
10141014

1015+
void __weak bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
1016+
struct bpf_prog *new, struct bpf_prog *old)
1017+
{
1018+
WARN_ON_ONCE(1);
1019+
}
1020+
10151021
static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
10161022
struct bpf_prog *old,
10171023
struct bpf_prog *new)
10181024
{
1019-
u8 *old_addr, *new_addr, *old_bypass_addr;
10201025
struct prog_poke_elem *elem;
10211026
struct bpf_array_aux *aux;
10221027

@@ -1025,7 +1030,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
10251030

10261031
list_for_each_entry(elem, &aux->poke_progs, list) {
10271032
struct bpf_jit_poke_descriptor *poke;
1028-
int i, ret;
1033+
int i;
10291034

10301035
for (i = 0; i < elem->aux->size_poke_tab; i++) {
10311036
poke = &elem->aux->poke_tab[i];
@@ -1044,21 +1049,10 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
10441049
* activated, so tail call updates can arrive from here
10451050
* while JIT is still finishing its final fixup for
10461051
* non-activated poke entries.
1047-
* 3) On program teardown, the program's kallsym entry gets
1048-
* removed out of RCU callback, but we can only untrack
1049-
* from sleepable context, therefore bpf_arch_text_poke()
1050-
* might not see that this is in BPF text section and
1051-
* bails out with -EINVAL. As these are unreachable since
1052-
* RCU grace period already passed, we simply skip them.
1053-
* 4) Also programs reaching refcount of zero while patching
1052+
* 3) Also programs reaching refcount of zero while patching
10541053
* is in progress is okay since we're protected under
10551054
* poke_mutex and untrack the programs before the JIT
1056-
* buffer is freed. When we're still in the middle of
1057-
* patching and suddenly kallsyms entry of the program
1058-
* gets evicted, we just skip the rest which is fine due
1059-
* to point 3).
1060-
* 5) Any other error happening below from bpf_arch_text_poke()
1061-
* is a unexpected bug.
1055+
* buffer is freed.
10621056
*/
10631057
if (!READ_ONCE(poke->tailcall_target_stable))
10641058
continue;
@@ -1068,39 +1062,7 @@ static void prog_array_map_poke_run(struct bpf_map *map, u32 key,
10681062
poke->tail_call.key != key)
10691063
continue;
10701064

1071-
old_bypass_addr = old ? NULL : poke->bypass_addr;
1072-
old_addr = old ? (u8 *)old->bpf_func + poke->adj_off : NULL;
1073-
new_addr = new ? (u8 *)new->bpf_func + poke->adj_off : NULL;
1074-
1075-
if (new) {
1076-
ret = bpf_arch_text_poke(poke->tailcall_target,
1077-
BPF_MOD_JUMP,
1078-
old_addr, new_addr);
1079-
BUG_ON(ret < 0 && ret != -EINVAL);
1080-
if (!old) {
1081-
ret = bpf_arch_text_poke(poke->tailcall_bypass,
1082-
BPF_MOD_JUMP,
1083-
poke->bypass_addr,
1084-
NULL);
1085-
BUG_ON(ret < 0 && ret != -EINVAL);
1086-
}
1087-
} else {
1088-
ret = bpf_arch_text_poke(poke->tailcall_bypass,
1089-
BPF_MOD_JUMP,
1090-
old_bypass_addr,
1091-
poke->bypass_addr);
1092-
BUG_ON(ret < 0 && ret != -EINVAL);
1093-
/* let other CPUs finish the execution of program
1094-
* so that it will not possible to expose them
1095-
* to invalid nop, stack unwind, nop state
1096-
*/
1097-
if (!ret)
1098-
synchronize_rcu();
1099-
ret = bpf_arch_text_poke(poke->tailcall_target,
1100-
BPF_MOD_JUMP,
1101-
old_addr, NULL);
1102-
BUG_ON(ret < 0 && ret != -EINVAL);
1103-
}
1065+
bpf_arch_poke_desc_update(poke, new, old);
11041066
}
11051067
}
11061068
}

0 commit comments

Comments
 (0)