
Commit 08701e3

Merge branch 'bpf-introduce-bpf-arena'
Alexei Starovoitov says:

====================
bpf: Introduce BPF arena.

From: Alexei Starovoitov <[email protected]>

v2->v3:
- contains bpf bits only, but cc-ing past audience for continuity
- since prerequisite patches landed, this series focuses on the main
  functionality of bpf_arena
- adopted Andrii's approach to supporting arena in libbpf
- simplified LLVM support: instead of two instructions it's now only one
- switched to cond_break (instead of open-coded iters) in selftests
- implemented several follow-ups that will be sent after this set:
  . remember the first IP and bpf insn that faulted in the arena;
    report to user space via bpftool
  . copy, paste and tweak glob_match() aka mini-regex as a selftests/bpf
- see patch 1 for a detailed description of bpf_arena

v1->v2:
- Improved commit log with reasons for using vmap_pages_range() in arena.
  Thanks to Johannes
- Added support for __arena global variables in bpf programs
- Fixed race conditions spotted by Barret
- Fixed wrap32 issue spotted by Barret
- Fixed bpf_map_mmap_sz() the way Andrii suggested

The work on bpf_arena was inspired by Barret's work:
https://github.com/google/ghost-userspace/blob/main/lib/queue.bpf.h
which implements queues, lists and AVL trees entirely as bpf programs,
using a giant bpf array map and integer indices instead of pointers.

bpf_arena is a sparse array that allows normal C pointers to be used
to build such data structures. The last few patches implement a
page_frag allocator, a linked list and a hash table as bpf programs.

v1:
bpf programs have multiple options to communicate with user space:
- Various ring buffers (perf, ftrace, bpf): the data is streamed
  unidirectionally from bpf to user space.
- Hash map: the bpf program populates elements, and user space consumes
  them via bpf syscall.
- mmap()-ed array map: libbpf creates an array map that is directly
  accessed by the bpf program and mmap-ed to user space. It's the
  fastest way. Its disadvantage is that memory for the whole array is
  reserved at the start.
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Andrii Nakryiko <[email protected]>
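For context on what the JIT changes below enable, here is a minimal sketch of bpf-side arena usage, modeled on the selftests in this series. The map definition, the __arena address-space tag and the bpf_arena_alloc_pages() kfunc are assumptions taken from the series (selftests carry them in bpf_arena_common.h) rather than guaranteed stable API:

    /* Sketch of a bpf program sharing a counter with user space via an
     * arena; compiled with clang as a bpf program. */
    #include <vmlinux.h>
    #include <bpf/bpf_helpers.h>

    #define __arena __attribute__((address_space(1)))
    #define NUMA_NO_NODE (-1)

    void __arena *bpf_arena_alloc_pages(void *map, void __arena *addr,
                                        __u32 page_cnt, int node_id,
                                        __u64 flags) __ksym;

    struct {
            __uint(type, BPF_MAP_TYPE_ARENA);
            __uint(map_flags, BPF_F_MMAPABLE);
            __uint(max_entries, 10);        /* arena size in pages */
    } arena SEC(".maps");

    int __arena *shared_counter;            /* plain C pointer into the arena */

    SEC("syscall")
    int bump(void *ctx)
    {
            if (!shared_counter) {
                    /* allocate one arena page; user space sees it via mmap() */
                    shared_counter = bpf_arena_alloc_pages(&arena, NULL, 1,
                                                           NUMA_NO_NODE, 0);
                    if (!shared_counter)
                            return 1;
            }
            (*shared_counter)++;            /* ordinary pointer dereference */
            return 0;
    }

    char _license[] SEC("license") = "GPL";

The dereference of shared_counter is exactly the kind of access the PROBE_MEM32 instructions below implement: the verifier rewrites arena loads and stores into base-plus-r12 form, and the JIT emits guarded SIB-addressed instructions for them.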
2 parents: 365c2b3 + 8df839a

37 files changed: 2,028 additions, 40 deletions

arch/x86/net/bpf_jit_comp.c

Lines changed: 230 additions & 1 deletion
@@ -113,6 +113,7 @@ static int bpf_size_to_x86_bytes(int bpf_size)
 /* Pick a register outside of BPF range for JIT internal work */
 #define AUX_REG (MAX_BPF_JIT_REG + 1)
 #define X86_REG_R9 (MAX_BPF_JIT_REG + 2)
+#define X86_REG_R12 (MAX_BPF_JIT_REG + 3)
 
 /*
  * The following table maps BPF registers to x86-64 registers.
@@ -139,6 +140,7 @@ static const int reg2hex[] = {
         [BPF_REG_AX] = 2, /* R10 temp register */
         [AUX_REG] = 3,    /* R11 temp register */
         [X86_REG_R9] = 1, /* R9 register, 6th function argument */
+        [X86_REG_R12] = 4, /* R12 callee saved */
 };
 
 static const int reg2pt_regs[] = {
@@ -167,6 +169,7 @@ static bool is_ereg(u32 reg)
                              BIT(BPF_REG_8) |
                              BIT(BPF_REG_9) |
                              BIT(X86_REG_R9) |
+                             BIT(X86_REG_R12) |
                              BIT(BPF_REG_AX));
 }
 
@@ -205,6 +208,17 @@ static u8 add_2mod(u8 byte, u32 r1, u32 r2)
         return byte;
 }
 
+static u8 add_3mod(u8 byte, u32 r1, u32 r2, u32 index)
+{
+        if (is_ereg(r1))
+                byte |= 1;
+        if (is_ereg(index))
+                byte |= 2;
+        if (is_ereg(r2))
+                byte |= 4;
+        return byte;
+}
+
 /* Encode 'dst_reg' register into x86-64 opcode 'byte' */
 static u8 add_1reg(u8 byte, u32 dst_reg)
 {
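The three bits add_3mod() sets are the REX.B, REX.X and REX.R prefix bits, which extend the SIB base field, the SIB index field and the ModRM reg field respectively so the instruction can reach r8-r15. A self-contained sketch of the same computation, using raw x86 register numbers instead of the kernel's BPF register enums (is_ereg_demo() is a hypothetical stand-in for is_ereg()):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Stand-in for is_ereg(): true for the extended registers r8-r15. */
    static bool is_ereg_demo(uint32_t reg) { return reg >= 8; }

    /* Same bit layout as add_3mod(): bit 0 = REX.B (base),
     * bit 1 = REX.X (index), bit 2 = REX.R (reg). */
    static uint8_t add_3mod_demo(uint8_t byte, uint32_t base, uint32_t reg,
                                 uint32_t index)
    {
            if (is_ereg_demo(base))
                    byte |= 1;      /* REX.B */
            if (is_ereg_demo(index))
                    byte |= 2;      /* REX.X */
            if (is_ereg_demo(reg))
                    byte |= 4;      /* REX.R */
            return byte;
    }

    int main(void)
    {
            /* mov rax, [rax + r12]: start from REX.W (0x48) for a 64-bit
             * op; r12 as the SIB index sets only REX.X. */
            assert(add_3mod_demo(0x48, /*base=rax*/0, /*reg=rax*/0,
                                 /*index=r12*/12) == 0x4A);
            return 0;
    }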
@@ -645,6 +659,8 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
                 pop_r12(&prog);
         } else {
                 pop_callee_regs(&prog, callee_regs_used);
+                if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
+                        pop_r12(&prog);
         }
 
         EMIT1(0x58);    /* pop rax */
@@ -704,6 +720,8 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
                 pop_r12(&prog);
         } else {
                 pop_callee_regs(&prog, callee_regs_used);
+                if (bpf_arena_get_kern_vm_start(bpf_prog->aux->arena))
+                        pop_r12(&prog);
         }
 
         EMIT1(0x58);    /* pop rax */
@@ -887,6 +905,18 @@ static void emit_insn_suffix(u8 **pprog, u32 ptr_reg, u32 val_reg, int off)
         *pprog = prog;
 }
 
+static void emit_insn_suffix_SIB(u8 **pprog, u32 ptr_reg, u32 val_reg, u32 index_reg, int off)
+{
+        u8 *prog = *pprog;
+
+        if (is_imm8(off)) {
+                EMIT3(add_2reg(0x44, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
+        } else {
+                EMIT2_off32(add_2reg(0x84, BPF_REG_0, val_reg), add_2reg(0, ptr_reg, index_reg) /* SIB */, off);
+        }
+        *pprog = prog;
+}
+
 /*
  * Emit a REX byte if it will be necessary to address these registers
  */
@@ -968,6 +998,37 @@ static void emit_ldsx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
         *pprog = prog;
 }
 
+static void emit_ldx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+        u8 *prog = *pprog;
+
+        switch (size) {
+        case BPF_B:
+                /* movzx rax, byte ptr [rax + r12 + off] */
+                EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB6);
+                break;
+        case BPF_H:
+                /* movzx rax, word ptr [rax + r12 + off] */
+                EMIT3(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x0F, 0xB7);
+                break;
+        case BPF_W:
+                /* mov eax, dword ptr [rax + r12 + off] */
+                EMIT2(add_3mod(0x40, src_reg, dst_reg, index_reg), 0x8B);
+                break;
+        case BPF_DW:
+                /* mov rax, qword ptr [rax + r12 + off] */
+                EMIT2(add_3mod(0x48, src_reg, dst_reg, index_reg), 0x8B);
+                break;
+        }
+        emit_insn_suffix_SIB(&prog, src_reg, dst_reg, index_reg, off);
+        *pprog = prog;
+}
+
+static void emit_ldx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+        emit_ldx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
+}
+
 /* STX: *(u8*)(dst_reg + off) = src_reg */
 static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
 {
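Putting add_3mod() and emit_insn_suffix_SIB() together, a BPF_DW load through r12 comes out as a classic REX + opcode + ModRM + SIB + displacement sequence. Worked out by hand for dst_reg == src_reg == rax and off == 8 (byte values are my own arithmetic from the encoders above, not taken from the patch):

    /* mov rax, qword ptr [rax + r12 + 8], as emit_ldx_r12() would emit it: */
    static const unsigned char example_insn[] = {
            0x4A,   /* REX: W=1 (64-bit op), X=1 (r12 is the SIB index)     */
            0x8B,   /* MOV r64, r/m64                                       */
            0x44,   /* ModRM: mod=01 (disp8), reg=000 (rax), r/m=100 (SIB)  */
            0x20,   /* SIB: scale=0, index=100 (r12 & 7), base=000 (rax)    */
            0x08,   /* disp8                                                */
    };

The 0x44 vs 0x84 split in emit_insn_suffix_SIB() is the mod field: mod=01 carries an 8-bit displacement, mod=10 a 32-bit one, and r/m=100 in both cases signals that a SIB byte follows.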
@@ -1002,6 +1063,71 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
         *pprog = prog;
 }
 
+/* STX: *(u8*)(dst_reg + index_reg + off) = src_reg */
+static void emit_stx_index(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, u32 index_reg, int off)
+{
+        u8 *prog = *pprog;
+
+        switch (size) {
+        case BPF_B:
+                /* mov byte ptr [rax + r12 + off], al */
+                EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x88);
+                break;
+        case BPF_H:
+                /* mov word ptr [rax + r12 + off], ax */
+                EMIT3(0x66, add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
+                break;
+        case BPF_W:
+                /* mov dword ptr [rax + r12 + off], eax */
+                EMIT2(add_3mod(0x40, dst_reg, src_reg, index_reg), 0x89);
+                break;
+        case BPF_DW:
+                /* mov qword ptr [rax + r12 + off], rax */
+                EMIT2(add_3mod(0x48, dst_reg, src_reg, index_reg), 0x89);
+                break;
+        }
+        emit_insn_suffix_SIB(&prog, dst_reg, src_reg, index_reg, off);
+        *pprog = prog;
+}
+
+static void emit_stx_r12(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off)
+{
+        emit_stx_index(pprog, size, dst_reg, src_reg, X86_REG_R12, off);
+}
+
+/* ST: *(u8*)(dst_reg + index_reg + off) = imm32 */
+static void emit_st_index(u8 **pprog, u32 size, u32 dst_reg, u32 index_reg, int off, int imm)
+{
+        u8 *prog = *pprog;
+
+        switch (size) {
+        case BPF_B:
+                /* mov byte ptr [rax + r12 + off], imm8 */
+                EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC6);
+                break;
+        case BPF_H:
+                /* mov word ptr [rax + r12 + off], imm16 */
+                EMIT3(0x66, add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
+                break;
+        case BPF_W:
+                /* mov dword ptr [rax + r12 + off], imm32 */
+                EMIT2(add_3mod(0x40, dst_reg, 0, index_reg), 0xC7);
+                break;
+        case BPF_DW:
+                /* mov qword ptr [rax + r12 + off], imm32 */
+                EMIT2(add_3mod(0x48, dst_reg, 0, index_reg), 0xC7);
+                break;
+        }
+        emit_insn_suffix_SIB(&prog, dst_reg, 0, index_reg, off);
+        EMIT(imm, bpf_size_to_x86_bytes(size));
+        *pprog = prog;
+}
+
+static void emit_st_r12(u8 **pprog, u32 size, u32 dst_reg, int off, int imm)
+{
+        emit_st_index(pprog, size, dst_reg, X86_REG_R12, off, imm);
+}
+
 static int emit_atomic(u8 **pprog, u8 atomic_op,
                        u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size)
 {
@@ -1043,12 +1169,15 @@ static int emit_atomic(u8 **pprog, u8 atomic_op,
         return 0;
 }
 
+#define DONT_CLEAR 1
+
 bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs)
 {
         u32 reg = x->fixup >> 8;
 
         /* jump over faulting load and clear dest register */
-        *(unsigned long *)((void *)regs + reg) = 0;
+        if (reg != DONT_CLEAR)
+                *(unsigned long *)((void *)regs + reg) = 0;
         regs->ip += x->fixup & 0xff;
         return true;
 }
@@ -1147,11 +1276,15 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
         bool tail_call_seen = false;
         bool seen_exit = false;
         u8 temp[BPF_MAX_INSN_SIZE + BPF_INSN_SAFETY];
+        u64 arena_vm_start, user_vm_start;
         int i, excnt = 0;
         int ilen, proglen = 0;
         u8 *prog = temp;
         int err;
 
+        arena_vm_start = bpf_arena_get_kern_vm_start(bpf_prog->aux->arena);
+        user_vm_start = bpf_arena_get_user_vm_start(bpf_prog->aux->arena);
+
         detect_reg_usage(insn, insn_cnt, callee_regs_used,
                          &tail_call_seen);
 
@@ -1172,8 +1305,13 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                 push_r12(&prog);
                 push_callee_regs(&prog, all_callee_regs_used);
         } else {
+                if (arena_vm_start)
+                        push_r12(&prog);
                 push_callee_regs(&prog, callee_regs_used);
         }
+        if (arena_vm_start)
+                emit_mov_imm64(&prog, X86_REG_R12,
+                               arena_vm_start >> 32, (u32) arena_vm_start);
 
         ilen = prog - temp;
         if (rw_image)
@@ -1213,6 +1351,40 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
                         break;
 
                 case BPF_ALU64 | BPF_MOV | BPF_X:
+                        if (insn->off == BPF_ADDR_SPACE_CAST &&
+                            insn->imm == 1U << 16) {
+                                if (dst_reg != src_reg)
+                                        /* 32-bit mov */
+                                        emit_mov_reg(&prog, false, dst_reg, src_reg);
+                                /* shl dst_reg, 32 */
+                                maybe_emit_1mod(&prog, dst_reg, true);
+                                EMIT3(0xC1, add_1reg(0xE0, dst_reg), 32);
+
+                                /* or dst_reg, user_vm_start */
+                                maybe_emit_1mod(&prog, dst_reg, true);
+                                if (is_axreg(dst_reg))
+                                        EMIT1_off32(0x0D, user_vm_start >> 32);
+                                else
+                                        EMIT2_off32(0x81, add_1reg(0xC8, dst_reg), user_vm_start >> 32);
+
+                                /* rol dst_reg, 32 */
+                                maybe_emit_1mod(&prog, dst_reg, true);
+                                EMIT3(0xC1, add_1reg(0xC0, dst_reg), 32);
+
+                                /* xor r11, r11 */
+                                EMIT3(0x4D, 0x31, 0xDB);
+
+                                /* test dst_reg32, dst_reg32; check if lower 32-bit are zero */
+                                maybe_emit_mod(&prog, dst_reg, dst_reg, false);
+                                EMIT2(0x85, add_2reg(0xC0, dst_reg, dst_reg));
+
+                                /* cmove r11, dst_reg; if so, set dst_reg to zero */
+                                /* WARNING: Intel swapped src/dst register encoding in CMOVcc !!! */
+                                maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
+                                EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
+                                break;
+                        }
+                        fallthrough;
                 case BPF_ALU | BPF_MOV | BPF_X:
                         if (insn->off == 0)
                                 emit_mov_reg(&prog,
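The addr_space_cast sequence above (shl, or, rol, then a cmove that preserves NULL) is easier to read as plain C. A model of the user-visible semantics, assuming user_vm_start is a canonical user-space base address for the arena and the lower 32 bits of a full arena pointer carry the offset:

    #include <stdint.h>

    /* C model of the JITed cast: widen a 32-bit arena offset into a full
     * pointer by installing the upper 32 bits of user_vm_start, while
     * mapping arena-NULL (lower 32 bits == 0) back to NULL. */
    static inline uint64_t addr_space_cast_model(uint64_t src,
                                                 uint64_t user_vm_start)
    {
            uint32_t lo = (uint32_t)src;

            return lo ? ((user_vm_start & 0xffffffff00000000ULL) | lo) : 0;
    }

The shl/or/rol dance exists because there is no single instruction that replaces only the upper 32 bits of a register: the low half is rotated out of the way, the base bits are OR-ed in, and the rotate is undone.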
@@ -1564,6 +1736,56 @@ st:                     if (is_imm8(insn->off))
                                 emit_stx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
                         break;
 
+                case BPF_ST | BPF_PROBE_MEM32 | BPF_B:
+                case BPF_ST | BPF_PROBE_MEM32 | BPF_H:
+                case BPF_ST | BPF_PROBE_MEM32 | BPF_W:
+                case BPF_ST | BPF_PROBE_MEM32 | BPF_DW:
+                        start_of_ldx = prog;
+                        emit_st_r12(&prog, BPF_SIZE(insn->code), dst_reg, insn->off, insn->imm);
+                        goto populate_extable;
+
+                        /* LDX: dst_reg = *(u8*)(src_reg + r12 + off) */
+                case BPF_LDX | BPF_PROBE_MEM32 | BPF_B:
+                case BPF_LDX | BPF_PROBE_MEM32 | BPF_H:
+                case BPF_LDX | BPF_PROBE_MEM32 | BPF_W:
+                case BPF_LDX | BPF_PROBE_MEM32 | BPF_DW:
+                case BPF_STX | BPF_PROBE_MEM32 | BPF_B:
+                case BPF_STX | BPF_PROBE_MEM32 | BPF_H:
+                case BPF_STX | BPF_PROBE_MEM32 | BPF_W:
+                case BPF_STX | BPF_PROBE_MEM32 | BPF_DW:
+                        start_of_ldx = prog;
+                        if (BPF_CLASS(insn->code) == BPF_LDX)
+                                emit_ldx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+                        else
+                                emit_stx_r12(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
+populate_extable:
+                        {
+                                struct exception_table_entry *ex;
+                                u8 *_insn = image + proglen + (start_of_ldx - temp);
+                                s64 delta;
+
+                                if (!bpf_prog->aux->extable)
+                                        break;
+
+                                if (excnt >= bpf_prog->aux->num_exentries) {
+                                        pr_err("mem32 extable bug\n");
+                                        return -EFAULT;
+                                }
+                                ex = &bpf_prog->aux->extable[excnt++];
+
+                                delta = _insn - (u8 *)&ex->insn;
+                                /* switch ex to rw buffer for writes */
+                                ex = (void *)rw_image + ((void *)ex - (void *)image);
+
+                                ex->insn = delta;
+
+                                ex->data = EX_TYPE_BPF;
+
+                                ex->fixup = (prog - start_of_ldx) |
+                                        ((BPF_CLASS(insn->code) == BPF_LDX ? reg2pt_regs[dst_reg] : DONT_CLEAR) << 8);
+                        }
+                        break;
+
                 /* LDX: dst_reg = *(u8*)(src_reg + off) */
                 case BPF_LDX | BPF_MEM | BPF_B:
                 case BPF_LDX | BPF_PROBE_MEM | BPF_B:
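The fixup word built here packs two facts: the low byte holds the length of the JITed access, which ex_handler_bpf() adds to regs->ip to skip the faulting instruction, and the upper bits hold the pt_regs offset of the destination register to clear on a faulting load, or DONT_CLEAR for stores. DONT_CLEAR == 1 cannot collide with a real offset because pt_regs fields are long-aligned, so genuine offsets are multiples of 8. An illustrative sketch of the packing and its consumption (names are mine; struct pt_regs is the kernel's, from <asm/ptrace.h>):

    #define DONT_CLEAR 1

    /* Pack: done at JIT time, per faulting-capable arena access. */
    static unsigned int pack_fixup(unsigned int insn_len, int is_load,
                                   unsigned int pt_regs_off)
    {
            return insn_len | ((is_load ? pt_regs_off : DONT_CLEAR) << 8);
    }

    /* Unpack: what ex_handler_bpf() effectively does at fault time. */
    static void apply_fixup(struct pt_regs *regs, unsigned int fixup)
    {
            unsigned int reg = fixup >> 8;

            if (reg != DONT_CLEAR)
                    *(unsigned long *)((void *)regs + reg) = 0;     /* dst = 0 */
            regs->ip += fixup & 0xff;       /* skip the faulting access */
    }

The net effect matches the arena contract described in the series: a load from an unmapped arena page reads as zero, and a store to one is silently dropped, rather than either faulting the kernel.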
@@ -2036,6 +2258,8 @@ st:                     if (is_imm8(insn->off))
                                 pop_r12(&prog);
                         } else {
                                 pop_callee_regs(&prog, callee_regs_used);
+                                if (arena_vm_start)
+                                        pop_r12(&prog);
                         }
                         EMIT1(0xC9);    /* leave */
                         emit_return(&prog, image + addrs[i - 1] + (prog - temp));
@@ -3243,6 +3467,11 @@ void bpf_arch_poke_desc_update(struct bpf_jit_poke_descriptor *poke,
         }
 }
 
+bool bpf_jit_supports_arena(void)
+{
+        return true;
+}
+
 bool bpf_jit_supports_ptr_xchg(void)
 {
         return true;
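bpf_jit_supports_arena() is the per-architecture opt-in knob; the core presumably provides a __weak default returning false, following the existing bpf_jit_supports_*() pattern, so arena maps can be rejected on JITs without this support. A sketch of that assumed counterpart (not part of this file's diff):

    /* Assumed default in kernel/bpf/core.c, mirroring the other
     * bpf_jit_supports_*() hooks; the x86-64 JIT above overrides it. */
    bool __weak bpf_jit_supports_arena(void)
    {
            return false;
    }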
