Skip to content

Commit d74a790

Browse files
committed
Merge branch 'bpf-perf'
Kaixu Xia says: ==================== bpf: Introduce the new ability of eBPF programs to access hardware PMU counter This patchset is based on the net-next: git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git commit 9dc20a6. Previous patch v6 url: https://lkml.org/lkml/2015/8/4/188 changes in V7: - rebase the whole patch set to net-next tree (9dc20a6); - split out the core perf APIs into Patch 1/5; - change the return value of function perf_event_attrs() from struct perf_event_attr * to const struct perf_event_attr * in Patch 1/5; - rename the function perf_event_read_internal() to perf_event_read_local() and rewrite it in Patch 1/5; - rename the function check_func_limit() to check_map_func_compatibility() and remove the unnecessary passing of a pointer to a pointer in Patch 4/5; changes in V6: - make the Patch 1/4 commit message more meaningful and readable; - remove the unnecessary comment in Patch 2/4 and make it clean; - declare the function perf_event_release_kernel() in include/linux/perf_event.h to fix the build error when CONFIG_PERF_EVENTS isn't configured in Patch 2/4; - add function perf_event_attrs() to get the struct perf_event_attr in Patch 2/4. 
- move the related code from kernel/trace/bpf_trace.c to kernel/events/core.c and add function perf_event_read_internal() to avoid poking inside of the event outside of perf code in Patch 3/4; - generalize the func & map match-pair with an array in Patch 3/4; changes in V5: - move struct fd_array_map_ops* fd_ops to bpf_map; - move array perf event decrement refcnt function to map_free; - fix the NULL ptr of perf_event_get(); - move bpf_perf_event_read() to kernel/bpf/bpf_trace.c; - get rid of the remaining struct bpf_prog; - remove the unnecessary cast on void *; changes in V4: - make the bpf_prog_array_map more generic; - fix the bug of event refcnt leak; - use more useful errno in bpf_perf_event_read(); changes in V3: - collapse V2 patches 1-3 into one; - drop the function map->ops->map_traverse_elem() and release the struct perf_event in map_free; - only allow access to bpf_perf_event_read() from programs; - update the perf_event_array_map elem via xchg(); - pass index directly to bpf_perf_event_read() instead of MAP_KEY; changes in V2: - put atomic_long_inc_not_zero() between fdget() and fdput(); - limit the event type to PERF_TYPE_RAW and PERF_TYPE_HARDWARE; - Only read the event counter on current CPU or on current process; - add new map type BPF_MAP_TYPE_PERF_EVENT_ARRAY to store the pointer to the struct perf_event; - according to the perf_event_map_fd and key, the function bpf_perf_event_read() can get the Hardware PMU counter value; Patch 5/5 is a simple example and shows how to use this new ability of eBPF programs. The PMU counter data can be found in /sys/kernel/debug/tracing/trace (trace_pipe) (the cycles PMU value when sampling 'kprobe/sys_write'). $ cat /sys/kernel/debug/tracing/trace_pipe $ ./tracex6 ... 
syslog-ng-548 [000] d..1 76.905673: : CPU-0 681765271 syslog-ng-548 [000] d..1 76.905690: : CPU-0 681787855 syslog-ng-548 [000] d..1 76.905707: : CPU-0 681810504 syslog-ng-548 [000] d..1 76.905725: : CPU-0 681834771 syslog-ng-548 [000] d..1 76.905745: : CPU-0 681859519 syslog-ng-548 [000] d..1 76.905766: : CPU-0 681890419 syslog-ng-548 [000] d..1 76.905783: : CPU-0 681914045 syslog-ng-548 [000] d..1 76.905800: : CPU-0 681935950 syslog-ng-548 [000] d..1 76.905816: : CPU-0 681958299 ls-690 [005] d..1 82.241308: : CPU-5 3138451 sh-691 [004] d..1 82.244570: : CPU-4 7324988 <...>-699 [007] d..1 99.961387: : CPU-7 3194027 <...>-695 [003] d..1 99.961474: : CPU-3 288901 <...>-695 [003] d..1 99.961541: : CPU-3 383145 <...>-695 [003] d..1 99.961591: : CPU-3 450365 <...>-695 [003] d..1 99.961639: : CPU-3 515751 <...>-695 [003] d..1 99.961686: : CPU-3 579047 ... The details of the patches are as follows: Patch 1/5 adds the necessary core perf APIs perf_event_attrs(), perf_event_get(), perf_event_read_local() when accessing event counters in eBPF programs; Patch 2/5 rewrites part of the bpf_prog_array map code and makes it more generic; Patch 3/5 introduces a new bpf map type. This map only stores the pointer to struct perf_event; Patch 4/5 implements function bpf_perf_event_read() that gets the selected hardware PMU counter; Patch 5/5 gives a simple example. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents f1d5ca4 + 47efb30 commit d74a790

File tree

14 files changed

+373
-53
lines changed

14 files changed

+373
-53
lines changed

arch/x86/net/bpf_jit_comp.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ static void emit_prologue(u8 **pprog)
246246
* goto out;
247247
* if (++tail_call_cnt > MAX_TAIL_CALL_CNT)
248248
* goto out;
249-
* prog = array->prog[index];
249+
* prog = array->ptrs[index];
250250
* if (prog == NULL)
251251
* goto out;
252252
* goto *(prog->bpf_func + prologue_size);
@@ -284,9 +284,9 @@ static void emit_bpf_tail_call(u8 **pprog)
284284
EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
285285
EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */
286286

287-
/* prog = array->prog[index]; */
287+
/* prog = array->ptrs[index]; */
288288
EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */
289-
offsetof(struct bpf_array, prog));
289+
offsetof(struct bpf_array, ptrs));
290290
EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */
291291

292292
/* if (prog == NULL)

include/linux/bpf.h

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include <uapi/linux/bpf.h>
1111
#include <linux/workqueue.h>
1212
#include <linux/file.h>
13+
#include <linux/perf_event.h>
1314

1415
struct bpf_map;
1516

@@ -24,6 +25,10 @@ struct bpf_map_ops {
2425
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
2526
int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
2627
int (*map_delete_elem)(struct bpf_map *map, void *key);
28+
29+
/* funcs called by prog_array and perf_event_array map */
30+
void *(*map_fd_get_ptr) (struct bpf_map *map, int fd);
31+
void (*map_fd_put_ptr) (void *ptr);
2732
};
2833

2934
struct bpf_map {
@@ -142,13 +147,13 @@ struct bpf_array {
142147
bool owner_jited;
143148
union {
144149
char value[0] __aligned(8);
145-
struct bpf_prog *prog[0] __aligned(8);
150+
void *ptrs[0] __aligned(8);
146151
};
147152
};
148153
#define MAX_TAIL_CALL_CNT 32
149154

150155
u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5);
151-
void bpf_prog_array_map_clear(struct bpf_map *map);
156+
void bpf_fd_array_map_clear(struct bpf_map *map);
152157
bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp);
153158
const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
154159

@@ -185,6 +190,7 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
185190
extern const struct bpf_func_proto bpf_map_update_elem_proto;
186191
extern const struct bpf_func_proto bpf_map_delete_elem_proto;
187192

193+
extern const struct bpf_func_proto bpf_perf_event_read_proto;
188194
extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
189195
extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
190196
extern const struct bpf_func_proto bpf_tail_call_proto;

include/linux/perf_event.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,8 @@ extern int perf_event_init_task(struct task_struct *child);
641641
extern void perf_event_exit_task(struct task_struct *child);
642642
extern void perf_event_free_task(struct task_struct *task);
643643
extern void perf_event_delayed_put(struct task_struct *task);
644+
extern struct perf_event *perf_event_get(unsigned int fd);
645+
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
644646
extern void perf_event_print_debug(void);
645647
extern void perf_pmu_disable(struct pmu *pmu);
646648
extern void perf_pmu_enable(struct pmu *pmu);
@@ -659,6 +661,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr,
659661
void *context);
660662
extern void perf_pmu_migrate_context(struct pmu *pmu,
661663
int src_cpu, int dst_cpu);
664+
extern u64 perf_event_read_local(struct perf_event *event);
662665
extern u64 perf_event_read_value(struct perf_event *event,
663666
u64 *enabled, u64 *running);
664667

@@ -979,6 +982,12 @@ static inline int perf_event_init_task(struct task_struct *child) { return 0; }
979982
static inline void perf_event_exit_task(struct task_struct *child) { }
980983
static inline void perf_event_free_task(struct task_struct *task) { }
981984
static inline void perf_event_delayed_put(struct task_struct *task) { }
985+
static inline struct perf_event *perf_event_get(unsigned int fd) { return ERR_PTR(-EINVAL); }
986+
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
987+
{
988+
return ERR_PTR(-EINVAL);
989+
}
990+
static inline u64 perf_event_read_local(struct perf_event *event) { return -EINVAL; }
982991
static inline void perf_event_print_debug(void) { }
983992
static inline int perf_event_task_disable(void) { return -EINVAL; }
984993
static inline int perf_event_task_enable(void) { return -EINVAL; }
@@ -1011,6 +1020,7 @@ static inline void perf_event_enable(struct perf_event *event) { }
10111020
static inline void perf_event_disable(struct perf_event *event) { }
10121021
static inline int __perf_event_disable(void *info) { return -1; }
10131022
static inline void perf_event_task_tick(void) { }
1023+
static inline int perf_event_release_kernel(struct perf_event *event) { return 0; }
10141024
#endif
10151025

10161026
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_NO_HZ_FULL)

include/uapi/linux/bpf.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ enum bpf_map_type {
114114
BPF_MAP_TYPE_HASH,
115115
BPF_MAP_TYPE_ARRAY,
116116
BPF_MAP_TYPE_PROG_ARRAY,
117+
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
117118
};
118119

119120
enum bpf_prog_type {
@@ -270,6 +271,7 @@ enum bpf_func_id {
270271
*/
271272
BPF_FUNC_skb_get_tunnel_key,
272273
BPF_FUNC_skb_set_tunnel_key,
274+
BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */
273275
__BPF_FUNC_MAX_ID,
274276
};
275277

kernel/bpf/arraymap.c

Lines changed: 106 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
150150
}
151151
late_initcall(register_array_map);
152152

153-
static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
153+
static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
154154
{
155-
/* only bpf_prog file descriptors can be stored in prog_array map */
155+
/* only file descriptors can be stored in this type of map */
156156
if (attr->value_size != sizeof(u32))
157157
return ERR_PTR(-EINVAL);
158158
return array_map_alloc(attr);
159159
}
160160

161-
static void prog_array_map_free(struct bpf_map *map)
161+
static void fd_array_map_free(struct bpf_map *map)
162162
{
163163
struct bpf_array *array = container_of(map, struct bpf_array, map);
164164
int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
167167

168168
/* make sure it's empty */
169169
for (i = 0; i < array->map.max_entries; i++)
170-
BUG_ON(array->prog[i] != NULL);
170+
BUG_ON(array->ptrs[i] != NULL);
171171
kvfree(array);
172172
}
173173

174-
static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key)
174+
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
175175
{
176176
return NULL;
177177
}
178178

179179
/* only called from syscall */
180-
static int prog_array_map_update_elem(struct bpf_map *map, void *key,
181-
void *value, u64 map_flags)
180+
static int fd_array_map_update_elem(struct bpf_map *map, void *key,
181+
void *value, u64 map_flags)
182182
{
183183
struct bpf_array *array = container_of(map, struct bpf_array, map);
184-
struct bpf_prog *prog, *old_prog;
184+
void *new_ptr, *old_ptr;
185185
u32 index = *(u32 *)key, ufd;
186186

187187
if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
191191
return -E2BIG;
192192

193193
ufd = *(u32 *)value;
194-
prog = bpf_prog_get(ufd);
195-
if (IS_ERR(prog))
196-
return PTR_ERR(prog);
197-
198-
if (!bpf_prog_array_compatible(array, prog)) {
199-
bpf_prog_put(prog);
200-
return -EINVAL;
201-
}
194+
new_ptr = map->ops->map_fd_get_ptr(map, ufd);
195+
if (IS_ERR(new_ptr))
196+
return PTR_ERR(new_ptr);
202197

203-
old_prog = xchg(array->prog + index, prog);
204-
if (old_prog)
205-
bpf_prog_put_rcu(old_prog);
198+
old_ptr = xchg(array->ptrs + index, new_ptr);
199+
if (old_ptr)
200+
map->ops->map_fd_put_ptr(old_ptr);
206201

207202
return 0;
208203
}
209204

210-
static int prog_array_map_delete_elem(struct bpf_map *map, void *key)
205+
static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
211206
{
212207
struct bpf_array *array = container_of(map, struct bpf_array, map);
213-
struct bpf_prog *old_prog;
208+
void *old_ptr;
214209
u32 index = *(u32 *)key;
215210

216211
if (index >= array->map.max_entries)
217212
return -E2BIG;
218213

219-
old_prog = xchg(array->prog + index, NULL);
220-
if (old_prog) {
221-
bpf_prog_put_rcu(old_prog);
214+
old_ptr = xchg(array->ptrs + index, NULL);
215+
if (old_ptr) {
216+
map->ops->map_fd_put_ptr(old_ptr);
222217
return 0;
223218
} else {
224219
return -ENOENT;
225220
}
226221
}
227222

223+
static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
224+
{
225+
struct bpf_array *array = container_of(map, struct bpf_array, map);
226+
struct bpf_prog *prog = bpf_prog_get(fd);
227+
if (IS_ERR(prog))
228+
return prog;
229+
230+
if (!bpf_prog_array_compatible(array, prog)) {
231+
bpf_prog_put(prog);
232+
return ERR_PTR(-EINVAL);
233+
}
234+
return prog;
235+
}
236+
237+
static void prog_fd_array_put_ptr(void *ptr)
238+
{
239+
struct bpf_prog *prog = ptr;
240+
241+
bpf_prog_put_rcu(prog);
242+
}
243+
228244
/* decrement refcnt of all bpf_progs that are stored in this map */
229-
void bpf_prog_array_map_clear(struct bpf_map *map)
245+
void bpf_fd_array_map_clear(struct bpf_map *map)
230246
{
231247
struct bpf_array *array = container_of(map, struct bpf_array, map);
232248
int i;
233249

234250
for (i = 0; i < array->map.max_entries; i++)
235-
prog_array_map_delete_elem(map, &i);
251+
fd_array_map_delete_elem(map, &i);
236252
}
237253

238254
static const struct bpf_map_ops prog_array_ops = {
239-
.map_alloc = prog_array_map_alloc,
240-
.map_free = prog_array_map_free,
255+
.map_alloc = fd_array_map_alloc,
256+
.map_free = fd_array_map_free,
241257
.map_get_next_key = array_map_get_next_key,
242-
.map_lookup_elem = prog_array_map_lookup_elem,
243-
.map_update_elem = prog_array_map_update_elem,
244-
.map_delete_elem = prog_array_map_delete_elem,
258+
.map_lookup_elem = fd_array_map_lookup_elem,
259+
.map_update_elem = fd_array_map_update_elem,
260+
.map_delete_elem = fd_array_map_delete_elem,
261+
.map_fd_get_ptr = prog_fd_array_get_ptr,
262+
.map_fd_put_ptr = prog_fd_array_put_ptr,
245263
};
246264

247265
static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
255273
return 0;
256274
}
257275
late_initcall(register_prog_array_map);
276+
277+
static void perf_event_array_map_free(struct bpf_map *map)
278+
{
279+
bpf_fd_array_map_clear(map);
280+
fd_array_map_free(map);
281+
}
282+
283+
static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
284+
{
285+
struct perf_event *event;
286+
const struct perf_event_attr *attr;
287+
288+
event = perf_event_get(fd);
289+
if (IS_ERR(event))
290+
return event;
291+
292+
attr = perf_event_attrs(event);
293+
if (IS_ERR(attr))
294+
return (void *)attr;
295+
296+
if (attr->type != PERF_TYPE_RAW &&
297+
attr->type != PERF_TYPE_HARDWARE) {
298+
perf_event_release_kernel(event);
299+
return ERR_PTR(-EINVAL);
300+
}
301+
return event;
302+
}
303+
304+
static void perf_event_fd_array_put_ptr(void *ptr)
305+
{
306+
struct perf_event *event = ptr;
307+
308+
perf_event_release_kernel(event);
309+
}
310+
311+
static const struct bpf_map_ops perf_event_array_ops = {
312+
.map_alloc = fd_array_map_alloc,
313+
.map_free = perf_event_array_map_free,
314+
.map_get_next_key = array_map_get_next_key,
315+
.map_lookup_elem = fd_array_map_lookup_elem,
316+
.map_update_elem = fd_array_map_update_elem,
317+
.map_delete_elem = fd_array_map_delete_elem,
318+
.map_fd_get_ptr = perf_event_fd_array_get_ptr,
319+
.map_fd_put_ptr = perf_event_fd_array_put_ptr,
320+
};
321+
322+
static struct bpf_map_type_list perf_event_array_type __read_mostly = {
323+
.ops = &perf_event_array_ops,
324+
.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
325+
};
326+
327+
static int __init register_perf_event_array_map(void)
328+
{
329+
bpf_register_map_type(&perf_event_array_type);
330+
return 0;
331+
}
332+
late_initcall(register_perf_event_array_map);

kernel/bpf/core.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,7 +450,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
450450

451451
tail_call_cnt++;
452452

453-
prog = READ_ONCE(array->prog[index]);
453+
prog = READ_ONCE(array->ptrs[index]);
454454
if (unlikely(!prog))
455455
goto out;
456456

kernel/bpf/syscall.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
7272
/* prog_array stores refcnt-ed bpf_prog pointers
7373
* release them all when user space closes prog_array_fd
7474
*/
75-
bpf_prog_array_map_clear(map);
75+
bpf_fd_array_map_clear(map);
7676

7777
bpf_map_put(map);
7878
return 0;

0 commit comments

Comments
 (0)