Skip to content

Commit 4e6430c

Browse files
AGSaidi authored and acmel committed
perf arm-spe: Use SPE data source for neoverse cores
When synthesizing data from SPE, augment the type with source information for Arm Neoverse cores. The field is IMPLDEF but the Neoverse cores all use the same encoding. I can't find encoding information for any other SPE implementations to unify their choices with Arm's, thus that is left for future work.

This change populates the mem_lvl_num for Neoverse cores as well as the deprecated mem_lvl namespace.

Reviewed-by: German Gomez <[email protected]>
Reviewed-by: Leo Yan <[email protected]>
Signed-off-by: Ali Saidi <[email protected]>
Tested-by: Leo Yan <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Alexander Shishkin <[email protected]>
Cc: Anshuman Khandual <[email protected]>
Cc: Gustavo A. R. Silva <[email protected]>
Cc: Ian Rogers <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: James Clark <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: John Garry <[email protected]>
Cc: Kajol Jain <[email protected]>
Cc: Like Xu <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Mike Leach <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Timothy Hayes <[email protected]>
Cc: Will Deacon <[email protected]>
Cc: [email protected]
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Leo Yan <[email protected]>
Signed-off-by: Arnaldo Carvalho de Melo <[email protected]>
1 parent f78d625 commit 4e6430c

File tree

3 files changed

+127
-16
lines changed

3 files changed

+127
-16
lines changed

tools/perf/util/arm-spe-decoder/arm-spe-decoder.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -220,6 +220,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder)
220220

221221
break;
222222
case ARM_SPE_DATA_SOURCE:
223+
decoder->record.source = payload;
223224
break;
224225
case ARM_SPE_BAD:
225226
break;

tools/perf/util/arm-spe-decoder/arm-spe-decoder.h

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,17 @@ enum arm_spe_op_type {
2929
ARM_SPE_ST = 1 << 1,
3030
};
3131

32+
enum arm_spe_neoverse_data_source {
33+
ARM_SPE_NV_L1D = 0x0,
34+
ARM_SPE_NV_L2 = 0x8,
35+
ARM_SPE_NV_PEER_CORE = 0x9,
36+
ARM_SPE_NV_LOCAL_CLUSTER = 0xa,
37+
ARM_SPE_NV_SYS_CACHE = 0xb,
38+
ARM_SPE_NV_PEER_CLUSTER = 0xc,
39+
ARM_SPE_NV_REMOTE = 0xd,
40+
ARM_SPE_NV_DRAM = 0xe,
41+
};
42+
3243
struct arm_spe_record {
3344
enum arm_spe_sample_type type;
3445
int err;
@@ -40,6 +51,7 @@ struct arm_spe_record {
4051
u64 virt_addr;
4152
u64 phys_addr;
4253
u64 context_id;
54+
u16 source;
4355
};
4456

4557
struct arm_spe_insn;

tools/perf/util/arm-spe.c

Lines changed: 114 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434
#include "arm-spe-decoder/arm-spe-decoder.h"
3535
#include "arm-spe-decoder/arm-spe-pkt-decoder.h"
3636

37+
#include "../../arch/arm64/include/asm/cputype.h"
3738
#define MAX_TIMESTAMP (~0ULL)
3839

3940
struct arm_spe {
@@ -45,6 +46,7 @@ struct arm_spe {
4546
struct perf_session *session;
4647
struct machine *machine;
4748
u32 pmu_type;
49+
u64 midr;
4850

4951
struct perf_tsc_conversion tc;
5052

@@ -387,35 +389,128 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq,
387389
return arm_spe_deliver_synth_event(spe, speq, event, &sample);
388390
}
389391

390-
static u64 arm_spe__synth_data_source(const struct arm_spe_record *record)
392+
static const struct midr_range neoverse_spe[] = {
393+
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
394+
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
395+
MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
396+
{},
397+
};
398+
399+
static void arm_spe__synth_data_source_neoverse(const struct arm_spe_record *record,
400+
union perf_mem_data_src *data_src)
391401
{
392-
union perf_mem_data_src data_src = { 0 };
402+
/*
403+
* Even though four levels of cache hierarchy are possible, no known
404+
* production Neoverse systems currently include more than three levels
405+
* so for the time being we assume three exist. If a production system
406+
* is built with four then this function would have to be changed to
407+
* detect the number of levels for reporting.
408+
*/
393409

394-
if (record->op == ARM_SPE_LD)
395-
data_src.mem_op = PERF_MEM_OP_LOAD;
396-
else if (record->op == ARM_SPE_ST)
397-
data_src.mem_op = PERF_MEM_OP_STORE;
398-
else
399-
return 0;
410+
/*
411+
* We have no data on the hit level or data source for stores in the
412+
* Neoverse SPE records.
413+
*/
414+
if (record->op & ARM_SPE_ST) {
415+
data_src->mem_lvl = PERF_MEM_LVL_NA;
416+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA;
417+
data_src->mem_snoop = PERF_MEM_SNOOP_NA;
418+
return;
419+
}
420+
421+
switch (record->source) {
422+
case ARM_SPE_NV_L1D:
423+
data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT;
424+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1;
425+
data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
426+
break;
427+
case ARM_SPE_NV_L2:
428+
data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
429+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
430+
data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
431+
break;
432+
case ARM_SPE_NV_PEER_CORE:
433+
data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT;
434+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2;
435+
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
436+
break;
437+
/*
438+
* We don't know if this is L1, L2 but we do know it was a cache-2-cache
439+
* transfer, so set SNOOPX_PEER
440+
*/
441+
case ARM_SPE_NV_LOCAL_CLUSTER:
442+
case ARM_SPE_NV_PEER_CLUSTER:
443+
data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
444+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
445+
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
446+
break;
447+
/*
448+
* System cache is assumed to be L3
449+
*/
450+
case ARM_SPE_NV_SYS_CACHE:
451+
data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT;
452+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3;
453+
data_src->mem_snoop = PERF_MEM_SNOOP_HIT;
454+
break;
455+
/*
456+
* We don't know what level it hit in, except it came from the other
457+
* socket
458+
*/
459+
case ARM_SPE_NV_REMOTE:
460+
data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1;
461+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE;
462+
data_src->mem_remote = PERF_MEM_REMOTE_REMOTE;
463+
data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER;
464+
break;
465+
case ARM_SPE_NV_DRAM:
466+
data_src->mem_lvl = PERF_MEM_LVL_LOC_RAM | PERF_MEM_LVL_HIT;
467+
data_src->mem_lvl_num = PERF_MEM_LVLNUM_RAM;
468+
data_src->mem_snoop = PERF_MEM_SNOOP_NONE;
469+
break;
470+
default:
471+
break;
472+
}
473+
}
400474

475+
static void arm_spe__synth_data_source_generic(const struct arm_spe_record *record,
476+
union perf_mem_data_src *data_src)
477+
{
401478
if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) {
402-
data_src.mem_lvl = PERF_MEM_LVL_L3;
479+
data_src->mem_lvl = PERF_MEM_LVL_L3;
403480

404481
if (record->type & ARM_SPE_LLC_MISS)
405-
data_src.mem_lvl |= PERF_MEM_LVL_MISS;
482+
data_src->mem_lvl |= PERF_MEM_LVL_MISS;
406483
else
407-
data_src.mem_lvl |= PERF_MEM_LVL_HIT;
484+
data_src->mem_lvl |= PERF_MEM_LVL_HIT;
408485
} else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) {
409-
data_src.mem_lvl = PERF_MEM_LVL_L1;
486+
data_src->mem_lvl = PERF_MEM_LVL_L1;
410487

411488
if (record->type & ARM_SPE_L1D_MISS)
412-
data_src.mem_lvl |= PERF_MEM_LVL_MISS;
489+
data_src->mem_lvl |= PERF_MEM_LVL_MISS;
413490
else
414-
data_src.mem_lvl |= PERF_MEM_LVL_HIT;
491+
data_src->mem_lvl |= PERF_MEM_LVL_HIT;
415492
}
416493

417494
if (record->type & ARM_SPE_REMOTE_ACCESS)
418-
data_src.mem_lvl |= PERF_MEM_LVL_REM_CCE1;
495+
data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1;
496+
}
497+
498+
static u64 arm_spe__synth_data_source(const struct arm_spe_record *record, u64 midr)
499+
{
500+
union perf_mem_data_src data_src = { 0 };
501+
bool is_neoverse = is_midr_in_range(midr, neoverse_spe);
502+
503+
if (record->op == ARM_SPE_LD)
504+
data_src.mem_op = PERF_MEM_OP_LOAD;
505+
else if (record->op == ARM_SPE_ST)
506+
data_src.mem_op = PERF_MEM_OP_STORE;
507+
else
508+
return 0;
509+
510+
if (is_neoverse)
511+
arm_spe__synth_data_source_neoverse(record, &data_src);
512+
else
513+
arm_spe__synth_data_source_generic(record, &data_src);
419514

420515
if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) {
421516
data_src.mem_dtlb = PERF_MEM_TLB_WK;
@@ -436,7 +531,7 @@ static int arm_spe_sample(struct arm_spe_queue *speq)
436531
u64 data_src;
437532
int err;
438533

439-
data_src = arm_spe__synth_data_source(record);
534+
data_src = arm_spe__synth_data_source(record, spe->midr);
440535

441536
if (spe->sample_flc) {
442537
if (record->type & ARM_SPE_L1D_MISS) {
@@ -1178,6 +1273,8 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
11781273
struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info;
11791274
size_t min_sz = sizeof(u64) * ARM_SPE_AUXTRACE_PRIV_MAX;
11801275
struct perf_record_time_conv *tc = &session->time_conv;
1276+
const char *cpuid = perf_env__cpuid(session->evlist->env);
1277+
u64 midr = strtol(cpuid, NULL, 16);
11811278
struct arm_spe *spe;
11821279
int err;
11831280

@@ -1197,6 +1294,7 @@ int arm_spe_process_auxtrace_info(union perf_event *event,
11971294
spe->machine = &session->machines.host; /* No kvm support */
11981295
spe->auxtrace_type = auxtrace_info->type;
11991296
spe->pmu_type = auxtrace_info->priv[ARM_SPE_PMU_TYPE];
1297+
spe->midr = midr;
12001298

12011299
spe->timeless_decoding = arm_spe__is_timeless_decoding(spe);
12021300

0 commit comments

Comments
 (0)