Skip to content

Commit 688eb81

Browse files
goldsteinnhansendc
authored and committed
x86/csum: Improve performance of csum_partial
1) Add special case for len == 40 as that is the hottest value. The nets a ~8-9% latency improvement and a ~30% throughput improvement in the len == 40 case. 2) Use multiple accumulators in the 64-byte loop. This dramatically improves ILP and results in up to a 40% latency/throughput improvement (better for more iterations). Results from benchmarking on Icelake. Times measured with rdtsc() len lat_new lat_old r tput_new tput_old r 8 3.58 3.47 1.032 3.58 3.51 1.021 16 4.14 4.02 1.028 3.96 3.78 1.046 24 4.99 5.03 0.992 4.23 4.03 1.050 32 5.09 5.08 1.001 4.68 4.47 1.048 40 5.57 6.08 0.916 3.05 4.43 0.690 48 6.65 6.63 1.003 4.97 4.69 1.059 56 7.74 7.72 1.003 5.22 4.95 1.055 64 6.65 7.22 0.921 6.38 6.42 0.994 96 9.43 9.96 0.946 7.46 7.54 0.990 128 9.39 12.15 0.773 8.90 8.79 1.012 200 12.65 18.08 0.699 11.63 11.60 1.002 272 15.82 23.37 0.677 14.43 14.35 1.005 440 24.12 36.43 0.662 21.57 22.69 0.951 952 46.20 74.01 0.624 42.98 53.12 0.809 1024 47.12 78.24 0.602 46.36 58.83 0.788 1552 72.01 117.30 0.614 71.92 96.78 0.743 2048 93.07 153.25 0.607 93.28 137.20 0.680 2600 114.73 194.30 0.590 114.28 179.32 0.637 3608 156.34 268.41 0.582 154.97 254.02 0.610 4096 175.01 304.03 0.576 175.89 292.08 0.602 There is no such thing as a free lunch, however, and the special case for len == 40 does add overhead to the len != 40 cases. This seems to amount to be ~5% throughput and slightly less in terms of latency. Testing: Part of this change is a new kunit test. The tests check all alignment X length pairs in [0, 64) X [0, 512). There are three cases. 1) Precomputed random inputs/seed. The expected results where generated use the generic implementation (which is assumed to be non-buggy). 2) An input of all 1s. The goal of this test is to catch any case a carry is missing. 3) An input that never carries. The goal of this test si to catch any case of incorrectly carrying. 
More exhaustive tests that test all alignment X length pairs in [0, 8192) X [0, 8192] on random data are also available here: https://github.com/goldsteinn/csum-reproduction The repository also has the code for reproducing the above benchmark numbers. Signed-off-by: Noah Goldstein <[email protected]> Signed-off-by: Dave Hansen <[email protected]> Link: https://lore.kernel.org/all/20230511011002.935690-1-goldstein.w.n%40gmail.com
1 parent b2ad431 commit 688eb81

File tree

4 files changed

+417
-32
lines changed

4 files changed

+417
-32
lines changed

arch/x86/lib/csum-partial_64.c

Lines changed: 65 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,32 @@
55
* This file contains network checksum routines that are better done
66
* in an architecture-specific manner due to speed.
77
*/
8-
8+
99
#include <linux/compiler.h>
1010
#include <linux/export.h>
1111
#include <asm/checksum.h>
1212
#include <asm/word-at-a-time.h>
1313

14-
/*
 * Fold a 32-bit checksum accumulator down to 16 bits.
 *
 * Adds the high half-word of @a into the low half-word and then
 * re-adds the carry ("adcw $0"), the end-around carry required by the
 * ones'-complement Internet checksum.
 */
static inline unsigned short from32to16(unsigned a)
{
	unsigned short b = a >> 16;
	asm("addw %w2,%w0\n\t"
	    "adcw $0,%w0\n"
	    : "=r" (b)
	    : "0" (b), "r" (a));
	return b;
}
2323

24+
/*
 * Finish a partial checksum: fold the 64-bit accumulator @temp64 to
 * 32 bits and, if the buffer started on an odd address (@odd), swap
 * the result's bytes so it matches the even-aligned checksum.
 *
 * NOTE(review): the incoming value of @result is never read -- it is
 * overwritten immediately below. The parameter appears to exist only
 * to mirror the caller's local; confirm before relying on it.
 */
static inline __wsum csum_tail(unsigned int result, u64 temp64, int odd)
{
	/* 64->32 fold with end-around carry. */
	result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
	if (unlikely(odd)) {
		/* Odd start address: fold to 16 bits and byte-swap to
		 * undo the one-byte rotation introduced by the caller's
		 * alignment fixup.
		 */
		result = from32to16(result);
		result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
	}
	return (__force __wsum)result;
}
33+
2434
/*
2535
* Do a checksum on an arbitrary memory area.
2636
* Returns a 32bit checksum.
@@ -47,21 +57,52 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
4757
buff++;
4858
}
4959

50-
while (unlikely(len >= 64)) {
60+
/*
61+
* len == 40 is the hot case due to IPv6 headers, but annotating it likely()
62+
* has noticeable negative affect on codegen for all other cases with
63+
* minimal performance benefit here.
64+
*/
65+
if (len == 40) {
5166
asm("addq 0*8(%[src]),%[res]\n\t"
5267
"adcq 1*8(%[src]),%[res]\n\t"
5368
"adcq 2*8(%[src]),%[res]\n\t"
5469
"adcq 3*8(%[src]),%[res]\n\t"
5570
"adcq 4*8(%[src]),%[res]\n\t"
56-
"adcq 5*8(%[src]),%[res]\n\t"
57-
"adcq 6*8(%[src]),%[res]\n\t"
58-
"adcq 7*8(%[src]),%[res]\n\t"
5971
"adcq $0,%[res]"
60-
: [res] "+r" (temp64)
61-
: [src] "r" (buff)
62-
: "memory");
63-
buff += 64;
64-
len -= 64;
72+
: [res] "+r"(temp64)
73+
: [src] "r"(buff), "m"(*(const char(*)[40])buff));
74+
return csum_tail(result, temp64, odd);
75+
}
76+
if (unlikely(len >= 64)) {
77+
/*
78+
* Extra accumulators for better ILP in the loop.
79+
*/
80+
u64 tmp_accum, tmp_carries;
81+
82+
asm("xorl %k[tmp_accum],%k[tmp_accum]\n\t"
83+
"xorl %k[tmp_carries],%k[tmp_carries]\n\t"
84+
"subl $64, %[len]\n\t"
85+
"1:\n\t"
86+
"addq 0*8(%[src]),%[res]\n\t"
87+
"adcq 1*8(%[src]),%[res]\n\t"
88+
"adcq 2*8(%[src]),%[res]\n\t"
89+
"adcq 3*8(%[src]),%[res]\n\t"
90+
"adcl $0,%k[tmp_carries]\n\t"
91+
"addq 4*8(%[src]),%[tmp_accum]\n\t"
92+
"adcq 5*8(%[src]),%[tmp_accum]\n\t"
93+
"adcq 6*8(%[src]),%[tmp_accum]\n\t"
94+
"adcq 7*8(%[src]),%[tmp_accum]\n\t"
95+
"adcl $0,%k[tmp_carries]\n\t"
96+
"addq $64, %[src]\n\t"
97+
"subl $64, %[len]\n\t"
98+
"jge 1b\n\t"
99+
"addq %[tmp_accum],%[res]\n\t"
100+
"adcq %[tmp_carries],%[res]\n\t"
101+
"adcq $0,%[res]"
102+
: [tmp_accum] "=&r"(tmp_accum),
103+
[tmp_carries] "=&r"(tmp_carries), [res] "+r"(temp64),
104+
[len] "+r"(len), [src] "+r"(buff)
105+
: "m"(*(const char *)buff));
65106
}
66107

67108
if (len & 32) {
@@ -70,45 +111,37 @@ __wsum csum_partial(const void *buff, int len, __wsum sum)
70111
"adcq 2*8(%[src]),%[res]\n\t"
71112
"adcq 3*8(%[src]),%[res]\n\t"
72113
"adcq $0,%[res]"
73-
: [res] "+r" (temp64)
74-
: [src] "r" (buff)
75-
: "memory");
114+
: [res] "+r"(temp64)
115+
: [src] "r"(buff), "m"(*(const char(*)[32])buff));
76116
buff += 32;
77117
}
78118
if (len & 16) {
79119
asm("addq 0*8(%[src]),%[res]\n\t"
80120
"adcq 1*8(%[src]),%[res]\n\t"
81121
"adcq $0,%[res]"
82-
: [res] "+r" (temp64)
83-
: [src] "r" (buff)
84-
: "memory");
122+
: [res] "+r"(temp64)
123+
: [src] "r"(buff), "m"(*(const char(*)[16])buff));
85124
buff += 16;
86125
}
87126
if (len & 8) {
88127
asm("addq 0*8(%[src]),%[res]\n\t"
89128
"adcq $0,%[res]"
90-
: [res] "+r" (temp64)
91-
: [src] "r" (buff)
92-
: "memory");
129+
: [res] "+r"(temp64)
130+
: [src] "r"(buff), "m"(*(const char(*)[8])buff));
93131
buff += 8;
94132
}
95133
if (len & 7) {
96-
unsigned int shift = (8 - (len & 7)) * 8;
134+
unsigned int shift = (-len << 3) & 63;
97135
unsigned long trail;
98136

99137
trail = (load_unaligned_zeropad(buff) << shift) >> shift;
100138

101139
asm("addq %[trail],%[res]\n\t"
102140
"adcq $0,%[res]"
103-
: [res] "+r" (temp64)
104-
: [trail] "r" (trail));
141+
: [res] "+r"(temp64)
142+
: [trail] "r"(trail));
105143
}
106-
result = add32_with_carry(temp64 >> 32, temp64 & 0xffffffff);
107-
if (unlikely(odd)) {
108-
result = from32to16(result);
109-
result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
110-
}
111-
return (__force __wsum)result;
144+
return csum_tail(result, temp64, odd);
112145
}
113146
EXPORT_SYMBOL(csum_partial);
114147

@@ -118,6 +151,6 @@ EXPORT_SYMBOL(csum_partial);
118151
*/
119152
__sum16 ip_compute_csum(const void *buff, int len)
120153
{
121-
return csum_fold(csum_partial(buff,len,0));
154+
return csum_fold(csum_partial(buff, len, 0));
122155
}
123156
EXPORT_SYMBOL(ip_compute_csum);

lib/Kconfig.debug

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2453,6 +2453,23 @@ config BITFIELD_KUNIT
24532453

24542454
If unsure, say N.
24552455

2456+
config CHECKSUM_KUNIT
2457+
tristate "KUnit test checksum functions at runtime" if !KUNIT_ALL_TESTS
2458+
depends on KUNIT
2459+
default KUNIT_ALL_TESTS
2460+
help
2461+
Enable this option to test the checksum functions at boot.
2462+
2463+
KUnit tests run during boot and output the results to the debug log
2464+
in TAP format (http://testanything.org/). Only useful for kernel devs
2465+
running the KUnit test harness, and not intended for inclusion into a
2466+
production build.
2467+
2468+
For more information on KUnit and unit tests in general please refer
2469+
to the KUnit documentation in Documentation/dev-tools/kunit/.
2470+
2471+
If unsure, say N.
2472+
24562473
config HASH_KUNIT_TEST
24572474
tristate "KUnit Test for integer hash functions" if !KUNIT_ALL_TESTS
24582475
depends on KUNIT

lib/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,7 @@ obj-$(CONFIG_PLDMFW) += pldmfw/
377377
# KUnit tests
378378
CFLAGS_bitfield_kunit.o := $(DISABLE_STRUCTLEAK_PLUGIN)
379379
obj-$(CONFIG_BITFIELD_KUNIT) += bitfield_kunit.o
380+
obj-$(CONFIG_CHECKSUM_KUNIT) += checksum_kunit.o
380381
obj-$(CONFIG_LIST_KUNIT_TEST) += list-test.o
381382
obj-$(CONFIG_HASHTABLE_KUNIT_TEST) += hashtable_test.o
382383
obj-$(CONFIG_LINEAR_RANGES_TEST) += test_linear_ranges.o

0 commit comments

Comments
 (0)