From ac889d94cd37127e67572dd1b2cb3fc7aa19c4b8 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 28 Mar 2025 08:10:44 -0700 Subject: [PATCH 01/23] add usleep option --- dto.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/dto.c b/dto.c index b7a3a1c..605b59b 100644 --- a/dto.c +++ b/dto.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -79,7 +80,8 @@ struct dto_device { enum wait_options { WAIT_BUSYPOLL = 0, WAIT_UMWAIT, - WAIT_YIELD + WAIT_YIELD, + WAIT_SLEEP }; enum numa_aware { @@ -171,6 +173,7 @@ static const char * const wait_names[] = { [WAIT_BUSYPOLL] = "busypoll", [WAIT_UMWAIT] = "umwait", [WAIT_YIELD] = "yield", + [WAIT_SLEEP] = "sleep" }; static int collect_stats; @@ -387,14 +390,30 @@ static __always_inline void __dsa_wait(const volatile uint8_t *comp) static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) { - if (wait_method == WAIT_YIELD) - dsa_wait_yield(comp); - else if (wait_method == WAIT_UMWAIT) - dsa_wait_umwait(comp); - else - dsa_wait_busy_poll(comp); + switch (wait_method) { + case WAIT_YIELD: + dsa_wait_yield(comp); + break; + case WAIT_UMWAIT: + dsa_wait_umwait(comp); + break; + case WAIT_BUSYPOLL: + dsa_wait_busy_poll(comp); + break; + case WAIT_SLEEP: + // This method is not typically used in high-performance scenarios, + // but included for completeness. It can be implemented with a sleep. + // For example, using usleep or sleep for a short duration. + // This is a placeholder for actual sleep implementation. + do { + usleep(20); // Sleep for 20 microseconds + } while (*comp == 0); + default: + dsa_wait_busy_poll(comp); + } } + /* A simple auto-tuning heuristic. * Goal of the Heuristic: * - CPU and DSA should complete their fraction of the job roughly simultaneously. 
From 91105d38791ef7f4e8fefa85f79d2e9ec4eec448 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 28 Mar 2025 08:12:10 -0700 Subject: [PATCH 02/23] accel config script --- accelConfig.sh | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100755 accelConfig.sh diff --git a/accelConfig.sh b/accelConfig.sh new file mode 100755 index 0000000..962f83f --- /dev/null +++ b/accelConfig.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +echo "OPTIONAL Arg-1: DSA device id. Default: 0" +echo "OPTIONAL Arg-2: Enable/Disable DSA device. Default: yes" +echo "OPTIONAL Arg-3: SHARED WQ id. Default: 1" +echo "OPTIONAL Arg-4: ENGINE count. Default: 4" + +if [ "$#" -ge 5 ]; then + echo "ERROR: Incorrect argument count. Expected arg count <= 4" + exit 1 +fi + +DEVID=${1:-0} +ENABLE=${2:-yes} +SWQID=${3:-1} +NENGS=${4:-4} + +DEV=dsa${DEVID} +SWQ=${DEV}/wq${DEVID}.${SWQID} + +echo "=> ${SWQ}:" +accel-config disable-wq ${SWQ} + +echo "=> ${DEV}:" +accel-config disable-device ${DEV} + +if [ "${ENABLE}" != "yes" ]; then + echo "Exit after disabling ${DEV}." 
+ exit 1 +fi + +for ((i=0; i < ${NENGS}; i++)) +do + echo "=> ${DEV}/engine${DEVID}.${i}" + echo "configured" + accel-config config-engine ${DEV}/engine${DEVID}.${i} --group-id=0 +done + +accel-config config-wq ${SWQ} --group-id=0 +accel-config config-wq ${SWQ} --priority=1 +accel-config config-wq ${SWQ} --wq-size=128 +accel-config config-wq ${SWQ} --max-batch-size=1024 +accel-config config-wq ${SWQ} --max-transfer-size=2147483648 +accel-config config-wq ${SWQ} --block-on-fault=0 +accel-config config-wq ${SWQ} --type=user +accel-config config-wq ${SWQ} --name="dsa-test" +accel-config config-wq ${SWQ} --mode=shared +accel-config config-wq ${SWQ} --threshold=127 +accel-config config-wq ${SWQ} --driver-name="user" +#accel-config config-group dsa0/group0.1 --traffic-class-a=0 +#accel-config config-group dsa0/group0.1 --traffic-class-b=0 + +echo "=> ${DEV}:" +accel-config enable-device ${DEV} + +echo "=> ${SWQ}:" +accel-config enable-wq ${SWQ} + From 4abdd0d524410c57eb3a97fd779b7e47e46aedcc Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 28 Mar 2025 08:12:28 -0700 Subject: [PATCH 03/23] cachelib usage --- CACHELIB_USAGE_GUIDE.md | 146 ++++++++++++++++++++++++++++++++++++++++ cachelib.patch | 55 +++++++++++++++ dto_cachelib_env.sh | 9 +++ 3 files changed, 210 insertions(+) create mode 100644 CACHELIB_USAGE_GUIDE.md create mode 100644 cachelib.patch create mode 100755 dto_cachelib_env.sh diff --git a/CACHELIB_USAGE_GUIDE.md b/CACHELIB_USAGE_GUIDE.md new file mode 100644 index 0000000..ddbd9b6 --- /dev/null +++ b/CACHELIB_USAGE_GUIDE.md @@ -0,0 +1,146 @@ +# Intel DSA Usage with CacheLib + +## Platform Configuration + +### BIOS Configuration +- Enable **Intel Virtualization Technology for Directed I/O (VT-d)**. 
+ +### Linux Kernel Configuration +Ensure the following kernel options are enabled: +```plaintext +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +CONFIG_INTEL_IOMMU_DEFAULT_ON=y +CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y +``` +If either `CONFIG_INTEL_IOMMU_DEFAULT_ON` or `CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON` is not enabled, add the following to the kernel boot parameters: +```plaintext +intel_iommu=on,sm_on +``` + +### Intel DSA Driver (IDXD) +Enable the following kernel options while building or installing the Linux kernel (version 5.18 or later is needed for shared work queues used by DTO): +```plaintext +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IDXD_SVM=y +CONFIG_INTEL_IDXD_PERFMON=y +``` + +### Work Queues (WQs) +- WQs are on-device queues that contain descriptors submitted to the device. +- They can be configured in two modes: + - **Dedicated (DWQ)** + - **Shared (SWQ)** +- **SWQ is preferred** as it allows multiple clients to submit descriptors simultaneously, improving device utilization compared to DWQs. +- **DTO uses SWQ**. + +### Verify IDXD Driver Initialization +Use the following command to check the kernel message buffer: +```sh +dmesg | grep "idxd " +``` + +## Initialize the DSA Device + +### List All DSA Devices +```sh +lspci | grep 0b25 +``` +### Initialize DSA device(s) +```sh +./accelConfig.sh +``` +To enable the first DSA device (assuming it is device 0) with a shared work queue and 4 engines, you would run: +```sh +./accelConfig.sh 0 +``` + +## Using the DTO library + +### Pre-requisite Packages +- **glibc version 2.36 or later** is recommended for best DTO performance. +- Check your glibc version: +```sh +ldd --version +``` +- Systems with glibc versions **less than 2.36** may experience reduced DTO performance due to a known bug. 
+ +#### Package Requirements +- **Fedora/CentOS/RHEL**: +```sh +sudo dnf install -y \ + kernel-headers \ + accel-config-devel \ + libuuid-devel +``` +- **Ubuntu/Debian**: +```sh +sudo apt-get install -y \ + linux-libc-dev \ + libaccel-config-dev \ + uuid-dev +``` + +### Build DTO Library +```sh +make libdto +sudo make install +``` + +### Build the Test Application +```sh +make dto-test +source dto_env.sh +./dto-test +``` + +## Using with CacheLib + +To connect to CacheLib (OSS version) using the DTO library, you will need to ensure that your application is linked against the `libdto` library. + +### Linking to CacheLib with build patch + +Cachebench needs a change for setStringItem to use 'std::memmove' instead of 'strncpy' as DTO does not intercept strncpy call. The patch also adds the build option for DTO in cmake. + +```sh +cd ~/CacheLib/ +git apply ~/DTO/cachelib.patch +cd build-cachelib/ +cmake .. \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DBUILD_WITH_DTO=ON +make -j$(nproc) +make install +``` +### Example Usage + +You can set the following environment variables to use the DTO library with CacheLib: + +```plaintext +LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib + +DTO_USESTDC_CALLS=0 // Set to 1 to use standard C library calls (like memcpy) instead of DSA enabled calls + +DTO_COLLECT_STATS=0 // Set to 1 to collect stats on DSA performance + +DTO_WAIT_METHOD=umwait // sleep, umwait, yield, or busypoll after submitting to DSA + +DTO_MIN_BYTES=32768 // Minimum bytes for DSA to process, smaller requests will be done on CPU + +DTO_CPU_SIZE_FRACTION=0.0 //offload the entire call to DSA + +DTO_AUTO_ADJUST_KNOBS=0 //Tries to find the optimal split between CPU fraction and DSA fraction, attempting to minimize the difference in time between the two + +DTO_DSA_CC=0 // Set to 0 to bypass CPU cache for the destination buffer, this will ensure that the data is written from DSA directly to memory avoidng polluting the cache with a large buffer that has low probablity of 
being reaccessed. +``` + +```sh +source ~/DTO/dto_cachelib_env.sh +./opt/cachelib/bin/cachebench --json_test_config ~/DTO/sample_cachebench_config.json +``` + +### Additional References +- [Data Streaming Accelerator User Guide](https://www.intel.com/content/www/us/en/content-details/759709/intel-data-streaming-accelerator-user-guide.html) +- [Data Streaming Accelerator Architecture Specification](https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf) + + diff --git a/cachelib.patch b/cachelib.patch new file mode 100644 index 0000000..28a3b38 --- /dev/null +++ b/cachelib.patch @@ -0,0 +1,55 @@ +From eaa6ded0b338f6b1accf82b35393fef8c8b03ba2 Mon Sep 17 00:00:00 2001 +From: Daniel Byrne +Date: Wed, 12 Mar 2025 04:49:29 -0700 +Subject: [PATCH] dto enable + +--- + cachelib/CMakeLists.txt | 2 ++ + cachelib/cachebench/CMakeLists.txt | 4 ++++ + cachelib/cachebench/cache/Cache.h | 2 +- + 3 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt +index 506ba66b..07aef841 100644 +--- a/cachelib/CMakeLists.txt ++++ b/cachelib/CMakeLists.txt +@@ -44,6 +44,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) + + option(BUILD_TESTS "If enabled, compile the tests." ON) + ++option(BUILD_WITH_DTO "If enabled, build with the DTO library for DSA support." 
ON) ++ + + set(BIN_INSTALL_DIR bin CACHE STRING + "The subdirectory where binaries should be installed") +diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt +index f08e08f5..349b71f7 100644 +--- a/cachelib/cachebench/CMakeLists.txt ++++ b/cachelib/cachebench/CMakeLists.txt +@@ -72,6 +72,10 @@ add_executable (binary_trace_gen binary_trace_gen.cpp) + target_link_libraries(cachebench cachelib_cachebench) + target_link_libraries(binary_trace_gen cachelib_binary_trace_gen) + ++if (BUILD_WITH_DTO) ++ target_link_libraries(cachebench accel-config dto) ++endif () ++ + install( + TARGETS + cachebench +diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h +index 17a4dc15..effacac8 100644 +--- a/cachelib/cachebench/cache/Cache.h ++++ b/cachelib/cachebench/cache/Cache.h +@@ -1330,7 +1330,7 @@ void Cache::setStringItem(WriteHandle& handle, + return; + + auto ptr = reinterpret_cast(getMemory(handle)); +- std::strncpy(ptr, str.c_str(), dataSize); ++ std::memmove(ptr, str.c_str(), str.size() + 1 < dataSize ? str.size() + 1 : dataSize); /* never read past the source string */ + + // Make sure the copied string ends with null char + if (str.size() + 1 > dataSize) { +-- +2.47.1 + diff --git a/dto_cachelib_env.sh b/dto_cachelib_env.sh new file mode 100755 index 0000000..160c552 --- /dev/null +++ b/dto_cachelib_env.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib +export DTO_USESTDC_CALLS=0 +export DTO_COLLECT_STATS=0 +export DTO_WAIT_METHOD=sleep +export DTO_MIN_BYTES=32768 +export DTO_CPU_SIZE_FRACTION=0.0 +export DTO_AUTO_ADJUST_KNOBS=0 +export DTO_DSA_CC=0 From e3fc81bcfef59589ae152c87d0b879903afcf724 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 28 Mar 2025 08:24:32 -0700 Subject: [PATCH 04/23] update with cachebench config --- CACHELIB_USAGE_GUIDE.md | 2 +- cachebench_config.json | 55 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 cachebench_config.json diff --git a/CACHELIB_USAGE_GUIDE.md
b/CACHELIB_USAGE_GUIDE.md index ddbd9b6..0c6273f 100644 --- a/CACHELIB_USAGE_GUIDE.md +++ b/CACHELIB_USAGE_GUIDE.md @@ -136,7 +136,7 @@ DTO_DSA_CC=0 // Set to 0 to bypass CPU cache for the destination buffer, this wi ```sh source ~/DTO/dto_cachelib_env.sh -./opt/cachelib/bin/cachebench --json_test_config ~/DTO/sample_cachebench_config.json +./opt/cachelib/bin/cachebench --json_test_config ~/DTO/cachebench_config.json ``` ### Additional References diff --git a/cachebench_config.json b/cachebench_config.json new file mode 100644 index 0000000..4324f2e --- /dev/null +++ b/cachebench_config.json @@ -0,0 +1,55 @@ +{ + "cache_config": { + "cacheSizeMB": 40000, + "poolRebalanceIntervalSec": 0, + "moveOnSlabRelease": "false", + "numPools": 2, + "poolSizes": [ + 0.5, + 0.5 + ] + }, + "test_config": { + "numOps": 2500000.0, + "numThreads": 32, + "numKeys": 1000000, + "enableLookaside": "true", + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "opRatePerSec": 300000, + "opRateBurstSize": 600000, + "valSizeRange": [ + 64, + 256, + 32240, + 80155.0, + 128248, + 205196 + ], + "valSizeRangeProbability": [ + 0.05, + 0.05, + 0.1, + 0.4, + 0.4 + ], + "getRatio": 0.9, + "setRatio": 0.05, + "delRatio": 0.05, + "keyPoolDistribution": [ + 0.5, + 0.5 + ], + "opPoolDistribution": [ + 0.5, + 0.5 + ] + } +} From b07de5d949282b6d8d21eff4d94eec1ec63a56f8 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 31 Mar 2025 07:55:39 -0700 Subject: [PATCH 05/23] update makefile opt3 and native arch --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 0d6c0dc..664e7c1 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,10 @@ all: libdto dto-test-wodto DML_LIB_CXX=-D_GNU_SOURCE libdto: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma + gcc -shared -O3 -march=native -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) 
-DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma libdto_nostats: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma + gcc -shared -O3 -march=native -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma install: cp libdto.so.1.0 /usr/lib64/ From db6b52c06f0cc6a2fe30ddb2a2ecf2f12cb1aadd Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 31 Mar 2025 07:58:08 -0700 Subject: [PATCH 06/23] add tpause and restrict usleep --- dto.c | 125 +++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 84 insertions(+), 41 deletions(-) diff --git a/dto.c b/dto.c index 605b59b..3c2470f 100644 --- a/dto.c +++ b/dto.c @@ -32,8 +32,10 @@ #define GENCAP_CC_MEMORY 0x4 #define UMWAIT_DELAY_DEFAULT 100000 -/* C0.1 state */ -#define UMWAIT_STATE 1 + +#define C01_STATE 1 +#define C02_STATE 0 +#define TPAUSE_DELAY 1000 #define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size) #define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec)) @@ -45,7 +47,8 @@ */ #define MAX_WQS 32 #define MAX_NUMA_NODES 32 -#define DTO_DEFAULT_MIN_SIZE 16384 +#define DTO_DEFAULT_MIN_SIZE 32768 +#define DTO_DEFAULT_USLEEP 20 #define DTO_INITIALIZED 0 #define DTO_INITIALIZING 1 @@ -81,6 +84,7 @@ enum wait_options { WAIT_BUSYPOLL = 0, WAIT_UMWAIT, WAIT_YIELD, + WAIT_TPAUSE, WAIT_SLEEP }; @@ -116,6 +120,9 @@ static uint8_t dto_dsa_memset = 1; static uint8_t dto_dsa_memcmp = 1; static uint8_t dto_dsa_cc = 1; +static bool dto_use_c02 = true; //C02 state is default - + //C02 avg exit latency is ~500 ns + //and C01 is about ~240 ns on SPR static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT; @@ -173,6 +180,7 @@ static const char * const wait_names[] = { [WAIT_BUSYPOLL] = "busypoll", [WAIT_UMWAIT] = "umwait", [WAIT_YIELD] = "yield", + [WAIT_TPAUSE] = "tpause", [WAIT_SLEEP] = "sleep" }; @@ -344,72 +352,92 @@ static 
__always_inline int umwait(unsigned long timeout, unsigned int state) return r; } +static inline void tpause(unsigned long timeout, unsigned int state) +{ + uint32_t timeout_low = (uint32_t)(timeout); + uint32_t timeout_high = (uint32_t)(timeout >> 32); + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" + : + : "c"(state), "a"(timeout_low), "d"(timeout_high)); +} + + static __always_inline void dsa_wait_yield(const volatile uint8_t *comp) { while (*comp == 0) { - sched_yield(); + sched_yield(); } } static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp) { while (*comp == 0) { - _mm_pause(); + _mm_pause(); + } +} + +static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) +{ + while (*comp == 0) { + tpause(__rdtsc() + TPAUSE_DELAY, + dto_use_c02 ? C02_STATE : C01_STATE); //default is 1000 cycles } } static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) { umonitor(comp); - - // Hardware never writes 0 to this field. Software should initialize this field to 0 - // so it can detect when the completion record has been written - if (*comp == 0) { - uint64_t delay = __rdtsc() + dto_umwait_delay; - - umwait(delay, UMWAIT_STATE); - } + + uint64_t delay = __rdtsc() + dto_umwait_delay; + umwait(delay, dto_use_c02 ? C02_STATE : C01_STATE); } static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) { - while (*comp == 0) - __dsa_wait_umwait(comp); + while (*comp == 0) { + __dsa_wait_umwait(comp); + } } static __always_inline void __dsa_wait(const volatile uint8_t *comp) { - if (wait_method == WAIT_YIELD) + switch(wait_method) { + case WAIT_YIELD: sched_yield(); - else if (wait_method == WAIT_UMWAIT) - __dsa_wait_umwait(comp); - else - _mm_pause(); + break; + case WAIT_UMWAIT: + __dsa_wait_umwait(comp); + break; + case WAIT_TPAUSE: + tpause(__rdtsc() + TPAUSE_DELAY, + dto_use_c02 ? 
C02_STATE : C01_STATE); //default is 1000 cycles + break; + default: + _mm_pause(); + } } static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) { switch (wait_method) { - case WAIT_YIELD: - dsa_wait_yield(comp); - break; - case WAIT_UMWAIT: - dsa_wait_umwait(comp); - break; - case WAIT_BUSYPOLL: - dsa_wait_busy_poll(comp); - break; - case WAIT_SLEEP: - // This method is not typically used in high-performance scenarios, - // but included for completeness. It can be implemented with a sleep. - // For example, using usleep or sleep for a short duration. - // This is a placeholder for actual sleep implementation. - do { - usleep(20); // Sleep for 20 microseconds - } while (*comp == 0); - default: - dsa_wait_busy_poll(comp); + case WAIT_YIELD: + dsa_wait_yield(comp); + break; + case WAIT_UMWAIT: + dsa_wait_umwait(comp); + break; + case WAIT_TPAUSE: /* WAIT_BUSYPOLL is served by the default label below */ + dsa_wait_tpause(comp); + break; + case WAIT_SLEEP: + // This method is not typically used in high-performance scenarios but + // gives a good demonstration of how much CPU time can be reduced + do { + usleep(DTO_DEFAULT_USLEEP); // Sleep for 20 microseconds + } while (*comp == 0); /* fallthrough: *comp is already non-zero, so the busy-poll below returns at once */ + default: + dsa_wait_busy_poll(comp); } } @@ -437,8 +465,9 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) uint64_t local_num_waits = 0; if ((++num_descs & DESCS_PER_RUN) != DESCS_PER_RUN) { - while (*comp == 0) + while (*comp == 0) { __dsa_wait(comp); + } return; } @@ -1140,7 +1169,21 @@ static int dsa_init(void) max_avg_waits = MAX_AVG_POLL_WAITS; } else LOG_ERROR("umwait not supported. Falling back to default wait method\n"); - } + } else if (!strncmp(env_str, wait_names[WAIT_TPAUSE], strlen(wait_names[WAIT_TPAUSE]))) { + if (umwait_support) { + wait_method = WAIT_TPAUSE; + } else { + LOG_ERROR("tpause not supported.
Falling back to busypoll\n"); + wait_method = WAIT_BUSYPOLL; + } + } else if (!strncmp(env_str, wait_names[WAIT_SLEEP], strlen(wait_names[WAIT_SLEEP]))) { + if (cpu_size_fraction == 0.0 && auto_adjust_knobs == 0) { + wait_method = WAIT_SLEEP; + } else { + LOG_ERROR("sleep not supported for partial offloading (fraction > 0 and/or autotuning is selected\n"); + wait_method = WAIT_BUSYPOLL; + } + } } env_str = getenv("DTO_WQ_LIST"); From 47a2586d5b0464ee96512ba61c925c84a664be60 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 31 Mar 2025 10:40:56 -0700 Subject: [PATCH 07/23] big commit containing auto_tune_v2 and bounds on sleep --- dto.c | 151 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 123 insertions(+), 28 deletions(-) diff --git a/dto.c b/dto.c index 3c2470f..f39ce87 100644 --- a/dto.c +++ b/dto.c @@ -28,6 +28,9 @@ #define likely(x) __builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + // DSA capabilities #define GENCAP_CC_MEMORY 0x4 @@ -35,7 +38,6 @@ #define C01_STATE 1 #define C02_STATE 0 -#define TPAUSE_DELAY 1000 #define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size) #define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec)) @@ -45,6 +47,7 @@ * Allocating memory dynamically may create cyclic dependency and may cause * a hang (e.g., memset --> malloc --> alloc library calls memset --> memset) */ +#define WQ_MMAPPED 1 #define MAX_WQS 32 #define MAX_NUMA_NODES 32 #define DTO_DEFAULT_MIN_SIZE 32768 @@ -124,6 +127,12 @@ static bool dto_use_c02 = true; //C02 state is default - //C02 avg exit latency is ~500 ns //and C01 is about ~240 ns on SPR +#define TPAUSE_C02_DELAY 10000 //in this case we are offloading so delay can + //be 4~5 us + +#define TPAUSE_C01_DELAY 1000 //keep smaller because we want to wake up + //with lower latency + static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT; static uint8_t fork_handler_registered; @@ -263,6 +272,20 @@ static unsigned int log_level = LOG_LEVEL_FATAL; #define DMS_STEP_INCREMENT 1024 #define DMS_STEP_DECREMENT 1024 +/* Auto tune v2 */ +#define KP 0.5 +#define KI 0.1 +#define SAMPLE_INTERVAL 10000 +#define AUTO_ADJUST_KNOBS 1 +#define AUTO_ADJUST_KNOBS_V2 2 + +#define AUTO_TUNE_V2_TARGET 1 + +static __thread uint64_t tl_num_descs = 0; +static __thread uint64_t tl_next_sample = 0; +static __thread int64_t tl_integral = 0; /* signed: the accumulated error can be negative */ +static __thread uint64_t tl_cpu_size_fraction = 0; + /* Auto tuning variables */ static atomic_ullong num_descs; static atomic_ullong adjust_num_descs; @@ -379,8 +402,8 @@ static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp) static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) { while (*comp == 0) { - tpause(__rdtsc() + TPAUSE_DELAY, - dto_use_c02 ? C02_STATE : C01_STATE); //default is 1000 cycles + tpause(__rdtsc() + (dto_use_c02 ?
TPAUSE_C02_DELAY : TPAUSE_C01_DELAY), + dto_use_c02 ? C02_STATE : C01_STATE); /* deadline = now + delay, in TSC cycles */ } } @@ -410,8 +433,8 @@ static __always_inline void __dsa_wait(const volatile uint8_t *comp) __dsa_wait_umwait(comp); break; case WAIT_TPAUSE: - tpause(__rdtsc() + TPAUSE_DELAY, - dto_use_c02 ? C02_STATE : C01_STATE); //default is 1000 cycles + tpause(__rdtsc() + (dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY), + dto_use_c02 ? C02_STATE : C01_STATE); break; default: _mm_pause(); @@ -502,13 +525,42 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) } } +static __always_inline void dsa_wait_and_adjust_v2(const volatile uint8_t *comp) +{ + + if (++tl_num_descs == tl_next_sample) { + uint64_t local_num_waits = 0; + while (*comp == 0) { + __dsa_wait(comp); + local_num_waits++; + } + int64_t error = (int64_t)local_num_waits - AUTO_TUNE_V2_TARGET; + tl_integral += error; + int64_t new_frac = tl_cpu_size_fraction - KP * error + KI * tl_integral; /* signed so a negative result is clamped, not wrapped */ + + // Clamp within valid range + tl_cpu_size_fraction = MAX(1, MIN(MAX_CPU_SIZE_FRACTION, new_frac)); + tl_next_sample += rand() % (SAMPLE_INTERVAL*2) + 1; + } else { + while (*comp == 0) { + __dsa_wait(comp); + } + } +} + static __always_inline int dsa_wait(struct dto_wq *wq, struct dsa_hw_desc *hw, volatile uint8_t *comp) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); - else - dsa_wait_no_adjust(comp); + switch (auto_adjust_knobs) { + case AUTO_ADJUST_KNOBS: + dsa_wait_and_adjust(comp); + break; + case AUTO_ADJUST_KNOBS_V2: + dsa_wait_and_adjust_v2(comp); + break; + default: + dsa_wait_no_adjust(comp); + } if (likely(*comp == DSA_COMP_SUCCESS)) { thr_bytes_completed += hw->xfer_size; @@ -550,21 +602,32 @@ static __always_inline int dsa_execute(struct dto_wq *wq, //LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode); __builtin_ia32_sfence(); - if (wq->wq_mmapped) + switch (wq->wq_mmapped) { + case WQ_MMAPPED: ret = enqcmd(hw, wq->wq_portal); - - else { + break; + default: ret = write(wq->wq_fd, hw,
sizeof(*hw)); - if (ret != sizeof(*hw)) + if (ret != sizeof(*hw)) { return FAIL_OTHERS; - else + } + else { ret = 0; - } + } + break; + } + if (!ret) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); - else - dsa_wait_no_adjust(comp); + switch (auto_adjust_knobs) { + case AUTO_ADJUST_KNOBS: + dsa_wait_and_adjust(comp); + break; + case AUTO_ADJUST_KNOBS_V2: + dsa_wait_and_adjust_v2(comp); + break; + default: + dsa_wait_no_adjust(comp); + } if (*comp == DSA_COMP_SUCCESS) { thr_bytes_completed += hw->xfer_size; @@ -1177,7 +1240,24 @@ static int dsa_init(void) wait_method = WAIT_BUSYPOLL; } } else if (!strncmp(env_str, wait_names[WAIT_SLEEP], strlen(wait_names[WAIT_SLEEP]))) { - if (cpu_size_fraction == 0.0 && auto_adjust_knobs == 0) { + double local_cpu_size_fraction = 0; + env_str = getenv("DTO_CPU_SIZE_FRACTION"); + + if (env_str != NULL) { + errno = 0; + local_cpu_size_fraction = strtod(env_str, NULL); + } + + uint64_t local_auto_adjust_knobs = 0; + env_str = getenv("DTO_AUTO_ADJUST_KNOBS"); + + if (env_str != NULL) { + errno = 0; + local_auto_adjust_knobs = strtoul(env_str, NULL, 10); + if (errno) + local_auto_adjust_knobs = 1; + } + if (local_cpu_size_fraction < 0.001 && local_auto_adjust_knobs == 0) { wait_method = WAIT_SLEEP; } else { LOG_ERROR("sleep not supported for partial offloading (fraction > 0 and/or autotuning is selected\n"); @@ -1365,6 +1445,7 @@ static int init_dto(void) } /* Use only 2 digits after decimal point */ cpu_size_fraction = cpu_size_fraction_float * 100; + tl_cpu_size_fraction = cpu_size_fraction; } env_str = getenv("DTO_AUTO_ADJUST_KNOBS"); @@ -1374,10 +1455,19 @@ static int init_dto(void) auto_adjust_knobs = strtoul(env_str, NULL, 10); if (errno) auto_adjust_knobs = 1; - - auto_adjust_knobs = !!auto_adjust_knobs; + if (auto_adjust_knobs == AUTO_ADJUST_KNOBS_V2) { + tl_next_sample = rand() % (SAMPLE_INTERVAL*2) + 1; + } } + // Only use c02 if we are offloading a significant chunk to + // DSA so we amortize the exit latency of C02 
state + if (cpu_size_fraction <= 20 && auto_adjust_knobs == 0) { + dto_use_c02 = true; + } else { + dto_use_c02 = false; + } + if (numa_available() != -1) { env_str = getenv("DTO_IS_NUMA_AWARE"); if (env_str != NULL) { @@ -1405,9 +1495,9 @@ static int init_dto(void) // display configuration LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, " - "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d\n", + "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d, dto_use_c02: %d\n", log_level, collect_stats, use_std_lib_calls, dsa_min_size, - cpu_size_fraction_float, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dto_dsa_cc); + cpu_size_fraction_float, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dto_dsa_cc, dto_use_c02); for (int i = 0; i < num_wqs; i++) LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i, wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap); @@ -1481,7 +1571,9 @@ static void dto_memset(void *s, int c, size_t n, int *result) thr_desc.pattern = memset_pattern; /* cpu_size_fraction guaranteed to be >= 0 and < 100 */ - cpu_size = n * cpu_size_fraction / 100; + uint64_t cpu_frac = auto_adjust_knobs == AUTO_ADJUST_KNOBS_V2 ? 
+ tl_cpu_size_fraction : cpu_size_fraction; + cpu_size = n * cpu_frac / 100; dsa_size = n - cpu_size; thr_bytes_completed = 0; @@ -1499,7 +1591,7 @@ static void dto_memset(void *s, int c, size_t n, int *result) } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_frac; // the cpu_size_fraction might be changed by the auto tune algorithm threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); do { @@ -1557,11 +1649,14 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy thr_desc.flags |= IDXD_OP_FLAG_CC; thr_desc.completion_addr = (uint64_t)&thr_comp; + uint64_t cpu_frac = auto_adjust_knobs == AUTO_ADJUST_KNOBS_V2 ? + tl_cpu_size_fraction : cpu_size_fraction; + /* cpu_size_fraction guaranteed to be >= 0 and < 1 */ if (!is_memcpy && is_overlapping_buffers(dest, src, n)) cpu_size = 0; else - cpu_size = n * cpu_size_fraction / 100; + cpu_size = n * cpu_frac / 100; dsa_size = n - cpu_size; @@ -1585,7 +1680,7 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_frac; // the cpu_size_fraction might be changed by the auto tune algorithm threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); do { size_t len; From 117bb8036e1fad3810309e8f308ff31734aa9f25 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 31 Mar 2025 16:24:32 -0700 Subject: [PATCH 08/23] update readme for cachelib --- CACHELIB_USAGE_GUIDE.md | 75 ++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/CACHELIB_USAGE_GUIDE.md b/CACHELIB_USAGE_GUIDE.md index 0c6273f..64cee21 100644 --- a/CACHELIB_USAGE_GUIDE.md +++ 
b/CACHELIB_USAGE_GUIDE.md @@ -47,12 +47,28 @@ dmesg | grep "idxd " lspci | grep 0b25 ``` ### Initialize DSA device(s) + + +#### Accel Config Script ```sh ./accelConfig.sh ``` To enable the first DSA device (assuming it is device 0) with a shared work queue and 4 engines, you would run: ```sh -./accelConfig.sh 0 +sudo ./accelConfig.sh 0 yes 0 4 +``` +#### DSA Device Permissions +To allow userspace applications to submit work to the DSA, you need to give the appropriate permissions, +for example to give all users the ability to submit work to device 0 work queue: +```sh +sudo chmod 0777 /dev/dsa/wq0.0 +``` +Of course, regular Unix permissions can be used to control access to the device. + +#### Accel config +You can also set up the device using the accel-config tool with a configuration file: +```sh +sudo accel-config load-config -c <config_file> ``` ## Using the DTO library @@ -71,14 +87,16 @@ ldd --version sudo dnf install -y \ kernel-headers \ accel-config-devel \ - libuuid-devel + libuuid-devel \ + numactl-devel ``` - **Ubuntu/Debian**: ```sh sudo apt-get install -y \ linux-libc-dev \ libaccel-config-dev \ - uuid-dev + uuid-dev \ + numactl-dev ``` ### Build DTO Library @@ -87,7 +105,7 @@ make libdto sudo make install ``` -### Build the Test Application +### Build the Test Application With Stats Output ```sh make dto-test source dto_env.sh @@ -112,25 +130,17 @@ cmake ..
\ make -j$(nproc) make install ``` -### Example Usage +### Example Usage For Complete Offload You can set the following environment variables to use the DTO library with CacheLib: ```plaintext -LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib - DTO_USESTDC_CALLS=0 // Set to 1 to use standard C library calls (like memcpy) instead of DSA enabled calls - DTO_COLLECT_STATS=0 // Set to 1 to collect stats on DSA performance - -DTO_WAIT_METHOD=umwait // sleep, umwait, yield, or busypoll after submitting to DSA - -DTO_MIN_BYTES=32768 // Minimum bytes for DSA to process, smaller requests will be done on CPU - +DTO_WAIT_METHOD=sleep // sleep, umwait, tpause, yield, or busypoll after submitting to DSA +DTO_MIN_BYTES=32768 // Minimum bytes for DSA to process, smaller requests will be done on CPU to avoid latency increase DTO_CPU_SIZE_FRACTION=0.0 //offload the entire call to DSA - DTO_AUTO_ADJUST_KNOBS=0 //Tries to find the optimal split between CPU fraction and DSA fraction, attempting to minimize the difference in time between the two - DTO_DSA_CC=0 // Set to 0 to bypass CPU cache for the destination buffer, this will ensure that the data is written from DSA directly to memory avoidng polluting the cache with a large buffer that has low probablity of being reaccessed. 
``` @@ -139,6 +149,41 @@ source ~/DTO/dto_cachelib_env.sh ./opt/cachelib/bin/cachebench --json_test_config ~/DTO/cachebench_config.json ``` +### Example Usage For Partial Offload + +Set the following environment variables to use the DTO library with CacheLib: +```plaintext +DTO_USESTDC_CALLS=0 // Set to 1 to use standard C library calls (like memcpy) instead of DSA enabled calls +DTO_COLLECT_STATS=0 // Set to 1 to collect stats on DSA performance +DTO_WAIT_METHOD=tpause // or umwait +DTO_MIN_BYTES=32768 // Minimum bytes for DSA to process, smaller requests will be done on CPU +DTO_CPU_SIZE_FRACTION=0.0 //offload the entire call to DSA +DTO_AUTO_ADJUST_KNOBS=2 //Tries to find the optimal split between CPU fraction and DSA fraction, attempting to minimize the difference in time between the two +DTO_DSA_CC=0 // Set to 0 to bypass CPU cache for the destination buffer, this will ensure that the data is written from DSA directly to memory avoiding polluting the cache with a large buffer that has low probability of being reaccessed. +``` +```sh +./opt/cachelib/bin/cachebench --json_test_config ~/DTO/cachebench_config.json +``` + +## A note on the wait methods + +The wait method is used to wait for the DSA to complete the operation. The following methods are supported: +- **sleep**: The thread will sleep for a fixed amount of time. +- **umwait**: The thread will use the umwait instruction to wait for the DSA to complete the operation. Umwait +monitors the address of the DSA completion record and waits for the DSA to complete the operation. +- **tpause**: The thread will use the tpause instruction to wait for the DSA to complete the operation. +- **yield**: The thread will yield the CPU to the OS. +- **busypoll**: The thread will busy poll the DSA to check if the operation is complete. It uses the +regular _mm_pause intrinsic (the PAUSE instruction) as a spin-wait hint to the core while waiting. 
+ +Both umwait and tpause are recommended as they are more efficient than sleep and yield since they do not +cause a context switch. In addition, umwait and tpause are more efficient than busypoll as they allow +the CPU to enter the C0.1 or C0.2 low-power states; C0.2 has ~0.5us exit latency while C0.1 +has ~0.25us exit latency. Therefore, C0.1 is used when autotuning is enabled to do a portion of the work +on the CPU and the remainder on the DSA, as our goal is to minimize the time gap between the two. If we +are doing a complete offload we enter C0.2. + + ### Additional References - [Data Streaming Accelerator User Guide](https://www.intel.com/content/www/us/en/content-details/759709/intel-data-streaming-accelerator-user-guide.html) - [Data Streaming Accelerator Architecture Specification](https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf) From 5c4bc803794019abbf27f4a90eaf023b6ba336b2 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 1 Apr 2025 07:29:36 -0700 Subject: [PATCH 09/23] test enviroment variables --- dto_env.sh | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100755 dto_env.sh diff --git a/dto_env.sh b/dto_env.sh new file mode 100755 index 0000000..8f34761 --- /dev/null +++ b/dto_env.sh @@ -0,0 +1,7 @@ +#!/bin/bash +export DTO_USESTDC_CALLS=0 +export DTO_COLLECT_STATS=1 +export DTO_WAIT_METHOD=umwait +export DTO_MIN_BYTES=32768 +export DTO_CPU_SIZE_FRACTION=0.0 +export DTO_AUTO_ADJUST_KNOBS=0 From c3ab1c4b7d45f839984f188debf56fab12f64baa Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 1 Apr 2025 07:37:35 -0700 Subject: [PATCH 10/23] add crb --- CACHELIB_USAGE_GUIDE.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/CACHELIB_USAGE_GUIDE.md b/CACHELIB_USAGE_GUIDE.md index 64cee21..9a76b7b 100644 --- a/CACHELIB_USAGE_GUIDE.md +++ b/CACHELIB_USAGE_GUIDE.md @@ -80,15 +80,17 @@ sudo accel-config -c ldd --version ``` - Systems with glibc versions **less than 2.36** 
may experience reduced DTO performance due to a known bug. - +- Centos 10 requires the CRB repo to be enabled for 'accel-config-devel' #### Package Requirements - **Fedora/CentOS/RHEL**: ```sh +sudo dnf config-manager --set-enabled crb sudo dnf install -y \ kernel-headers \ accel-config-devel \ - libuuid-devel \ - numactl-devel + accel-config \ + numactl-devel \ + libuuid-devel ``` - **Ubuntu/Debian**: ```sh From e7b91a3949806b9bb772a72661b4923715f2ed01 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 1 Apr 2025 10:44:37 -0700 Subject: [PATCH 11/23] update cmake for cachelib --- CACHELIB_USAGE_GUIDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CACHELIB_USAGE_GUIDE.md b/CACHELIB_USAGE_GUIDE.md index 9a76b7b..8a07820 100644 --- a/CACHELIB_USAGE_GUIDE.md +++ b/CACHELIB_USAGE_GUIDE.md @@ -126,7 +126,7 @@ Cachebench needs a change for setStringItem to use 'std::memmove' instead of 'st cd ~/CacheLib/ git apply ~/DTO/cachelib.patch cd build-cachelib/ -cmake .. \ +cmake ../cachelib \ -DCMAKE_BUILD_TYPE=RelWithDebInfo \ -DBUILD_WITH_DTO=ON make -j$(nproc) From 4e65625d07f3f1136b83ff4a2ba8048495305498 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 2 Apr 2025 10:34:18 -0700 Subject: [PATCH 12/23] update size check for CPU vs. DSA --- dto.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dto.c b/dto.c index f39ce87..84a379c 100644 --- a/dto.c +++ b/dto.c @@ -39,7 +39,7 @@ #define C01_STATE 1 #define C02_STATE 0 -#define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size) +#define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || (n*(100-cpu_size_fraction)/100) < dsa_min_size) #define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec)) /* Maximum WQs that DTO will use. 
It is rather an arbitrary limit From 113ed1176173b1b18adfe210082e359c7859d7ba Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Thu, 12 Oct 2023 17:20:34 -0400 Subject: [PATCH 13/23] initial cmake --- CMakeLists.txt | 28 ++++++++++++++++++++++++++++ Makefile | 31 ------------------------------- 2 files changed, 28 insertions(+), 31 deletions(-) create mode 100644 CMakeLists.txt delete mode 100644 Makefile diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..30c28f4 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.0) +project(libdto) + +# Build the shared library +add_library(dto SHARED dto.c) + +# set gnu source everywhere +add_compile_definitions(_GNU_SOURCE) + +# Add the -DDTO_STATS_SUPPORT preprocessor definition +target_compile_definitions(dto PRIVATE DTO_STATS_SUPPORT) + +# Link libraries +target_link_libraries(dto accel-config dl) + +# Install targets +install( + TARGETS dto + LIBRARY DESTINATION lib +) + +# Build dto-test and dto-test-wodto +add_executable(dto-test dto-test.c) +target_link_libraries(dto-test PRIVATE dto pthread) + +add_executable(dto-test-wodto dto-test.c) +target_link_libraries(dto-test-wodto PRIVATE pthread) + diff --git a/Makefile b/Makefile deleted file mode 100644 index 664e7c1..0000000 --- a/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2023 Intel Corporation -# -# SPDX-License-Identifier: MIT - -all: libdto dto-test-wodto - -DML_LIB_CXX=-D_GNU_SOURCE - -libdto: dto.c - gcc -shared -O3 -march=native -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma - -libdto_nostats: dto.c - gcc -shared -O3 -march=native -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma - -install: - cp libdto.so.1.0 /usr/lib64/ - ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so.1 - ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so - -install-local: - ln -sf ./libdto.so.1.0 
./libdto.so.1 - ln -sf ./libdto.so.1.0 ./libdto.so - -dto-test: dto-test.c - gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test -ldto -lpthread - -dto-test-wodto: dto-test.c - gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test-wodto -lpthread - -clean: - rm -rf *.o *.so dto-test From 3afabc86671c547c47fe1b89a535ded2d882f50d Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 9 Sep 2025 06:24:44 -0700 Subject: [PATCH 14/23] add libnuma --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 30c28f4..4fd729a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ add_compile_definitions(_GNU_SOURCE) target_compile_definitions(dto PRIVATE DTO_STATS_SUPPORT) # Link libraries -target_link_libraries(dto accel-config dl) +target_link_libraries(dto accel-config dl numa) # Install targets install( From 9b75566cbb02061fc837dddd33ee68238350d51a Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Tue, 9 Sep 2025 17:41:29 -0700 Subject: [PATCH 15/23] added cmake pc --- CMakeLists.txt | 42 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fd729a..26f604e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,12 @@ cmake_minimum_required(VERSION 3.0) -project(libdto) +project(DTO VERSION 1.0 LANGUAGES C) + +include(GNUInstallDirs) +set(CMAKE_INSTALL_LIBDIR lib) # Build the shared library add_library(dto SHARED dto.c) +add_library(DTO::dto ALIAS dto) # set gnu source everywhere add_compile_definitions(_GNU_SOURCE) @@ -13,16 +17,38 @@ target_compile_definitions(dto PRIVATE DTO_STATS_SUPPORT) # Link libraries target_link_libraries(dto accel-config dl numa) -# Install targets -install( - TARGETS dto - LIBRARY DESTINATION lib -) - # Build dto-test and dto-test-wodto add_executable(dto-test dto-test.c) -target_link_libraries(dto-test PRIVATE dto pthread) +target_link_libraries(dto-test PRIVATE DTO::dto pthread) 
add_executable(dto-test-wodto dto-test.c) target_link_libraries(dto-test-wodto PRIVATE pthread) +# Install and export the library +install(TARGETS dto + EXPORT DTOTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +install(EXPORT DTOTargets + FILE DTOTargets.cmake + NAMESPACE DTO:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/DTO) + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfigVersion.cmake" + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion) + +configure_package_config_file( + "${CMAKE_CURRENT_SOURCE_DIR}/DTOConfig.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/DTO) + +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/DTO) + From ffec551e21694f8a48430885e92d308d67eba07d Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 9 Apr 2025 17:50:35 -0700 Subject: [PATCH 16/23] the header --- dto.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 dto.h diff --git a/dto.h b/dto.h new file mode 100644 index 0000000..30045dd --- /dev/null +++ b/dto.h @@ -0,0 +1,18 @@ + +#ifndef DTO_H +#define DTO_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(*callback_t)(void*); + +void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args); + +#ifdef __cplusplus +} +#endif + +#endif + From 5a7643298fcd33b1e619eeb6b2fb1af644999166 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 7 Jul 2025 06:50:55 -0700 Subject: [PATCH 17/23] CRC offload --- accelConfig.sh | 2 +- dto.c | 168 +++++++++++++++++++++++++++++++++++++++++++++++++ dto.h | 2 + 3 files changed, 171 insertions(+), 1 deletion(-) diff --git a/accelConfig.sh b/accelConfig.sh index 962f83f..d0225da 100755 --- 
a/accelConfig.sh +++ b/accelConfig.sh @@ -41,7 +41,7 @@ accel-config config-wq ${SWQ} --priority=1 accel-config config-wq ${SWQ} --wq-size=128 accel-config config-wq ${SWQ} --max-batch-size=1024 accel-config config-wq ${SWQ} --max-transfer-size=2147483648 -accel-config config-wq ${SWQ} --block-on-fault=0 +accel-config config-wq ${SWQ} --block-on-fault=1 accel-config config-wq ${SWQ} --type=user accel-config config-wq ${SWQ} --name="dsa-test" accel-config config-wq ${SWQ} --mode=shared diff --git a/dto.c b/dto.c index 84a379c..052f56b 100644 --- a/dto.c +++ b/dto.c @@ -24,6 +24,8 @@ #include #include #include +#include "dto.h" +#include // For _mm_crc32_u32 etc. #define likely(x) __builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) @@ -297,6 +299,29 @@ static uint8_t auto_adjust_knobs = 1; extern char *__progname; +uint32_t crc32c_hw(const uint8_t* data, size_t len) { + uint32_t crc = 0; // Initial value, can be 0 or 0xFFFFFFFF depending on convention + + while (len >= sizeof(uint64_t)) { + crc = _mm_crc32_u64(crc, *(uint64_t*)data); + data += sizeof(uint64_t); + len -= sizeof(uint64_t); + } + + while (len >= sizeof(uint32_t)) { + crc = _mm_crc32_u32(crc, *(uint32_t*)data); + data += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + + while (len--) { + crc = _mm_crc32_u8(crc, *data++); + } + + return crc; +} + + static void dto_log(int req_log_level, const char *fmt, ...) 
{ char buf[512]; @@ -1638,6 +1663,149 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n) return true; } +__attribute__((visibility("default"))) uint64_t dto_crc(const void *src, size_t n, callback_t cb, void* args) { + //submit dsa work if successful, call the callback + if (use_std_lib_calls || n < dsa_min_size) { + if (cb) { + cb(args); + } + return crc32c_hw(src, n); + } + int result = 0; + struct dto_wq *wq = get_wq(src); + size_t dsa_size = n; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + thr_desc.opcode = DSA_OPCODE_CRCGEN; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF; + if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_bytes_completed = 0; + thr_desc.src_addr = (uint64_t) src; + thr_desc.dst_addr = 0; // dst_addr is not used for CRC generation + thr_desc.xfer_size = (uint32_t) dsa_size; + thr_desc.crc_seed = 0; // default seed valuie + thr_desc.rsvd = 0; + thr_comp.status = 0; + result = dsa_submit(wq, &thr_desc); + if (result == SUCCESS) { + if (cb) { + cb(args); + } + result = dsa_wait(wq, &thr_desc, &thr_comp.status); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result); +#endif + if (thr_bytes_completed < n) { + return 0; + } + return thr_comp.crc_val; +} + +__attribute__((visibility("default"))) uint64_t dto_memcpy_crc_async(void *dest, const void *src, size_t n, callback_t cb, void* args) { + //submit dsa work if successful, call the callback + if (use_std_lib_calls || n < dsa_min_size) { + if (cb) { + cb(args); + } + orig_memcpy(dest, src, n); + return crc32c_hw(src, n); + } + int result = 0; + struct dto_wq *wq = get_wq(dest); + size_t dsa_size = n; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + 
DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + thr_desc.opcode = DSA_OPCODE_COPY_CRC; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF; + if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_bytes_completed = 0; + thr_desc.src_addr = (uint64_t) src; + thr_desc.dst_addr = (uint64_t) dest; + thr_desc.xfer_size = (uint32_t) dsa_size; + thr_desc.crc_seed = 0; // default seed valuie + thr_desc.rsvd = 0; + thr_comp.status = 0; + result = dsa_submit(wq, &thr_desc); + if (result == SUCCESS) { + if (cb) { + cb(args); + } + result = dsa_wait(wq, &thr_desc, &thr_comp.status); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result); +#endif + if (thr_bytes_completed < n) { + return 0; + } + return thr_comp.crc_val; +} + +__attribute__((visibility("default"))) void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args) { + //submit dsa work if successful, call the callback + int result = 0; + struct dto_wq *wq = get_wq(dest); + size_t dsa_size = n; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + thr_desc.opcode = DSA_OPCODE_MEMMOVE; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_bytes_completed = 0; + + thr_desc.src_addr = (uint64_t) src; + thr_desc.dst_addr = (uint64_t) dest; + thr_desc.xfer_size = (uint32_t) dsa_size; + thr_comp.status = 0; + result = dsa_submit(wq, &thr_desc); + if (result == SUCCESS) { + cb(args); + result = dsa_wait(wq, &thr_desc, &thr_comp.status); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result); +#endif + if 
(thr_bytes_completed != n) { + /* fallback to std call if job is only partially completed */ + n -= thr_bytes_completed; + if (thr_comp.result == 0) { + dest = (void *)((uint64_t)dest + thr_bytes_completed); + src = (const void *)((uint64_t)src + thr_bytes_completed); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + orig_memcpy(dest, src, n); + +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n); +#endif + } +} + static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result) { struct dto_wq *wq = get_wq(dest); diff --git a/dto.h b/dto.h index 30045dd..4ca65a9 100644 --- a/dto.h +++ b/dto.h @@ -9,6 +9,8 @@ extern "C" { typedef void(*callback_t)(void*); void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args); +uint64_t dto_memcpy_crc_async(void *dest, const void *src, size_t n, callback_t cb, void* args); +uint64_t dto_crc(const void *src, size_t n, callback_t cb, void* args); #ifdef __cplusplus } From 037bdf43b519974b7c78d81909b646478f813983 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 9 Apr 2025 17:47:58 -0700 Subject: [PATCH 18/23] tests --- dto.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 68 insertions(+), 11 deletions(-) diff --git a/dto.c b/dto.c index 052f56b..e44e13c 100644 --- a/dto.c +++ b/dto.c @@ -57,6 +57,10 @@ #define DTO_INITIALIZED 0 #define DTO_INITIALIZING 1 +#define NSEC_PER_SEC (1000000000) +#define MSEC_PER_SEC (1000) +#define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC) + // thread specific variables static __thread struct dsa_hw_desc thr_desc; static __thread struct dsa_completion_record thr_comp __attribute__((aligned(32))); @@ -118,6 +122,7 @@ static enum numa_aware is_numa_aware; static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE; static int wait_method = WAIT_YIELD; static size_t cpu_size_fraction; // range of values is 0 to 99 +static uint64_t 
wait_time = 100000; //10K nanoseconds static uint8_t dto_dsa_memcpy = 1; static uint8_t dto_dsa_memmove = 1; @@ -142,6 +147,7 @@ static uint8_t fork_handler_registered; enum memop { MEMSET = 0x0, MEMCOPY, + MEMCOPY_ASYNC, MEMMOVE, MEMCMP, MAX_MEMOP, @@ -150,6 +156,7 @@ enum memop { static const char * const memop_names[] = { [MEMSET] = "set", [MEMCOPY] = "cpy", + [MEMCOPY_ASYNC] = "cpy_async", [MEMMOVE] = "mov", [MEMCMP] = "cmp" }; @@ -424,28 +431,57 @@ static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp) } } -static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) +//static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) +static void dsa_wait_tpause(const volatile uint8_t *comp) { - while (*comp == 0) { - tpause(__rdtsc() + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY, - dto_use_c02 ? C02_STATE : C01_STATE); - } + do { + uint64_t delay = 0; + _mm_mfence(); + _mm_lfence(); + delay = _rdtsc(); + _mm_lfence(); + //delay = delay + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY; + delay = delay + wait_time; + //unsigned int state = dto_use_c02 ? C02_STATE : C01_STATE; + //while (_tpause( 1 , delay) == 1); + _tpause( 0 , delay); + } while (*comp == 0); + //tpause(__rdtsc() + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY, + //dto_use_c02 ? C02_STATE : C01_STATE); + + //} } static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) { - umonitor(comp); + _umonitor((void*)comp); - uint64_t delay = __rdtsc() + dto_umwait_delay; - umwait(delay, dto_use_c02 ? C02_STATE : C01_STATE); + uint64_t delay = 0; + _mm_mfence(); + _mm_lfence(); + delay = _rdtsc(); + uint64_t start = delay; + _mm_lfence(); + //delay = delay + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY; + delay = delay + wait_time*10; + //umwait(delay, dto_use_c02 ? 
C02_STATE : C01_STATE); + + _umwait(1, delay); + _mm_mfence(); + _mm_lfence(); + uint64_t end = _rdtsc(); + uint64_t actual = end - start; + _mm_lfence(); + if (rand() % (SAMPLE_INTERVAL) == 0) { + LOG_TRACE("actual delay %d\n", actual); + } } static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) { - - while (*comp == 0) { + do { __dsa_wait_umwait(comp); - } + } while (*comp == 0); } static __always_inline void __dsa_wait(const volatile uint8_t *comp) @@ -475,6 +511,9 @@ static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) case WAIT_UMWAIT: dsa_wait_umwait(comp); break; + case WAIT_TPAUSE: + dsa_wait_tpause(comp); + break; case WAIT_BUSYPOLL: dsa_wait_busy_poll(comp); break; @@ -693,6 +732,7 @@ static void print_stats(void) clock_gettime(CLOCK_BOOTTIME, &dto_end_time); LOG_TRACE("DTO Run Time: %ld ms\n", TS_NS(dto_start_time, dto_end_time)/1000000); + LOG_TRACE("DTO CPU Fraction: %.2f \n", cpu_size_fraction/100.0); // display stats for (int t = 0; t < 2; ++t) { @@ -1517,6 +1557,23 @@ static int init_dto(void) LOG_ERROR("Didn't find any usable DSAs. 
Falling back to using CPUs.\n"); use_std_lib_calls = 1; } + unsigned int num, den, freq; + unsigned int unused; + unsigned long long tmp; + __get_cpuid( 0x15, &den, &num, &freq, &unused ); + freq /= 1000; + LOG_TRACE( "Core Freq = %u kHz\n", freq ); + LOG_TRACE( "TSC Mult = %u\n", num ); + LOG_TRACE( "TSC Den = %u\n", den ); + freq *= num; + freq /= den; + LOG_TRACE( "CPU freq = %u kHz\n", freq ); + LOG_TRACE( "Requested wait: %llu nsec\n", wait_time ); + tmp = wait_time; + tmp *= freq; + wait_time = tmp / NSEC_PER_MSEC; + LOG_TRACE( "Requested wait duration: %llu cycles\n", wait_time ); + // display configuration LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, " From b278db7c82bd63420c43a18c059885c4878372da Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 30 Apr 2025 16:58:36 -0700 Subject: [PATCH 19/23] clean up wait methods --- dto.c | 73 ++++++++++++++++++----------------------------------------- 1 file changed, 22 insertions(+), 51 deletions(-) diff --git a/dto.c b/dto.c index e44e13c..03f110d 100644 --- a/dto.c +++ b/dto.c @@ -122,7 +122,6 @@ static enum numa_aware is_numa_aware; static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE; static int wait_method = WAIT_YIELD; static size_t cpu_size_fraction; // range of values is 0 to 99 -static uint64_t wait_time = 100000; //10K nanoseconds static uint8_t dto_dsa_memcpy = 1; static uint8_t dto_dsa_memmove = 1; @@ -134,11 +133,14 @@ static bool dto_use_c02 = true; //C02 state is default - //C02 avg exit latency is ~500 ns //and C01 is about ~240 ns on SPR -#define TPAUSE_C02_DELAY 10000 //in this case we are offloading so delay can - //be 4~5 us +#define TPAUSE_C02_DELAY_NS 6000 //in this case we are offloading so delay can + //be ~6 us as this is around the time a > 64KB + //copy takes to complete -#define TPAUSE_C01_DELAY 1000 //keep smaller because we want to wake up - //with lower latency +#define TPAUSE_C01_DELAY_NS 1000 //keep smaller because we want to wake up + 
//with lower latency + +static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS; static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT; @@ -431,57 +433,27 @@ static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp) } } -//static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) -static void dsa_wait_tpause(const volatile uint8_t *comp) +static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) { - do { - uint64_t delay = 0; - _mm_mfence(); - _mm_lfence(); - delay = _rdtsc(); - _mm_lfence(); - //delay = delay + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY; - delay = delay + wait_time; - //unsigned int state = dto_use_c02 ? C02_STATE : C01_STATE; - //while (_tpause( 1 , delay) == 1); - _tpause( 0 , delay); - } while (*comp == 0); - //tpause(__rdtsc() + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY, - //dto_use_c02 ? C02_STATE : C01_STATE); - - //} + while (*comp == 0) { + uint64_t delay = _rdtsc() + tpause_wait_time; + _tpause(C02_STATE, delay); + } } static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) { _umonitor((void*)comp); - - uint64_t delay = 0; - _mm_mfence(); - _mm_lfence(); - delay = _rdtsc(); - uint64_t start = delay; - _mm_lfence(); - //delay = delay + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY; - delay = delay + wait_time*10; - //umwait(delay, dto_use_c02 ? 
C02_STATE : C01_STATE); - - _umwait(1, delay); - _mm_mfence(); - _mm_lfence(); - uint64_t end = _rdtsc(); - uint64_t actual = end - start; - _mm_lfence(); - if (rand() % (SAMPLE_INTERVAL) == 0) { - LOG_TRACE("actual delay %d\n", actual); - } + + uint64_t delay = _rdtsc() + UMWAIT_DELAY_DEFAULT; + _umwait(C02_STATE, delay); } static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) { - do { + while (*comp == 0) { __dsa_wait_umwait(comp); - } while (*comp == 0); + } } static __always_inline void __dsa_wait(const volatile uint8_t *comp) @@ -494,8 +466,7 @@ static __always_inline void __dsa_wait(const volatile uint8_t *comp) __dsa_wait_umwait(comp); break; case WAIT_TPAUSE: - tpause(__rdtsc() + dto_use_c02 ? TPAUSE_C02_DELAY : TPAUSE_C01_DELAY, - dto_use_c02 ? C02_STATE : C01_STATE); + _tpause( C01_STATE, _rdtsc() + TPAUSE_C01_DELAY); break; default: _mm_pause(); @@ -1568,11 +1539,11 @@ static int init_dto(void) freq *= num; freq /= den; LOG_TRACE( "CPU freq = %u kHz\n", freq ); - LOG_TRACE( "Requested wait: %llu nsec\n", wait_time ); - tmp = wait_time; + LOG_TRACE( "Requested wait: %llu nsec\n", tpause_wait_time ); + tmp = tpause_wait_time; tmp *= freq; - wait_time = tmp / NSEC_PER_MSEC; - LOG_TRACE( "Requested wait duration: %llu cycles\n", wait_time ); + tpause_wait_time = tmp / NSEC_PER_MSEC; + LOG_TRACE( "Requested wait duration: %llu cycles\n", tpause_wait_time ); // display configuration From b755d9f34b7d06d7aded6a0b7f3cbdf999f8e7e2 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Thu, 1 May 2025 10:42:05 -0700 Subject: [PATCH 20/23] update compile --- dto.c | 2 +- dto_cachelib_env.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dto.c b/dto.c index 03f110d..8497874 100644 --- a/dto.c +++ b/dto.c @@ -466,7 +466,7 @@ static __always_inline void __dsa_wait(const volatile uint8_t *comp) __dsa_wait_umwait(comp); break; case WAIT_TPAUSE: - _tpause( C01_STATE, _rdtsc() + TPAUSE_C01_DELAY); + _tpause( C01_STATE, _rdtsc() + 
TPAUSE_C01_DELAY_NS); break; default: _mm_pause(); diff --git a/dto_cachelib_env.sh b/dto_cachelib_env.sh index 160c552..0fcf16f 100755 --- a/dto_cachelib_env.sh +++ b/dto_cachelib_env.sh @@ -1,8 +1,8 @@ #!/bin/bash export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib export DTO_USESTDC_CALLS=0 -export DTO_COLLECT_STATS=0 -export DTO_WAIT_METHOD=sleep +export DTO_COLLECT_STATS=1 +export DTO_WAIT_METHOD=umwait export DTO_MIN_BYTES=32768 export DTO_CPU_SIZE_FRACTION=0.0 export DTO_AUTO_ADJUST_KNOBS=0 From 56cc932d2d32528cf119cadce4fc52e0cec97b47 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 06:52:09 -0700 Subject: [PATCH 21/23] update cmake compile options --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 26f604e..e812936 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,6 +4,7 @@ project(DTO VERSION 1.0 LANGUAGES C) include(GNUInstallDirs) set(CMAKE_INSTALL_LIBDIR lib) + # Build the shared library add_library(dto SHARED dto.c) add_library(DTO::dto ALIAS dto) @@ -17,6 +18,12 @@ target_compile_definitions(dto PRIVATE DTO_STATS_SUPPORT) # Link libraries target_link_libraries(dto accel-config dl numa) +include(CheckCCompilerFlag) +check_c_compiler_flag("-mwaitpkg" HAS_WAITPKG) +if (HAS_WAITPKG) + target_compile_options(dto PRIVATE -mwaitpkg -march=native) +endif() + # Build dto-test and dto-test-wodto add_executable(dto-test dto-test.c) target_link_libraries(dto-test PRIVATE DTO::dto pthread) From 59bd4556266e33e74fbfee578e08d79c6b7d0812 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 06:57:00 -0700 Subject: [PATCH 22/23] DTO config cmake in --- DTOConfig.cmake.in | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 DTOConfig.cmake.in diff --git a/DTOConfig.cmake.in b/DTOConfig.cmake.in new file mode 100644 index 0000000..56e57bd --- /dev/null +++ b/DTOConfig.cmake.in @@ -0,0 +1,5 @@ +@PACKAGE_INIT@ + 
+include("${CMAKE_CURRENT_LIST_DIR}/DTOTargets.cmake") + +check_required_components(DTO) From 6eac456eb32042a2c2c2a63319bacb3f828d193f Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Wed, 10 Sep 2025 07:06:44 -0700 Subject: [PATCH 23/23] add dto.h to cmake install --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index e812936..3dd5952 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,10 @@ set(CMAKE_INSTALL_LIBDIR lib) # Build the shared library add_library(dto SHARED dto.c) add_library(DTO::dto ALIAS dto) +target_include_directories(dto + PUBLIC + $ + $) # set gnu source everywhere add_compile_definitions(_GNU_SOURCE) @@ -38,6 +42,8 @@ install(TARGETS dto LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install(FILES dto.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + install(EXPORT DTOTargets FILE DTOTargets.cmake NAMESPACE DTO::