diff --git a/CACHELIB_USAGE_GUIDE.md b/CACHELIB_USAGE_GUIDE.md new file mode 100644 index 0000000..8a07820 --- /dev/null +++ b/CACHELIB_USAGE_GUIDE.md @@ -0,0 +1,193 @@ +# Intel DSA Usage with CacheLib + +## Platform Configuration + +### BIOS Configuration +- Enable **Intel Virtualization Technology for Directed I/O (VT-d)**. + +### Linux Kernel Configuration +Ensure the following kernel options are enabled: +```plaintext +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +CONFIG_INTEL_IOMMU_DEFAULT_ON=y +CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON=y +``` +If either `CONFIG_INTEL_IOMMU_DEFAULT_ON` or `CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON` is not enabled, add the following to the kernel boot parameters: +```plaintext +intel_iommu=on,sm_on +``` + +### Intel DSA Driver (IDXD) +Enable the following kernel options while building or installing the Linux kernel (version 5.18 or later is needed for shared work queues used by DTO): +```plaintext +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IDXD_SVM=y +CONFIG_INTEL_IDXD_PERFMON=y +``` + +### Work Queues (WQs) +- WQs are on-device queues that contain descriptors submitted to the device. +- They can be configured in two modes: + - **Dedicated (DWQ)** + - **Shared (SWQ)** +- **SWQ is preferred** as it allows multiple clients to submit descriptors simultaneously, improving device utilization compared to DWQs. +- **DTO uses SWQ**. 
+ +### Verify IDXD Driver Initialization +Use the following command to check the kernel message buffer: +```sh +dmesg | grep "idxd " +``` + +## Initialize the DSA Device + +### List All DSA Devices +```sh +lspci | grep 0b25 +``` +### Initialize DSA device(s) + + +#### Accel Config Script +```sh +./accelConfig.sh +``` +To enable the first DSA device (assuming it is device 0) with a shared work queue and 4 engines, you would run: +```sh +sudo ./accelConfig.sh 0 yes 0 4 +``` +#### DSA Device Permissions +To allow userspace applications to submit work to the DSA, you need to give the appropriate permissions, +for example to give all users the ability to submit work to device 0 work queue: +```sh +sudo chmod 0777 /dev/dsa/wq0.0 +``` +Of course, regular Unix permissions can be used to control access to the device. + +#### Accel config +You can also set up the device using the accel-config tool with a configuration file: +```sh +sudo accel-config -c +``` + +## Using the DTO library + +### Pre-requisite Packages +- **glibc version 2.36 or later** is recommended for best DTO performance. +- Check your glibc version: +```sh +ldd --version +``` +- Systems with glibc versions **less than 2.36** may experience reduced DTO performance due to a known bug. 
+- Centos 10 requires the CRB repo to be enabled for 'accel-config-devel' +#### Package Requirements +- **Fedora/CentOS/RHEL**: +```sh +sudo dnf config-manager --set-enabled crb +sudo dnf install -y \ + kernel-headers \ + accel-config-devel \ + accel-config \ + numactl-devel \ + libuuid-devel +``` +- **Ubuntu/Debian**: +```sh +sudo apt-get install -y \ + linux-libc-dev \ + libaccel-config-dev \ + uuid-dev \ + numactl-dev +``` + +### Build DTO Library +```sh +make libdto +sudo make install +``` + +### Build the Test Application With Stats Output +```sh +make dto-test +source dto_env.sh +./dto-test +``` + +## Using with CacheLib + +To connect to CacheLib (OSS version) using the DTO library, you will need to ensure that your application is linked against the `libdto` library. + +### Linking to CacheLib with build patch + +Cachebench needs a change for setStringItem to use 'std::memmove' instead of 'strncpy' as DTO does not intercept strncpy call. The patch also adds the build option for DTO in cmake. 
+ +```sh +cd ~/CacheLib/ +git apply ~/DTO/cachelib.patch +cd build-cachelib/ +cmake ../cachelib \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DBUILD_WITH_DTO=ON +make -j$(nproc) +make install +``` +### Example Usage For Complete Offload + +You can set the following environment variables to use the DTO library with CacheLib: + +```plaintext +DTO_USESTDC_CALLS=0 // Set to 1 to use standard C library calls (like memcpy) instead of DSA enabled calls +DTO_COLLECT_STATS=0 // Set to 1 to collect stats on DSA performance +DTO_WAIT_METHOD=sleep // sleep, umwait, tpause, yield, or busypoll after submitting to DSA +DTO_MIN_BYTES=32768 // Minimum bytes for DSA to process, smaller requests will be done on CPU to avoid latency increase +DTO_CPU_SIZE_FRACTION=0.0 //offload the entire call to DSA +DTO_AUTO_ADJUST_KNOBS=0 //Tries to find the optimal split between CPU fraction and DSA fraction, attempting to minimize the difference in time between the two +DTO_DSA_CC=0 // Set to 0 to bypass CPU cache for the destination buffer, this will ensure that the data is written from DSA directly to memory avoidng polluting the cache with a large buffer that has low probablity of being reaccessed. 
+``` + +```sh +source ~/DTO/dto_cachelib_env.sh +./opt/cachelib/bin/cachebench --json_test_config ~/DTO/cachebench_config.json +``` + +### Example Usage For Partial Offload + +Set the following environment variables to use the DTO library with CacheLib: +```plaintext +DTO_USESTDC_CALLS=0 // Set to 1 to use standard C library calls (like memcpy) instead of DSA enabled calls +DTO_COLLECT_STATS=0 // Set to 1 to collect stats on DSA performance +DTO_WAIT_METHOD=tpause // or umwait +DTO_MIN_BYTES=32768 // Minimum bytes for DSA to process, smaller requests will be done on CPU +DTO_CPU_SIZE_FRACTION=0.0 //offload the entire call to DSA +DTO_AUTO_ADJUST_KNOBS=2 //Tries to find the optimal split between CPU fraction and DSA fraction, attempting to minimize the difference in time between the two +DTO_DSA_CC=0 // Set to 0 to bypass CPU cache for the destination buffer, this will ensure that the data is written from DSA directly to memory avoiding polluting the cache with a large buffer that has low probability of being reaccessed. +``` +```sh +./opt/cachelib/bin/cachebench --json_test_config ~/DTO/cachebench_config.json +``` + +## A note on the wait methods + +The wait method is used to wait for the DSA to complete the operation. The following methods are supported: +- **sleep**: The thread will sleep for a fixed amount of time. +- **umwait**: The thread will use the umwait instruction to wait for the DSA to complete the operation. Umwait +monitors the address of the DSA descriptor and waits for the DSA to complete the operation. +- **tpause**: The thread will use the tpause instruction to wait for the DSA to complete the operation. +- **yield**: The thread will yield the CPU to the OS. +- **busypoll**: The thread will busy poll the DSA to check if the operation is complete. It uses the +regular mm_pause instruction to send noops to the core while waiting. 
+ +Both umwait and tpause are recommended as they are more efficient than sleep and yield since they do not +cause a context switch. In addition, umwait and tpause are more efficient than busypoll as the allow +the CPU to enter C0.1 or C0.2 states which are low power states, C0.2 has ~0.5us exit latency while C0.1 +has ~0.25us exit latency. Therefore, C0.1 is used when autotuning is enabled to do a portion of the work +on the CPU and the remainder on the core as our goal is to minimize the time gap between the two. If we +are doing a complete offload we enter C0.2. + + +### Additional References +- [Data Streaming Accelerator User Guide](https://www.intel.com/content/www/us/en/content-details/759709/intel-data-streaming-accelerator-user-guide.html) +- [Data Streaming Accelerator Architecture Specification](https://cdrdv2-public.intel.com/671116/341204-intel-data-streaming-accelerator-spec.pdf) + + diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..3dd5952 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.0) +project(DTO VERSION 1.0 LANGUAGES C) + +include(GNUInstallDirs) +set(CMAKE_INSTALL_LIBDIR lib) + + +# Build the shared library +add_library(dto SHARED dto.c) +add_library(DTO::dto ALIAS dto) +target_include_directories(dto + PUBLIC + $ + $) + +# set gnu source everywhere +add_compile_definitions(_GNU_SOURCE) + +# Add the -DDTO_STATS_SUPPORT preprocessor definition +target_compile_definitions(dto PRIVATE DTO_STATS_SUPPORT) + +# Link libraries +target_link_libraries(dto accel-config dl numa) + +include(CheckCCompilerFlag) +check_c_compiler_flag("-mwaitpkg" HAS_WAITPKG) +if (HAS_WAITPKG) + target_compile_options(dto PRIVATE -mwaitpkg -march=native) +endif() + +# Build dto-test and dto-test-wodto +add_executable(dto-test dto-test.c) +target_link_libraries(dto-test PRIVATE DTO::dto pthread) + +add_executable(dto-test-wodto dto-test.c) +target_link_libraries(dto-test-wodto PRIVATE pthread) + +# Install 
and export the library +install(TARGETS dto + EXPORT DTOTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) + +install(FILES dto.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + +install(EXPORT DTOTargets + FILE DTOTargets.cmake + NAMESPACE DTO:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/DTO) + +include(CMakePackageConfigHelpers) +write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfigVersion.cmake" + VERSION ${PROJECT_VERSION} + COMPATIBILITY AnyNewerVersion) + +configure_package_config_file( + "${CMAKE_CURRENT_SOURCE_DIR}/DTOConfig.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/DTO) + +install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfig.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/DTOConfigVersion.cmake" + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/DTO) + diff --git a/DTOConfig.cmake.in b/DTOConfig.cmake.in new file mode 100644 index 0000000..56e57bd --- /dev/null +++ b/DTOConfig.cmake.in @@ -0,0 +1,5 @@ +@PACKAGE_INIT@ + +include("${CMAKE_CURRENT_LIST_DIR}/DTOTargets.cmake") + +check_required_components(DTO) diff --git a/Makefile b/Makefile deleted file mode 100644 index 0d6c0dc..0000000 --- a/Makefile +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (C) 2023 Intel Corporation -# -# SPDX-License-Identifier: MIT - -all: libdto dto-test-wodto - -DML_LIB_CXX=-D_GNU_SOURCE - -libdto: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma - -libdto_nostats: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma - -install: - cp libdto.so.1.0 /usr/lib64/ - ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so.1 - ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so - -install-local: - ln -sf ./libdto.so.1.0 ./libdto.so.1 - ln -sf ./libdto.so.1.0 ./libdto.so - -dto-test: 
dto-test.c - gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test -ldto -lpthread - -dto-test-wodto: dto-test.c - gcc -g dto-test.c $(DML_LIB_CXX) -o dto-test-wodto -lpthread - -clean: - rm -rf *.o *.so dto-test diff --git a/accelConfig.sh b/accelConfig.sh new file mode 100755 index 0000000..d0225da --- /dev/null +++ b/accelConfig.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +echo "OPTIONAL Arg-1: DSA device id. Default: 0" +echo "OPTIONAL Arg-2: Enable/Disable DSA device. Default: yes" +echo "OPTIONAL Arg-3: SHARED WQ id. Default: 1" +echo "OPTIONAL Arg-4: ENGINE count. Default: 4" + +if [ "$#" -ge 5 ]; then + echo "ERROR: Incorrect argument count. Expected arg count <= 4" + exit 1 +fi + +DEVID=${1:-0} +ENABLE=${2:-yes} +SWQID=${3:-1} +NENGS=${4:-4} + +DEV=dsa${DEVID} +SWQ=${DEV}/wq${DEVID}.${SWQID} + +echo "=> ${SWQ}:" +accel-config disable-wq ${SWQ} + +echo "=> ${DEV}:" +accel-config disable-device ${DEV} + +if [ "${ENABLE}" != "yes" ]; then + echo "Exit after disabling ${DEV}." + exit 1 +fi + +for ((i=0; i < ${NENGS}; i++)) +do + echo "=> ${DEV}/engine${DEVID}.${i}" + echo "configured" + accel-config config-engine ${DEV}/engine${DEVID}.${i} --group-id=0 +done + +accel-config config-wq ${SWQ} --group-id=0 +accel-config config-wq ${SWQ} --priority=1 +accel-config config-wq ${SWQ} --wq-size=128 +accel-config config-wq ${SWQ} --max-batch-size=1024 +accel-config config-wq ${SWQ} --max-transfer-size=2147483648 +accel-config config-wq ${SWQ} --block-on-fault=1 +accel-config config-wq ${SWQ} --type=user +accel-config config-wq ${SWQ} --name="dsa-test" +accel-config config-wq ${SWQ} --mode=shared +accel-config config-wq ${SWQ} --threshold=127 +accel-config config-wq ${SWQ} --driver-name="user" +#accel-config config-group dsa0/group0.1 --traffic-class-a=0 +#accel-config config-group dsa0/group0.1 --traffic-class-b=0 + +echo "=> ${DEV}:" +accel-config enable-device ${DEV} + +echo "=> ${SWQ}:" +accel-config enable-wq ${SWQ} + diff --git a/cachebench_config.json b/cachebench_config.json new 
file mode 100644 index 0000000..4324f2e --- /dev/null +++ b/cachebench_config.json @@ -0,0 +1,55 @@ +{ + "cache_config": { + "cacheSizeMB": 40000, + "poolRebalanceIntervalSec": 0, + "moveOnSlabRelease": "false", + "numPools": 2, + "poolSizes": [ + 0.5, + 0.5 + ] + }, + "test_config": { + "numOps": 2500000.0, + "numThreads": 32, + "numKeys": 1000000, + "enableLookaside": "true", + "keySizeRange": [ + 1, + 8, + 64 + ], + "keySizeRangeProbability": [ + 0.3, + 0.7 + ], + "opRatePerSec": 300000, + "opRateBurstSize": 600000, + "valSizeRange": [ + 64, + 256, + 32240, + 80155.0, + 128248, + 205196 + ], + "valSizeRangeProbability": [ + 0.05, + 0.05, + 0.1, + 0.4, + 0.4 + ], + "getRatio": 0.9, + "setRatio": 0.05, + "delRatio": 0.05, + "keyPoolDistribution": [ + 0.5, + 0.5 + ], + "opPoolDistribution": [ + 0.5, + 0.5 + ] + } +} diff --git a/cachelib.patch b/cachelib.patch new file mode 100644 index 0000000..28a3b38 --- /dev/null +++ b/cachelib.patch @@ -0,0 +1,55 @@ +From eaa6ded0b338f6b1accf82b35393fef8c8b03ba2 Mon Sep 17 00:00:00 2001 +From: Daniel Byrne +Date: Wed, 12 Mar 2025 04:49:29 -0700 +Subject: [PATCH] dto enable + +--- + cachelib/CMakeLists.txt | 2 ++ + cachelib/cachebench/CMakeLists.txt | 4 ++++ + cachelib/cachebench/cache/Cache.h | 2 +- + 3 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/cachelib/CMakeLists.txt b/cachelib/CMakeLists.txt +index 506ba66b..07aef841 100644 +--- a/cachelib/CMakeLists.txt ++++ b/cachelib/CMakeLists.txt +@@ -44,6 +44,8 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) + + option(BUILD_TESTS "If enabled, compile the tests." ON) + ++option(BUILD_WITH_DTO "If enabled, build with the DTO library for DSA support." 
ON) ++ + + set(BIN_INSTALL_DIR bin CACHE STRING + "The subdirectory where binaries should be installed") +diff --git a/cachelib/cachebench/CMakeLists.txt b/cachelib/cachebench/CMakeLists.txt +index f08e08f5..349b71f7 100644 +--- a/cachelib/cachebench/CMakeLists.txt ++++ b/cachelib/cachebench/CMakeLists.txt +@@ -72,6 +72,10 @@ add_executable (binary_trace_gen binary_trace_gen.cpp) + target_link_libraries(cachebench cachelib_cachebench) + target_link_libraries(binary_trace_gen cachelib_binary_trace_gen) + ++if (BUILD_WITH_DTO) ++ target_link_libraries(cachebench accel-config dto) ++endif () ++ + install( + TARGETS + cachebench +diff --git a/cachelib/cachebench/cache/Cache.h b/cachelib/cachebench/cache/Cache.h +index 17a4dc15..effacac8 100644 +--- a/cachelib/cachebench/cache/Cache.h ++++ b/cachelib/cachebench/cache/Cache.h +@@ -1330,7 +1330,7 @@ void Cache::setStringItem(WriteHandle& handle, + return; + + auto ptr = reinterpret_cast(getMemory(handle)); +- std::strncpy(ptr, str.c_str(), dataSize); ++ std::memmove(ptr, str.c_str(), dataSize); + + // Make sure the copied string ends with null char + if (str.size() + 1 > dataSize) { +-- +2.47.1 + diff --git a/dto.c b/dto.c index b7a3a1c..8497874 100644 --- a/dto.c +++ b/dto.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -23,18 +24,24 @@ #include #include #include +#include "dto.h" +#include // For _mm_crc32_u32 etc. #define likely(x) __builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + // DSA capabilities #define GENCAP_CC_MEMORY 0x4 #define UMWAIT_DELAY_DEFAULT 100000 -/* C0.1 state */ -#define UMWAIT_STATE 1 -#define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size) +#define C01_STATE 1 +#define C02_STATE 0 + +#define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || (n*(100-cpu_size_fraction)/100) < dsa_min_size) #define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec)) /* Maximum WQs that DTO will use. It is rather an arbitrary limit @@ -42,12 +49,18 @@ * Allocating memory dynamically may create cyclic dependency and may cause * a hang (e.g., memset --> malloc --> alloc library calls memset --> memset) */ +#define WQ_MMAPPED 1 #define MAX_WQS 32 #define MAX_NUMA_NODES 32 -#define DTO_DEFAULT_MIN_SIZE 16384 +#define DTO_DEFAULT_MIN_SIZE 32768 +#define DTO_DEFAULT_USLEEP 20 #define DTO_INITIALIZED 0 #define DTO_INITIALIZING 1 +#define NSEC_PER_SEC (1000000000) +#define MSEC_PER_SEC (1000) +#define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC) + // thread specific variables static __thread struct dsa_hw_desc thr_desc; static __thread struct dsa_completion_record thr_comp __attribute__((aligned(32))); @@ -79,7 +92,9 @@ struct dto_device { enum wait_options { WAIT_BUSYPOLL = 0, WAIT_UMWAIT, - WAIT_YIELD + WAIT_YIELD, + WAIT_TPAUSE, + WAIT_SLEEP }; enum numa_aware { @@ -114,6 +129,18 @@ static uint8_t dto_dsa_memset = 1; static uint8_t dto_dsa_memcmp = 1; static uint8_t dto_dsa_cc = 1; +static bool dto_use_c02 = true; //C02 state is default - + //C02 avg exit latency is ~500 ns + //and C01 is about ~240 ns on SPR + +#define TPAUSE_C02_DELAY_NS 6000 //in this case we are offloading so delay can + //be ~6 us as this is around the time a > 64KB + //copy takes to complete + +#define TPAUSE_C01_DELAY_NS 1000 //keep smaller because we want to wake up + //with lower latency + +static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS; static unsigned long 
dto_umwait_delay = UMWAIT_DELAY_DEFAULT; @@ -122,6 +149,7 @@ static uint8_t fork_handler_registered; enum memop { MEMSET = 0x0, MEMCOPY, + MEMCOPY_ASYNC, MEMMOVE, MEMCMP, MAX_MEMOP, @@ -130,6 +158,7 @@ enum memop { static const char * const memop_names[] = { [MEMSET] = "set", [MEMCOPY] = "cpy", + [MEMCOPY_ASYNC] = "cpy_async", [MEMMOVE] = "mov", [MEMCMP] = "cmp" }; @@ -171,6 +200,8 @@ static const char * const wait_names[] = { [WAIT_BUSYPOLL] = "busypoll", [WAIT_UMWAIT] = "umwait", [WAIT_YIELD] = "yield", + [WAIT_TPAUSE] = "tpause", + [WAIT_SLEEP] = "sleep" }; static int collect_stats; @@ -252,6 +283,20 @@ static unsigned int log_level = LOG_LEVEL_FATAL; #define DMS_STEP_INCREMENT 1024 #define DMS_STEP_DECREMENT 1024 +/* Auto tune v2 */ +#define KP 0.5 +#define KI 0.1 +#define SAMPLE_INTERVAL 10000 +#define AUTO_ADJUST_KNOBS 1 +#define AUTO_ADJUST_KNOBS_V2 2 + +#define AUTO_TUNE_V2_TARGET 1 + +static __thread uint64_t tl_num_descs = 0; +static __thread uint64_t tl_next_sample = 0; +static __thread uint64_t tl_integral = 0; +static __thread uint64_t tl_cpu_size_fraction = 0; + /* Auto tuning variables */ static atomic_ullong num_descs; static atomic_ullong adjust_num_descs; @@ -263,6 +308,29 @@ static uint8_t auto_adjust_knobs = 1; extern char *__progname; +uint32_t crc32c_hw(const uint8_t* data, size_t len) { + uint32_t crc = 0; // Initial value, can be 0 or 0xFFFFFFFF depending on convention + + while (len >= sizeof(uint64_t)) { + crc = _mm_crc32_u64(crc, *(uint64_t*)data); + data += sizeof(uint64_t); + len -= sizeof(uint64_t); + } + + while (len >= sizeof(uint32_t)) { + crc = _mm_crc32_u32(crc, *(uint32_t*)data); + data += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + + while (len--) { + crc = _mm_crc32_u8(crc, *data++); + } + + return crc; +} + + static void dto_log(int req_log_level, const char *fmt, ...) 
{ char buf[512]; @@ -341,60 +409,97 @@ static __always_inline int umwait(unsigned long timeout, unsigned int state) return r; } +static inline void tpause(unsigned long timeout, unsigned int state) +{ + uint32_t timeout_low = (uint32_t)(timeout); + uint32_t timeout_high = (uint32_t)(timeout >> 32); + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" + : + : "c"(state), "a"(timeout_low), "d"(timeout_high)); +} + + static __always_inline void dsa_wait_yield(const volatile uint8_t *comp) { while (*comp == 0) { - sched_yield(); + sched_yield(); } } static __always_inline void dsa_wait_busy_poll(const volatile uint8_t *comp) { while (*comp == 0) { - _mm_pause(); + _mm_pause(); } } -static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) +static __always_inline void dsa_wait_tpause(const volatile uint8_t *comp) { - umonitor(comp); + while (*comp == 0) { + uint64_t delay = _rdtsc() + tpause_wait_time; + _tpause(C02_STATE, delay); + } +} - // Hardware never writes 0 to this field. 
Software should initialize this field to 0 - // so it can detect when the completion record has been written - if (*comp == 0) { - uint64_t delay = __rdtsc() + dto_umwait_delay; +static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp) +{ + _umonitor((void*)comp); - umwait(delay, UMWAIT_STATE); - } + uint64_t delay = _rdtsc() + UMWAIT_DELAY_DEFAULT; + _umwait(C02_STATE, delay); } static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) { - - while (*comp == 0) - __dsa_wait_umwait(comp); + while (*comp == 0) { + __dsa_wait_umwait(comp); + } } static __always_inline void __dsa_wait(const volatile uint8_t *comp) { - if (wait_method == WAIT_YIELD) + switch(wait_method) { + case WAIT_YIELD: sched_yield(); - else if (wait_method == WAIT_UMWAIT) - __dsa_wait_umwait(comp); - else - _mm_pause(); + break; + case WAIT_UMWAIT: + __dsa_wait_umwait(comp); + break; + case WAIT_TPAUSE: + _tpause( C01_STATE, _rdtsc() + TPAUSE_C01_DELAY_NS); + break; + default: + _mm_pause(); + } } static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) { - if (wait_method == WAIT_YIELD) - dsa_wait_yield(comp); - else if (wait_method == WAIT_UMWAIT) - dsa_wait_umwait(comp); - else - dsa_wait_busy_poll(comp); + switch (wait_method) { + case WAIT_YIELD: + dsa_wait_yield(comp); + break; + case WAIT_UMWAIT: + dsa_wait_umwait(comp); + break; + case WAIT_TPAUSE: + dsa_wait_tpause(comp); + break; + case WAIT_BUSYPOLL: + dsa_wait_busy_poll(comp); + break; + case WAIT_SLEEP: + // This method is not typically used in high-performance scenarios but + // gives a good demonstration of how much CPU time can be reduced + do { + usleep(DTO_DEFAULT_USLEEP); // Sleep for 20 microseconds + } while (*comp == 0); + default: + dsa_wait_busy_poll(comp); + } } + /* A simple auto-tuning heuristic. * Goal of the Heuristic: * - CPU and DSA should complete their fraction of the job roughly simultaneously. 
@@ -418,8 +523,9 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) uint64_t local_num_waits = 0; if ((++num_descs & DESCS_PER_RUN) != DESCS_PER_RUN) { - while (*comp == 0) + while (*comp == 0) { __dsa_wait(comp); + } return; } @@ -454,13 +560,42 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) } } +static __always_inline void dsa_wait_and_adjust_v2(const volatile uint8_t *comp) +{ + + if (++tl_num_descs == tl_next_sample) { + uint64_t local_num_waits = 0; + while (*comp == 0) { + __dsa_wait(comp); + local_num_waits++; + } + int64_t error = local_num_waits - AUTO_TUNE_V2_TARGET; + tl_integral += error; + uint64_t new_frac = tl_cpu_size_fraction - KP * error + KI * tl_integral; + + // Clamp within valid range + tl_cpu_size_fraction = MAX(1, MIN(MAX_CPU_SIZE_FRACTION, new_frac)); + tl_next_sample += rand() % SAMPLE_INTERVAL*2 + 1; + } else { + while (*comp == 0) { + __dsa_wait(comp); + } + } +} + static __always_inline int dsa_wait(struct dto_wq *wq, struct dsa_hw_desc *hw, volatile uint8_t *comp) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); - else - dsa_wait_no_adjust(comp); + switch (auto_adjust_knobs) { + case AUTO_ADJUST_KNOBS: + dsa_wait_and_adjust(comp); + break; + case AUTO_ADJUST_KNOBS_V2: + dsa_wait_and_adjust_v2(comp); + break; + default: + dsa_wait_no_adjust(comp); + } if (likely(*comp == DSA_COMP_SUCCESS)) { thr_bytes_completed += hw->xfer_size; @@ -502,21 +637,32 @@ static __always_inline int dsa_execute(struct dto_wq *wq, //LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode); __builtin_ia32_sfence(); - if (wq->wq_mmapped) + switch (wq->wq_mmapped) { + case WQ_MMAPPED: ret = enqcmd(hw, wq->wq_portal); - - else { + break; + default: ret = write(wq->wq_fd, hw, sizeof(*hw)); - if (ret != sizeof(*hw)) + if (ret != sizeof(*hw)) { return FAIL_OTHERS; - else + } + else { ret = 0; - } + } + break; + } + if (!ret) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); - 
else - dsa_wait_no_adjust(comp); + switch (auto_adjust_knobs) { + case AUTO_ADJUST_KNOBS: + dsa_wait_and_adjust(comp); + break; + case AUTO_ADJUST_KNOBS_V2: + dsa_wait_and_adjust_v2(comp); + break; + default: + dsa_wait_no_adjust(comp); + } if (*comp == DSA_COMP_SUCCESS) { thr_bytes_completed += hw->xfer_size; @@ -557,6 +703,7 @@ static void print_stats(void) clock_gettime(CLOCK_BOOTTIME, &dto_end_time); LOG_TRACE("DTO Run Time: %ld ms\n", TS_NS(dto_start_time, dto_end_time)/1000000); + LOG_TRACE("DTO CPU Fraction: %.2f \n", cpu_size_fraction/100.0); // display stats for (int t = 0; t < 2; ++t) { @@ -1121,7 +1268,38 @@ static int dsa_init(void) max_avg_waits = MAX_AVG_POLL_WAITS; } else LOG_ERROR("umwait not supported. Falling back to default wait method\n"); - } + } else if (!strncmp(env_str, wait_names[WAIT_TPAUSE], strlen(wait_names[WAIT_TPAUSE]))) { + if (umwait_support) { + wait_method = WAIT_TPAUSE; + } else { + LOG_ERROR("tpause not supported. Falling back to busypoll\n"); + wait_method = WAIT_BUSYPOLL; + } + } else if (!strncmp(env_str, wait_names[WAIT_SLEEP], strlen(wait_names[WAIT_SLEEP]))) { + double local_cpu_size_fraction = 0; + env_str = getenv("DTO_CPU_SIZE_FRACTION"); + + if (env_str != NULL) { + errno = 0; + local_cpu_size_fraction = strtod(env_str, NULL); + } + + uint64_t local_auto_adjust_knobs = 0; + env_str = getenv("DTO_AUTO_ADJUST_KNOBS"); + + if (env_str != NULL) { + errno = 0; + local_auto_adjust_knobs = strtoul(env_str, NULL, 10); + if (errno) + local_auto_adjust_knobs = 1; + } + if (local_cpu_size_fraction < 0.001 && local_auto_adjust_knobs == 0) { + wait_method = WAIT_SLEEP; + } else { + LOG_ERROR("sleep not supported for partial offloading (fraction > 0 and/or autotuning is selected\n"); + wait_method = WAIT_BUSYPOLL; + } + } } env_str = getenv("DTO_WQ_LIST"); @@ -1303,6 +1481,7 @@ static int init_dto(void) } /* Use only 2 digits after decimal point */ cpu_size_fraction = cpu_size_fraction_float * 100; + tl_cpu_size_fraction = 
cpu_size_fraction; } env_str = getenv("DTO_AUTO_ADJUST_KNOBS"); @@ -1312,10 +1491,19 @@ static int init_dto(void) auto_adjust_knobs = strtoul(env_str, NULL, 10); if (errno) auto_adjust_knobs = 1; - - auto_adjust_knobs = !!auto_adjust_knobs; + if (auto_adjust_knobs == AUTO_ADJUST_KNOBS_V2) { + tl_next_sample = rand() % (SAMPLE_INTERVAL*2) + 1; + } } + // Only use c02 if we are offloading a significant chunk to + // DSA so we amortize the exit latency of C02 state + if (cpu_size_fraction <= 20 && auto_adjust_knobs == 0) { + dto_use_c02 = true; + } else { + dto_use_c02 = false; + } + if (numa_available() != -1) { env_str = getenv("DTO_IS_NUMA_AWARE"); if (env_str != NULL) { @@ -1340,12 +1528,29 @@ static int init_dto(void) LOG_ERROR("Didn't find any usable DSAs. Falling back to using CPUs.\n"); use_std_lib_calls = 1; } + unsigned int num, den, freq; + unsigned int unused; + unsigned long long tmp; + __get_cpuid( 0x15, &den, &num, &freq, &unused ); + freq /= 1000; + LOG_TRACE( "Core Freq = %u kHz\n", freq ); + LOG_TRACE( "TSC Mult = %u\n", num ); + LOG_TRACE( "TSC Den = %u\n", den ); + freq *= num; + freq /= den; + LOG_TRACE( "CPU freq = %u kHz\n", freq ); + LOG_TRACE( "Requested wait: %llu nsec\n", tpause_wait_time ); + tmp = tpause_wait_time; + tmp *= freq; + tpause_wait_time = tmp / NSEC_PER_MSEC; + LOG_TRACE( "Requested wait duration: %llu cycles\n", tpause_wait_time ); + // display configuration LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, " - "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d\n", + "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d, dto_use_c02: %d\n", log_level, collect_stats, use_std_lib_calls, dsa_min_size, - cpu_size_fraction_float, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dto_dsa_cc); + cpu_size_fraction_float, wait_names[wait_method], auto_adjust_knobs, 
numa_aware_names[is_numa_aware], dto_dsa_cc, dto_use_c02); for (int i = 0; i < num_wqs; i++) LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i, wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap); @@ -1419,7 +1624,9 @@ static void dto_memset(void *s, int c, size_t n, int *result) thr_desc.pattern = memset_pattern; /* cpu_size_fraction guaranteed to be >= 0 and < 100 */ - cpu_size = n * cpu_size_fraction / 100; + uint64_t cpu_frac = auto_adjust_knobs == AUTO_ADJUST_KNOBS_V2 ? + tl_cpu_size_fraction : cpu_size_fraction; + cpu_size = n * cpu_frac / 100; dsa_size = n - cpu_size; thr_bytes_completed = 0; @@ -1437,7 +1644,7 @@ static void dto_memset(void *s, int c, size_t n, int *result) } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_frac; // the cpu_size_fraction might be changed by the auto tune algorithm threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); do { @@ -1484,6 +1691,149 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n) return true; } +__attribute__((visibility("default"))) uint64_t dto_crc(const void *src, size_t n, callback_t cb, void* args) { + //submit dsa work if successful, call the callback + if (use_std_lib_calls || n < dsa_min_size) { + if (cb) { + cb(args); + } + return crc32c_hw(src, n); + } + int result = 0; + struct dto_wq *wq = get_wq(src); + size_t dsa_size = n; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + thr_desc.opcode = DSA_OPCODE_CRCGEN; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF; + if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_bytes_completed = 0; + thr_desc.src_addr = (uint64_t) src; + thr_desc.dst_addr = 0; 
// dst_addr is not used for CRC generation
+	thr_desc.xfer_size = (uint32_t) dsa_size;
+	thr_desc.crc_seed = 0; // default seed value
+	thr_desc.rsvd = 0;
+	thr_comp.status = 0;
+	result = dsa_submit(wq, &thr_desc);
+	if (result == SUCCESS) {
+		if (cb) {
+			cb(args);
+		}
+		result = dsa_wait(wq, &thr_desc, &thr_comp.status);
+	}
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result);
+#endif
+	if (thr_bytes_completed < n) {
+		return 0;
+	}
+	return thr_comp.crc_val;
+}
+
+__attribute__((visibility("default"))) uint64_t dto_memcpy_crc_async(void *dest, const void *src, size_t n, callback_t cb, void* args) {
+	// submit DSA work; if submission is successful, call the callback
+	if (use_std_lib_calls || n < dsa_min_size) {
+		if (cb) {
+			cb(args);
+		}
+		orig_memcpy(dest, src, n);
+		return crc32c_hw(src, n);
+	}
+	int result = 0;
+	struct dto_wq *wq = get_wq(dest);
+	size_t dsa_size = n;
+#ifdef DTO_STATS_SUPPORT
+	struct timespec st, et;
+	size_t orig_n = n;
+	DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+
+	thr_desc.opcode = DSA_OPCODE_COPY_CRC;
+	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF;
+	if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
+		thr_desc.flags |= IDXD_OP_FLAG_CC;
+	thr_desc.completion_addr = (uint64_t)&thr_comp;
+
+	thr_bytes_completed = 0;
+	thr_desc.src_addr = (uint64_t) src;
+	thr_desc.dst_addr = (uint64_t) dest;
+	thr_desc.xfer_size = (uint32_t) dsa_size;
+	thr_desc.crc_seed = 0; // default seed value
+	thr_desc.rsvd = 0;
+	thr_comp.status = 0;
+	result = dsa_submit(wq, &thr_desc);
+	if (result == SUCCESS) {
+		if (cb) {
+			cb(args);
+		}
+		result = dsa_wait(wq, &thr_desc, &thr_comp.status);
+	}
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result);
+#endif
+	if (thr_bytes_completed < n) {
+		return 0;
+	}
+	return thr_comp.crc_val;
+}
+
+__attribute__((visibility("default"))) void 
dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args) { + //submit dsa work if successful, call the callback + int result = 0; + struct dto_wq *wq = get_wq(dest); + size_t dsa_size = n; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + thr_desc.opcode = DSA_OPCODE_MEMMOVE; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_bytes_completed = 0; + + thr_desc.src_addr = (uint64_t) src; + thr_desc.dst_addr = (uint64_t) dest; + thr_desc.xfer_size = (uint32_t) dsa_size; + thr_comp.status = 0; + result = dsa_submit(wq, &thr_desc); + if (result == SUCCESS) { + cb(args); + result = dsa_wait(wq, &thr_desc, &thr_comp.status); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY_ASYNC, n, thr_bytes_completed, result); +#endif + if (thr_bytes_completed != n) { + /* fallback to std call if job is only partially completed */ + n -= thr_bytes_completed; + if (thr_comp.result == 0) { + dest = (void *)((uint64_t)dest + thr_bytes_completed); + src = (const void *)((uint64_t)src + thr_bytes_completed); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + orig_memcpy(dest, src, n); + +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n); +#endif + } +} + static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result) { struct dto_wq *wq = get_wq(dest); @@ -1495,11 +1845,14 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy thr_desc.flags |= IDXD_OP_FLAG_CC; thr_desc.completion_addr = (uint64_t)&thr_comp; + uint64_t cpu_frac = auto_adjust_knobs == AUTO_ADJUST_KNOBS_V2 ? 
+ tl_cpu_size_fraction : cpu_size_fraction; + /* cpu_size_fraction guaranteed to be >= 0 and < 1 */ if (!is_memcpy && is_overlapping_buffers(dest, src, n)) cpu_size = 0; else - cpu_size = n * cpu_size_fraction / 100; + cpu_size = n * cpu_frac / 100; dsa_size = n - cpu_size; @@ -1523,7 +1876,7 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_frac; // the cpu_size_fraction might be changed by the auto tune algorithm threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); do { size_t len; diff --git a/dto.h b/dto.h new file mode 100644 index 0000000..4ca65a9 --- /dev/null +++ b/dto.h @@ -0,0 +1,20 @@ + +#ifndef DTO_H +#define DTO_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(*callback_t)(void*); + +void dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args); +uint64_t dto_memcpy_crc_async(void *dest, const void *src, size_t n, callback_t cb, void* args); +uint64_t dto_crc(const void *src, size_t n, callback_t cb, void* args); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/dto_cachelib_env.sh b/dto_cachelib_env.sh new file mode 100755 index 0000000..0fcf16f --- /dev/null +++ b/dto_cachelib_env.sh @@ -0,0 +1,9 @@ +#!/bin/bash +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib +export DTO_USESTDC_CALLS=0 +export DTO_COLLECT_STATS=1 +export DTO_WAIT_METHOD=umwait +export DTO_MIN_BYTES=32768 +export DTO_CPU_SIZE_FRACTION=0.0 +export DTO_AUTO_ADJUST_KNOBS=0 +export DTO_DSA_CC=0 diff --git a/dto_env.sh b/dto_env.sh new file mode 100755 index 0000000..8f34761 --- /dev/null +++ b/dto_env.sh @@ -0,0 +1,7 @@ +#!/bin/bash +export DTO_USESTDC_CALLS=0 +export DTO_COLLECT_STATS=1 +export DTO_WAIT_METHOD=umwait +export DTO_MIN_BYTES=32768 +export DTO_CPU_SIZE_FRACTION=0.0 
+export DTO_AUTO_ADJUST_KNOBS=0