diff --git a/Makefile b/Makefile
index 95af413..a79ef47 100644
--- a/Makefile
+++ b/Makefile
@@ -7,15 +7,16 @@ all: libdto dto-test-wodto
 DML_LIB_CXX=-D_GNU_SOURCE
 
 libdto: dto.c
-	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg
+	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -msse4.2 -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg
 
 libdto_nostats: dto.c
-	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg
+	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -msse4.2 -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg
 
 install:
 	cp libdto.so.1.0 /usr/lib64/
 	ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so.1
 	ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so
+	cp dto.h /usr/include/
 
 install-local:
 	ln -sf ./libdto.so.1.0 ./libdto.so.1
diff --git a/README.md b/README.md
index 1a746e2..870455d 100644
--- a/README.md
+++ b/README.md
@@ -41,6 +41,65 @@ can be enabled or disabled using an environment variable DTO_AUTO_ADJUST_KNOBS.
 DTO can also be used to learn certain application characteristics by building
 a histogram of various API types and sizes. The histogram can be built using
 an environment variable DTO_COLLECT_STATS.
 
+## DTO API
+
+DTO provides drop-in replacements for standard memory operations. Each function
+is available in three forms:
+
+- `*_default` uses library defaults.
+- `*_cfg` accepts a `struct dto_call_cfg` to override per-call behavior.
+- The unsuffixed name derives its configuration from a bitwise OR of `DTO_API_*`
+  flags.
+
+DTO also supports CRC32C offload via the `dto_memcpy_crc` and `dto_crc`
+functions: `dto_memcpy_crc` offloads both the memcpy and the CRC32C computation
+to DSA, while `dto_crc` offloads only the CRC32C computation.
+
+Available entry points include:
+
+- `dto_memcpy_default`, `dto_memcpy_cfg`, `dto_memcpy`
+- `dto_memmove_default`, `dto_memmove_cfg`, `dto_memmove`
+- `dto_memset_default`, `dto_memset_cfg`, `dto_memset`
+- `dto_memcmp_default`, `dto_memcmp_cfg`, `dto_memcmp`
+- `dto_memcpy_crc_default`, `dto_memcpy_crc_cfg`, `dto_memcpy_crc`
+- `dto_crc_default`, `dto_crc_cfg`, `dto_crc`
+
+### Sample usage
+
+```c
+#include <stdint.h>
+#include "dto.h"
+
+int main(void)
+{
+	char src[64] = "example";
+	char dst[64];
+
+	/* Use defaults */
+	dto_memcpy_default(dst, src, sizeof(src));
+
+	/* Per-call configuration */
+	struct dto_call_cfg cfg = {
+		.auto_adjust = 0,
+		.cache_control = 1,
+		.wait_method = WAIT_BUSYPOLL,
+		.numa_mode = NA_BUFFER_CENTRIC,
+		.overlapping_action = OVERLAPPING_CPU,
+	};
+	int diff = dto_memcmp_cfg(dst, src, sizeof(src), &cfg);
+
+	/* Flags-based configuration */
+	dto_memset(dst, 0, sizeof(dst), DTO_API_WAIT_YIELD);
+
+	/* Copy while computing CRC */
+	uint32_t c1 = dto_memcpy_crc_default(dst, src, sizeof(src));
+
+	/* Compute CRC with flags-based configuration */
+	uint32_t c2 = dto_crc(src, sizeof(src), DTO_API_WAIT_YIELD, NULL, NULL);
+
+	(void)c1;
+	(void)c2;
+	return diff;
+}
+```
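+
+### Checking offload results
+
+The CRC entry points return 0 when the DSA operation fails or completes only
+partially, and 0 is also a legitimate CRC32C value, so treat a zero return as
+"possibly failed". CRC offload additionally expects the work queue to be
+configured with block-on-fault (Arg-5 of `accelConfig.sh`); without it, DTO
+falls back to a CPU-computed CRC32C. The sketch below is illustrative only;
+the `checked_copy_crc` helper is hypothetical, not part of the library:
+
+```c
+#include <stdint.h>
+#include "dto.h"
+
+/* Copy a buffer and return its CRC32C; if the combined DSA operation
+ * reports failure (returns 0), redo the copy and the CRC separately. */
+static uint32_t checked_copy_crc(void *dst, const void *src, size_t n)
+{
+	uint32_t crc = dto_memcpy_crc_default(dst, src, n);
+
+	if (crc == 0) {
+		/* Either the data really hashes to 0 or the offload failed. */
+		dto_memcpy_default(dst, src, n);
+		crc = dto_crc_default(dst, n);
+	}
+	return crc;
+}
+```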
 
 ```bash
 dto.c: DSA Transparent Offload shared library
 dto-test.c: Sample multi-threaded test application
diff --git a/accelConfig.sh b/accelConfig.sh
index dd8b4d5..d134306 100755
--- a/accelConfig.sh
+++ b/accelConfig.sh
@@ -4,6 +4,7 @@
 echo "OPTIONAL Arg-1: DSA device id. Default: 0"
 echo "OPTIONAL Arg-2: Enable/Disable DSA device. Default: yes"
 echo "OPTIONAL Arg-3: SHARED WQ id. Default: 1"
 echo "OPTIONAL Arg-4: ENGINE count. Default: 4"
+echo "OPTIONAL Arg-5: Block on fault. Default: no, but this is needed for CRC32!"
 
-if [ "$#" -ge 5 ]; then
-	echo "ERROR: Incorrect argument count. Expected arg count <= 4"
+if [ "$#" -ge 6 ]; then
+	echo "ERROR: Incorrect argument count. Expected arg count <= 5"
@@ -14,6 +15,7 @@
 DEVID=${1:-0}
 ENABLE=${2:-yes}
 SWQID=${3:-1}
 NENGS=${4:-4}
+BLOCKONFAULT=${5:-no}
 
 DEV=dsa${DEVID}
 SWQ=${DEV}/wq${DEVID}.${SWQID}
@@ -41,7 +43,11 @@
 accel-config config-wq ${SWQ} --priority=1
 accel-config config-wq ${SWQ} --wq-size=128
 accel-config config-wq ${SWQ} --max-batch-size=1024
 accel-config config-wq ${SWQ} --max-transfer-size=2147483648
-accel-config config-wq ${SWQ} --block-on-fault=0
+if [ "${BLOCKONFAULT}" == "yes" ]; then
+	accel-config config-wq ${SWQ} --block-on-fault=1
+else
+	accel-config config-wq ${SWQ} --block-on-fault=0
+fi
 accel-config config-wq ${SWQ} --type=user
 accel-config config-wq ${SWQ} --name="dsa-test"
 accel-config config-wq ${SWQ} --mode=shared
diff --git a/dto.c b/dto.c
index b023fad..82650ab 100644
--- a/dto.c
+++ b/dto.c
@@ -23,6 +23,8 @@
 #include
 #include
 #include
+#include <nmmintrin.h>
+#include "dto.h"
 
 #define likely(x) __builtin_expect((x), 1)
 #define unlikely(x) __builtin_expect((x), 0)
@@ -47,10 +49,10 @@
 #define MAX_WQS 32
 #define MAX_NUMA_NODES 32
 #define DTO_DEFAULT_MIN_SIZE 65536
+#define DTO_DEFAULT_CPU_FRACTION 0
 
 #define DTO_INITIALIZED 0
 #define DTO_INITIALIZING 1
-
 #define NSEC_PER_SEC (1000000000)
 #define MSEC_PER_SEC (1000)
 #define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC)
@@ -72,6 +74,7 @@ struct dto_wq {
 	uint64_t dsa_gencap;
 	int wq_size;
 	uint32_t max_transfer_size;
+	int block_on_fault;
 	int wq_fd;
 	void *wq_portal;
 	bool wq_mmapped;
@@ -83,26 +86,6 @@ struct dto_device {
 	atomic_uchar next_wq;
 };
 
-enum wait_options {
-	WAIT_BUSYPOLL = 0,
-	WAIT_UMWAIT,
-	WAIT_YIELD,
-	WAIT_TPAUSE
-};
-
-enum numa_aware {
-	NA_NONE = 0,
-	NA_BUFFER_CENTRIC,
-	NA_CPU_CENTRIC,
-	NA_LAST_ENTRY
-};
-
-enum overlapping_memmove_actions {
-	OVERLAPPING_CPU = 0,
-	OVERLAPPING_DSA,
-	OVERLAPPING_LAST_ENTRY
-};
-
 static const char * const numa_aware_names[] = {
 	[NA_NONE] = "none",
 	[NA_BUFFER_CENTRIC] = "buffer-centric",
@@ -117,17 +100,22 @@
 static atomic_uchar next_wq;
 static atomic_uchar dto_initialized;
 static atomic_uchar dto_initializing;
 static uint8_t use_std_lib_calls;
-static enum numa_aware is_numa_aware;
-static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
-static int wait_method = WAIT_BUSYPOLL;
-static size_t cpu_size_fraction; // range of values is 0 to 99
+static struct dto_call_cfg dto_default_cfg = {
+	.wait_method = WAIT_BUSYPOLL,
+	.auto_adjust = 1,
+	.cache_control = 1,
+	.numa_mode = NA_NONE,
+	.overlapping_action = OVERLAPPING_CPU,
+};
+static uint64_t wait_time = 100000; // 100,000 ns (100 us)
+
+static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE; // minimum size to offload to DSA
+static size_t cpu_size_fraction = DTO_DEFAULT_CPU_FRACTION; // percentage of work done by the CPU (0 to 99)
 static uint8_t dto_dsa_memcpy = 1;
 static uint8_t dto_dsa_memmove = 1;
 static uint8_t dto_dsa_memset = 1;
 static uint8_t dto_dsa_memcmp = 1;
-
-static uint8_t dto_dsa_cc = 1;
 static bool dto_use_c02 = true;	//C02 state is default
 				//C02 avg exit latency is ~500 ns
				//and C01 is about ~240 ns on SPR
@@ -143,8 +131,6 @@
 static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS;
 
 static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;
 
-static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU;
-
 static uint8_t fork_handler_registered;
 
 enum memop {
@@ -217,7 +203,7 @@ static struct timespec dto_start_time;
 	} while (0)	\
 
-#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r) \
+#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, 
overlap, tbc, r, cfg) \ do { \ if (unlikely(cs)) { \ uint64_t t; \ @@ -225,20 +211,20 @@ static struct timespec dto_start_time; t = (((et.tv_sec*1000000000) + et.tv_nsec) - \ ((st.tv_sec*1000000000) + st.tv_nsec)); \ if (unlikely(r != SUCCESS)) \ - update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); \ + update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r, cfg); \ else \ - update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); \ + update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0, cfg); \ } \ } while (0) \ -#define DTO_COLLECT_STATS_CPU_END(cs, st, et, op, n, orig_n) \ +#define DTO_COLLECT_STATS_CPU_END(cs, st, et, op, n, orig_n, cfg) \ do { \ if (unlikely(cs)) { \ uint64_t t; \ clock_gettime(CLOCK_BOOTTIME, &et); \ t = (((et.tv_sec*1000000000) + et.tv_nsec) - \ ((st.tv_sec*1000000000) + st.tv_nsec)); \ - update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \ + update_stats(op, orig_n, false, n, t, STDC_CALL, 0, cfg); \ } \ } while (0) \ @@ -288,10 +274,32 @@ static atomic_ullong adjust_num_waits; /* default waits are for yield because yield is default waiting method */ static double min_avg_waits = MIN_AVG_YIELD_WAITS; static double max_avg_waits = MAX_AVG_YIELD_WAITS; -static uint8_t auto_adjust_knobs = 1; extern char *__progname; +static uint32_t crc32c_hw(const uint8_t *data, size_t len) +{ + uint32_t crc = 0; + + while (len >= sizeof(uint64_t)) { + crc = _mm_crc32_u64(crc, *(const uint64_t *)data); + data += sizeof(uint64_t); + len -= sizeof(uint64_t); + } + + while (len >= sizeof(uint32_t)) { + crc = _mm_crc32_u32(crc, *(const uint32_t *)data); + data += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + + while (len--) { + crc = _mm_crc32_u8(crc, *data++); + } + + return crc; +} + static void dto_log(int req_log_level, const char *fmt, ...) 
{ char buf[512]; @@ -389,9 +397,9 @@ static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) } } -static __always_inline void __dsa_wait(const volatile uint8_t *comp) +static __always_inline void __dsa_wait(const volatile uint8_t *comp, const struct dto_call_cfg *cfg) { - switch(wait_method) { + switch(cfg->wait_method) { case WAIT_YIELD: sched_yield(); break; @@ -406,9 +414,9 @@ static __always_inline void __dsa_wait(const volatile uint8_t *comp) } } -static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) +static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp, const struct dto_call_cfg *cfg) { - switch (wait_method) { + switch (cfg->wait_method) { case WAIT_YIELD: dsa_wait_yield(comp); break; @@ -444,13 +452,13 @@ static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) * - If cpu_size_fraction not too low, decrease it by CSF_STEP_DECREMENT * - else if dsa_min_size not too low, decrease it by DMS_STEP_DECREMENT */ -static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) +static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp, const struct dto_call_cfg *cfg) { uint64_t local_num_waits = 0; if ((++num_descs & DESCS_PER_RUN) != DESCS_PER_RUN) { while (*comp == 0) { - __dsa_wait(comp); + __dsa_wait(comp, cfg); } return; @@ -458,7 +466,7 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) /* Run the heuristics as well as wait for DSA */ while (*comp == 0) { - __dsa_wait(comp); + __dsa_wait(comp, cfg); local_num_waits++; } @@ -494,12 +502,13 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) } static __always_inline int dsa_wait(struct dto_wq *wq, - struct dsa_hw_desc *hw, volatile uint8_t *comp) + struct dsa_hw_desc *hw, volatile uint8_t *comp, + const struct dto_call_cfg *cfg) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); + if (cfg->auto_adjust) + dsa_wait_and_adjust(comp, cfg); else - dsa_wait_no_adjust(comp); + dsa_wait_no_adjust(comp, cfg); if (likely(*comp == DSA_COMP_SUCCESS)) { thr_bytes_completed += hw->xfer_size; @@ -534,7 +543,8 @@ static __always_inline int dsa_submit(struct dto_wq *wq, } static __always_inline int dsa_execute(struct dto_wq *wq, - struct dsa_hw_desc *hw, volatile uint8_t *comp) + struct dsa_hw_desc *hw, volatile uint8_t *comp, + const struct dto_call_cfg *cfg) { int ret; *comp = 0; @@ -552,7 +562,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq, ret = 0; } if (!ret) { - dsa_wait_no_adjust(comp); + dsa_wait_no_adjust(comp, cfg); if (*comp == DSA_COMP_SUCCESS) { thr_bytes_completed += hw->xfer_size; @@ -569,10 +579,10 @@ static __always_inline int dsa_execute(struct dto_wq *wq, #ifdef DTO_STATS_SUPPORT static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed, - uint64_t elapsed_ns, int group, int error_code) + uint64_t elapsed_ns, int group, int error_code, const struct dto_call_cfg *cfg) { // dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. 
This will be captured by a second call
-	if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) {
+	if (op == MEMMOVE && overlapping && cfg->overlapping_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) {
 		return;
 	}
 
@@ -764,10 +774,10 @@ static void correct_devices_list() {
 	}
 }
 
-static __always_inline int get_numa_node(void* buf) {
+static __always_inline int get_numa_node(void* buf, const struct dto_call_cfg *cfg) {
 	int numa_node = -1;
 
-	switch (is_numa_aware) {
+	switch (cfg->numa_mode) {
 	case NA_BUFFER_CENTRIC: {
 		if (buf != NULL) {
 			int status[1] = {-1};
@@ -896,6 +906,7 @@ static int dsa_init_from_wq_list(char *wq_list)
 			close(dir_fd);
 			goto fail_wq;
 		}
+		wqs[num_wqs].block_on_fault = dto_get_param_ullong(dir_fd, "block_on_fault", &rc);
 
 		dto_get_param_string(dir_fd, "mode", wq_mode);
@@ -947,7 +958,7 @@
 			close(wqs[num_wqs].wq_fd);
 		}
 
-		if (is_numa_aware) {
+		if (dto_default_cfg.numa_mode != NA_NONE) {
 			struct dto_device* dev = get_dto_device(dev_numa_node);
 
 			if (dev != NULL && dev->num_wqs < MAX_WQS) {
@@ -967,7 +978,7 @@
 		goto fail;
 	}
 
-	if (is_numa_aware) {
+	if (dto_default_cfg.numa_mode != NA_NONE) {
 		correct_devices_list();
 	}
 
@@ -1024,7 +1035,7 @@ static int dsa_init_from_accfg(void)
 		struct dto_device* dev = NULL;
 
-		if (is_numa_aware) {
+		if (dto_default_cfg.numa_mode != NA_NONE) {
 			const int dev_numa_node = accfg_device_get_numa_node(device);
 			dev = get_dto_device(dev_numa_node);
 		}
@@ -1051,13 +1062,14 @@
 		wqs[num_wqs].wq_size = accfg_wq_get_size(wq);
 		wqs[num_wqs].max_transfer_size = accfg_wq_get_max_transfer_size(wq);
+		wqs[num_wqs].block_on_fault = accfg_wq_get_block_on_fault(wq);
 		wqs[num_wqs].acc_wq = wq;
 		wqs[num_wqs].dsa_gencap = accfg_device_get_gen_cap(device);
 
 		used_devids[num_wqs] = accfg_device_get_id(device);
 
-		if (is_numa_aware &&
+		if (dto_default_cfg.numa_mode != NA_NONE &&
 			dev != NULL && dev->num_wqs < MAX_WQS) {
 			dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
@@ -1113,7 +1125,7 @@
 		}
 	}
 
-	if (is_numa_aware) {
+	if (dto_default_cfg.numa_mode != NA_NONE) {
 		correct_devices_list();
 	}
 
@@ -1151,12 +1163,12 @@ static int dsa_init(void)
 	env_str = getenv("DTO_WAIT_METHOD");
 	if (env_str != NULL) {
 		if (!strncmp(env_str, wait_names[WAIT_BUSYPOLL], strlen(wait_names[WAIT_BUSYPOLL]))) {
-			wait_method = WAIT_BUSYPOLL;
+			dto_default_cfg.wait_method = WAIT_BUSYPOLL;
 			min_avg_waits = MIN_AVG_POLL_WAITS;
 			max_avg_waits = MAX_AVG_POLL_WAITS;
 		} else if (!strncmp(env_str, wait_names[WAIT_UMWAIT], strlen(wait_names[WAIT_UMWAIT]))) {
 			if (waitpkg_support) {
-				wait_method = WAIT_UMWAIT;
+				dto_default_cfg.wait_method = WAIT_UMWAIT;
 				/* Use the same waits as busypoll for now */
 				min_avg_waits = MIN_AVG_POLL_WAITS;
 				max_avg_waits = MAX_AVG_POLL_WAITS;
 			} else
 				LOG_ERROR("umwait not supported. Falling back to default wait method\n");
 		} else if (!strncmp(env_str, wait_names[WAIT_TPAUSE], strlen(wait_names[WAIT_TPAUSE]))) {
 			if (waitpkg_support) {
-				wait_method = WAIT_TPAUSE;
+				dto_default_cfg.wait_method = WAIT_TPAUSE;
 			} else {
 				LOG_ERROR("tpause not supported. 
Falling back to busypoll\n"); - wait_method = WAIT_BUSYPOLL; + dto_default_cfg.wait_method = WAIT_BUSYPOLL; } } } @@ -1251,11 +1263,11 @@ static int init_dto(void) env_str = getenv("DTO_DSA_CC"); if (env_str != NULL) { errno = 0; - dto_dsa_cc = strtoul(env_str, NULL, 10); + dto_default_cfg.cache_control = strtoul(env_str, NULL, 10); if (errno) - dto_dsa_cc = 0; + dto_default_cfg.cache_control = 0; - dto_dsa_cc = !!dto_dsa_cc; + dto_default_cfg.cache_control = !!dto_default_cfg.cache_control; } env_str = getenv("DTO_DSA_MEMMOVE"); @@ -1291,9 +1303,9 @@ static int init_dto(void) env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION"); if (env_str != NULL) { errno = 0; - dto_overlapping_memmove_action = strtoul(env_str, NULL, 10); + dto_default_cfg.overlapping_action = strtoul(env_str, NULL, 10); if (errno) - dto_overlapping_memmove_action = OVERLAPPING_CPU; + dto_default_cfg.overlapping_action = OVERLAPPING_CPU; } #ifdef DTO_STATS_SUPPORT @@ -1365,20 +1377,20 @@ static int init_dto(void) if (env_str != NULL) { errno = 0; - auto_adjust_knobs = strtoul(env_str, NULL, 10); + dto_default_cfg.auto_adjust = strtoul(env_str, NULL, 10); if (errno) - auto_adjust_knobs = 1; + dto_default_cfg.auto_adjust = 1; - auto_adjust_knobs = !!auto_adjust_knobs; + dto_default_cfg.auto_adjust = !!dto_default_cfg.auto_adjust; } if (numa_available() != -1) { env_str = getenv("DTO_IS_NUMA_AWARE"); if (env_str != NULL) { errno = 0; - is_numa_aware = strtoul(env_str, NULL, 10); - if (errno || is_numa_aware >= NA_LAST_ENTRY) { - is_numa_aware = NA_NONE; + dto_default_cfg.numa_mode = strtoul(env_str, NULL, 10); + if (errno || dto_default_cfg.numa_mode >= NA_LAST_ENTRY) { + dto_default_cfg.numa_mode = NA_NONE; } } } @@ -1398,7 +1410,7 @@ static int init_dto(void) } // calculate the wait time for TPAUSE - if (wait_method == WAIT_TPAUSE) { + if (dto_default_cfg.wait_method == WAIT_TPAUSE) { unsigned int num, den, freq; unsigned int empty; unsigned long long tmp; @@ -1420,9 +1432,9 @@ static int init_dto(void) // display configuration LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, " - "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d\n", + "init cpu_size_fraction: %.2f, wait_method: %s, auto_adjust: %d, numa_awareness: %s, dto_dsa_cc: %d\n", log_level, collect_stats, use_std_lib_calls, dsa_min_size, - cpu_size_fraction_float, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dto_dsa_cc); + cpu_size_fraction_float, wait_names[dto_default_cfg.wait_method], dto_default_cfg.auto_adjust, numa_aware_names[dto_default_cfg.numa_mode], dto_default_cfg.cache_control); for (int i = 0; i < num_wqs; i++) LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i, wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap); @@ -1454,15 +1466,14 @@ static void cleanup_dto(void) cleanup_devices(); } -static __always_inline struct dto_wq *get_wq(void* buf) +static __always_inline struct dto_wq *get_wq(void* buf, const struct dto_call_cfg *cfg) { struct dto_wq* wq = NULL; - if (is_numa_aware) { + if (cfg->numa_mode != NA_NONE) { int status[1] = {-1}; - // get the numa node for the target DSA device - const int numa_node = get_numa_node(buf); + const int numa_node = get_numa_node(buf, cfg); if (numa_node >= 0 && numa_node < MAX_NUMA_NODES) { struct dto_device* dev = devices[numa_node]; if (dev != NULL && @@ -1479,18 +1490,18 @@ static __always_inline struct dto_wq *get_wq(void* buf) return wq; } -static void dto_memset(void *s, int c, 
size_t n, int *result) +static void dto_memset_dsa(void *s, int c, size_t n, int *result, const struct dto_call_cfg *cfg) { uint64_t memset_pattern; size_t cpu_size, dsa_size; - struct dto_wq *wq = get_wq(s); + struct dto_wq *wq = get_wq(s, cfg); for (int i = 0; i < 8; ++i) ((uint8_t *) &memset_pattern)[i] = (uint8_t) c; thr_desc.opcode = DSA_OPCODE_MEMFILL; thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; - if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY)) thr_desc.flags |= IDXD_OP_FLAG_CC; thr_desc.completion_addr = (uint64_t)&thr_comp; thr_desc.pattern = memset_pattern; @@ -1510,11 +1521,11 @@ static void dto_memset(void *s, int c, size_t n, int *result) orig_memset(s, c, cpu_size); thr_bytes_completed = cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); do { @@ -1536,7 +1547,7 @@ static void dto_memset(void *s, int c, size_t n, int *result) orig_memset(s1, c, cpu_size); thr_bytes_completed += cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } if (*result != SUCCESS) @@ -1561,7 +1572,93 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n) return true; } -static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result) +static uint32_t __dto_crc(const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void *args) +{ + struct dto_wq *wq = get_wq((void *)src, cfg); + if (wq->block_on_fault == 0) { + LOG_ERROR("DSA WQ %s does not support block on fault. 
Falling back to CPU crc32c\n", wq->wq_path);
+		if (cb)
+			cb(args);
+		return crc32c_hw(src, n);
+	}
+	int result;
+
+	thr_bytes_completed = 0;
+
+	thr_desc.opcode = DSA_OPCODE_CRCGEN;
+	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF;
+	if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY))
+		thr_desc.flags |= IDXD_OP_FLAG_CC;
+	thr_desc.completion_addr = (uint64_t)&thr_comp;
+
+	thr_desc.src_addr = (uint64_t)src;
+	thr_desc.dst_addr = 0;
+	thr_desc.xfer_size = (uint32_t)n;
+	thr_desc.crc_seed = 0;
+	thr_desc.rsvd = 0;
+	thr_comp.status = 0;
+
+	result = dsa_submit(wq, &thr_desc);
+	if (result == SUCCESS) {
+		if (cb)
+			cb(args);
+		result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg);
+	}
+
+	if (result != SUCCESS || thr_bytes_completed < n)
+		return 0;
+
+	return (uint32_t)thr_comp.crc_val;
+}
+
+static uint32_t __dto_memcpy_crc(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void *args)
+{
+	int use_orig = USE_ORIG_FUNC(n, dto_dsa_memcpy);
+
+	if (use_orig) {
+		orig_memcpy(dest, src, n);
+		if (cb)
+			cb(args);
+		return crc32c_hw(src, n);
+	}
+
+	struct dto_wq *wq = get_wq(dest, cfg);
+	int result;
+
+	thr_bytes_completed = 0;
+
+	thr_desc.opcode = DSA_OPCODE_COPY_CRC;
+	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF;
+	if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY))
+		thr_desc.flags |= IDXD_OP_FLAG_CC;
+	thr_desc.completion_addr = (uint64_t)&thr_comp;
+
+	thr_desc.src_addr = (uint64_t)src;
+	thr_desc.dst_addr = (uint64_t)dest;
+	thr_desc.xfer_size = (uint32_t)n;
+	thr_desc.crc_seed = 0;
+	thr_desc.rsvd = 0;
+	thr_comp.status = 0;
+
+	result = dsa_submit(wq, &thr_desc);
+	if (result == SUCCESS) {
+		if (cb)
+			cb(args);
+		result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg);
+	}
+
+	if (result != SUCCESS || thr_bytes_completed < n)
+		return 0;
+
+	return (uint32_t)thr_comp.crc_val;
+}
+
+static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy,
+		int *result, callback_t cb, void* args,
+		const struct dto_call_cfg *cfg)
 {
 	struct dto_wq *wq;
 	size_t cpu_size, dsa_size;
@@ -1580,17 +1677,17 @@
 	// If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
 	// memmove will perform the copy and correctly attribute statistics to stdlib call group
-	if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) {
+	if (is_overlapping && cfg->overlapping_action == OVERLAPPING_CPU) {
 		*result = SUCCESS;
 		return true;
 	}
 
 	dsa_size = n - cpu_size;
-	wq = get_wq(dest);
+	wq = get_wq(dest, cfg);
 
 	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
 	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
-	if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
+	if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY))
 		thr_desc.flags |= IDXD_OP_FLAG_CC;
 	thr_desc.completion_addr = (uint64_t)&thr_comp;
@@ -1600,7 +1697,7 @@
 		thr_desc.xfer_size = (uint32_t) dsa_size;
 		thr_comp.status = 0;
 		if (is_overlapping) {
-			*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
+			*result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg);
 		} else {
 			*result = dsa_submit(wq, &thr_desc);
 			if (*result == SUCCESS) {
@@ -1611,18 +1708,20 @@
 				orig_memmove(dest, src, cpu_size);
thr_bytes_completed += cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + if (cb) { cb(args); }; + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm if (is_overlapping) { threshold = wq->max_transfer_size; } else { threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); } + bool first_iteration = true; do { size_t len; @@ -1638,7 +1737,7 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy thr_desc.xfer_size = (uint32_t) dsa_size; thr_comp.status = 0; if (is_overlapping){ - *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + *result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg); } else { *result = dsa_submit(wq, &thr_desc); if (*result == SUCCESS) { @@ -1652,9 +1751,11 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy orig_memmove(dest1, src1, cpu_size); thr_bytes_completed += cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + if (cb && first_iteration) { cb(args); }; + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } } + first_iteration = false; if (*result != SUCCESS) break; @@ -1669,9 +1770,216 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy return is_overlapping; } -static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result) +void init_dto_cfg(struct dto_call_cfg *call_cfg, int flags) +{ + *call_cfg = dto_default_cfg; + + if (flags & DTO_AUTO_SPLIT) { + call_cfg->auto_adjust = 1; + call_cfg->wait_method = WAIT_BUSYPOLL; + call_cfg->numa_mode = NA_BUFFER_CENTRIC; + call_cfg->cache_control = 1; + call_cfg->overlapping_action = OVERLAPPING_CPU; + return; + } + + if (flags & DTO_DSA_COMPLETE_OFFLOAD) { + call_cfg->auto_adjust = 0; + call_cfg->wait_method = WAIT_UMWAIT; + call_cfg->numa_mode = NA_BUFFER_CENTRIC; + call_cfg->cache_control = 0; + call_cfg->overlapping_action = OVERLAPPING_DSA; + return; + } + + if (flags & DTO_API_AUTO_ADJUST_KNOBS) + call_cfg->auto_adjust = 1; + else if (flags & DTO_API_NO_AUTO_ADJUST_KNOBS) + call_cfg->auto_adjust = 0; + + if (flags & DTO_API_WAIT_BUSYPOLL) + call_cfg->wait_method = WAIT_BUSYPOLL; + else if (flags & DTO_API_WAIT_UMWAIT) + call_cfg->wait_method = WAIT_UMWAIT; + else if (flags & DTO_API_WAIT_TPAUSE) + call_cfg->wait_method = WAIT_TPAUSE; + else if (flags & DTO_API_WAIT_YIELD) + call_cfg->wait_method = WAIT_YIELD; + + if (flags & DTO_API_CACHE_CONTROL) + call_cfg->cache_control = 1; + else if (flags & DTO_API_NO_CACHE_CONTROL) + call_cfg->cache_control = 0; + + if (flags & DTO_API_NUMA_AWARE_BUFFER_CENTRIC) + call_cfg->numa_mode = NA_BUFFER_CENTRIC; + else if (flags & DTO_API_NUMA_AWARE_CPU_CENTRIC) + call_cfg->numa_mode = NA_CPU_CENTRIC; + else if (flags & DTO_API_NUMA_AWARE_DISABLED) + call_cfg->numa_mode = NA_NONE; + + if (flags & DTO_API_OVERLAPPING_MEMMOVE_ACTION_DSA) + call_cfg->overlapping_action = OVERLAPPING_DSA; + else if (flags & DTO_API_OVERLAPPING_MEMMOVE_ACTION_CPU) + call_cfg->overlapping_action = OVERLAPPING_CPU; +} + +void __dto_memcpy(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args) { + int result = 0; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + 
DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+	dto_memcpymove(dest, src, n, true, &result, cb, args, cfg);
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result, cfg);
+#endif
+	if (thr_bytes_completed != n) {
+		/* fallback to std call if job is only partially completed */
+		n -= thr_bytes_completed;
+		if (thr_comp.result == 0) {
+			dest = (void *)((uint64_t)dest + thr_bytes_completed);
+			src = (const void *)((uint64_t)src + thr_bytes_completed);
+		}
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+
+		orig_memcpy(dest, src, n);
+
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n, cfg);
+#endif
+	}
+}
+
+__attribute__((visibility("default"))) void dto_memcpy_default(void *dest, const void *src, size_t n) {
+	__dto_memcpy(dest, src, n, &dto_default_cfg, NULL, NULL);
+}
+
+__attribute__((visibility("default"))) void dto_memcpy(void *dest, const void *src, size_t n,
+		int flags, callback_t cb, void* args) {
+	struct dto_call_cfg cfg;
+	init_dto_cfg(&cfg, flags);
+	__dto_memcpy(dest, src, n, &cfg, cb, args);
+}
+
+__attribute__((visibility("default"))) void dto_memcpy_cfg(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args) {
+	__dto_memcpy(dest, src, n, cfg, cb, args);
+}
+
+__attribute__((visibility("default"))) uint32_t dto_memcpy_crc_default(void *dest, const void *src, size_t n) {
+	return __dto_memcpy_crc(dest, src, n, &dto_default_cfg, NULL, NULL);
+}
+
+__attribute__((visibility("default"))) uint32_t dto_memcpy_crc(void *dest, const void *src, size_t n,
+		int flags, callback_t cb, void* args) {
+	struct dto_call_cfg cfg;
+	init_dto_cfg(&cfg, flags);
+	return __dto_memcpy_crc(dest, src, n, &cfg, cb, args);
+}
+
+__attribute__((visibility("default"))) uint32_t dto_memcpy_crc_cfg(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args) {
+	return __dto_memcpy_crc(dest, src, n, cfg, cb, args);
+}
+
+void __dto_memmove(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args) {
+	int result = 0;
+	bool is_overlapping = false;
+#ifdef DTO_STATS_SUPPORT
+	struct timespec st, et;
+	size_t orig_n = n;
+	DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+	is_overlapping = dto_memcpymove(dest, src, n, false, &result, cb, args, cfg);
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result, cfg);
+#endif
+	if (thr_bytes_completed != n) {
+		/* fallback to std call if job is only partially completed */
+		n -= thr_bytes_completed;
+		if (thr_comp.result == 0) {
+			dest = (void *)((uint64_t)dest + thr_bytes_completed);
+			src = (const void *)((uint64_t)src + thr_bytes_completed);
+		}
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+
+		orig_memmove(dest, src, n);
+
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMMOVE, n, orig_n, cfg);
+#endif
+	}
+}
+
+__attribute__((visibility("default"))) void dto_memmove_default(void *dest, const void *src, size_t n) {
+	__dto_memmove(dest, src, n, &dto_default_cfg, NULL, NULL);
+}
+
+__attribute__((visibility("default"))) void dto_memmove(void *dest, const void *src, size_t n,
+		int flags, callback_t cb, void* args) {
+	struct dto_call_cfg cfg;
+	init_dto_cfg(&cfg, flags);
+	__dto_memmove(dest, src, n, &cfg, cb, args);
+}
+
+__attribute__((visibility("default"))) void dto_memmove_cfg(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args) {
+	__dto_memmove(dest, src, n, cfg, cb, args);
+}
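+
+/*
+ * Note on callbacks for the memcpy/memmove paths above: the optional
+ * callback runs once the first DSA descriptor has been successfully
+ * submitted and before the library waits for its completion, so callers
+ * can overlap independent work with the transfer. It is not invoked when
+ * submission fails, nor on the overlapping-memmove paths that go through
+ * dsa_execute().
+ */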
+void __dto_memset(void *s, int c, size_t n, struct dto_call_cfg *cfg)
+{
+	int result = 0;
+#ifdef DTO_STATS_SUPPORT
+	struct timespec st, et;
+	size_t orig_n = n;
+	DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+	dto_memset_dsa(s, c, n, &result, cfg);
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result, cfg);
+#endif
+	if (thr_bytes_completed != n) {
+		/* fallback to std call if job is only partially completed */
+		n -= thr_bytes_completed;
+		s = (void *)((uint64_t)s + thr_bytes_completed);
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+		orig_memset(s, c, n);
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMSET, n, orig_n, cfg);
+#endif
+	}
+}
+
+__attribute__((visibility("default"))) void dto_memset_default(void *s, int c, size_t n)
+{
+	__dto_memset(s, c, n, &dto_default_cfg);
+}
+
+__attribute__((visibility("default"))) void dto_memset(void *s, int c, size_t n, int flags)
 {
-	struct dto_wq *wq = get_wq((void*)s2);
+	struct dto_call_cfg cfg;
+	init_dto_cfg(&cfg, flags);
+	__dto_memset(s, c, n, &cfg);
+}
+
+__attribute__((visibility("default"))) void dto_memset_cfg(void *s, int c, size_t n,
+		struct dto_call_cfg *cfg)
+{
+	__dto_memset(s, c, n, cfg);
+}
+
+static int dto_memcmp_dsa(const void *s1, const void *s2, size_t n, int *result, const struct dto_call_cfg *cfg)
+{
+	struct dto_wq *wq = get_wq((void*)s2, cfg);
 	int cmp_result = 0;
 	size_t orig_n = n;
@@ -1686,7 +1994,7 @@
 		thr_desc.src_addr = (uint64_t) s1;
 		thr_desc.src2_addr = (uint64_t) s2;
 		thr_desc.xfer_size = (uint32_t) n;
-		*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
+		*result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg);
 	} else {
 		do {
 			size_t len;
@@ -1696,7 +2004,7 @@
 			thr_desc.src_addr = (uint64_t) s1 + thr_bytes_completed;
 			thr_desc.src2_addr = (uint64_t) s2 + thr_bytes_completed;
 			thr_desc.xfer_size = (uint32_t) len;
-			*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
+			*result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg);
 
 			if (*result != SUCCESS || thr_comp.result)
 				break;
@@ -1719,8 +2027,73 @@
 	 * we didn't process all the bytes */
 	thr_bytes_completed = orig_n;
-	}
-	return cmp_result;
+	}
+	return cmp_result;
+}
+
+int __dto_memcmp(const void *s1, const void *s2, size_t n, struct dto_call_cfg *cfg)
+{
+	int result = 0;
+	int ret;
+#ifdef DTO_STATS_SUPPORT
+	struct timespec st, et;
+	size_t orig_n = n;
+	DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+	ret = dto_memcmp_dsa(s1, s2, n, &result, cfg);
+#ifdef DTO_STATS_SUPPORT
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result, cfg);
+#endif
+	if (thr_bytes_completed != n) {
+		/* fallback to std call if job is only partially completed */
+		n -= thr_bytes_completed;
+		s1 = (const void *)((uint64_t)s1 + thr_bytes_completed);
+		s2 = (const void *)((uint64_t)s2 + thr_bytes_completed);
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_START(collect_stats, st);
+#endif
+		ret = orig_memcmp(s1, s2, n);
+#ifdef DTO_STATS_SUPPORT
+		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCMP, n, orig_n, cfg);
+#endif
+	}
+	return ret;
+}
+
+__attribute__((visibility("default"))) int dto_memcmp_default(const void *s1, const void *s2, size_t n)
+{
+	return __dto_memcmp(s1, s2, 
n, &dto_default_cfg); +} + +__attribute__((visibility("default"))) int dto_memcmp(const void *s1, const void *s2, size_t n, int flags) +{ + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + return __dto_memcmp(s1, s2, n, &cfg); +} + +__attribute__((visibility("default"))) int dto_memcmp_cfg(const void *s1, const void *s2, size_t n, + struct dto_call_cfg *cfg) +{ + return __dto_memcmp(s1, s2, n, cfg); +} + +__attribute__((visibility("default"))) uint32_t dto_crc_default(const void *src, size_t n) +{ + return __dto_crc(src, n, &dto_default_cfg, NULL, NULL); +} + +__attribute__((visibility("default"))) uint32_t dto_crc(const void *src, size_t n, + int flags, callback_t cb, void *args) +{ + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + return __dto_crc(src, n, &cfg, cb, args); +} + +__attribute__((visibility("default"))) uint32_t dto_crc_cfg(const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void *args) +{ + return __dto_crc(src, n, cfg, cb, args); } /* The dto_internal_mem* APIs are used only when mem* APIs are @@ -1793,10 +2166,10 @@ void *memset(void *s1, int c, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - dto_memset(s1, c, n, &result); + dto_memset_dsa(s1, c, n, &result, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1814,7 +2187,7 @@ void *memset(void *s1, int c, size_t n) orig_memset(s1, c, n); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMSET, n, orig_n); + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMSET, n, orig_n, &dto_default_cfg); #endif } return ret; @@ -1843,10 +2216,10 @@ void *memcpy(void *dest, const void *src, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - dto_memcpymove(dest, src, n, 1, &result); + dto_memcpymove(dest, src, n, true, &result, NULL, NULL, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1867,7 +2240,7 @@ void *memcpy(void *dest, const void *src, size_t n) orig_memcpy(dest, src, n); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n); + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n, &dto_default_cfg); #endif } return ret; @@ -1897,10 +2270,10 @@ void *memmove(void *dest, const void *src, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - is_overlapping = dto_memcpymove(dest, src, n, 0, &result); + is_overlapping = dto_memcpymove(dest, src, n, false, &result, NULL, NULL, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1921,7 +2294,7 @@ void 
*memmove(void *dest, const void *src, size_t n)
 		orig_memmove(dest, src, n);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMMOVE, n, orig_n);
+		DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMMOVE, n, orig_n, &dto_default_cfg);
 #endif
 	}
 	return ret;
@@ -1950,10 +2323,10 @@ int memcmp(const void *s1, const void *s2, size_t n)
 #ifdef DTO_STATS_SUPPORT
 	DTO_COLLECT_STATS_START(collect_stats, st);
 #endif
-	ret = dto_memcmp(s1, s2, n, &result);
+	ret = dto_memcmp_dsa(s1, s2, n, &result, &dto_default_cfg);
 
 #ifdef DTO_STATS_SUPPORT
-	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result);
+	DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result, &dto_default_cfg);
 #endif
 	if (thr_bytes_completed != n) {
 		/* fallback to std call if job is only partially completed */
@@ -1972,7 +2345,7 @@
 	ret = orig_memcmp(s1, s2, n);
 
 #ifdef DTO_STATS_SUPPORT
-	DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCMP, n, orig_n);
+	DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCMP, n, orig_n, &dto_default_cfg);
 #endif
 	}
 	return ret;
diff --git a/dto.h b/dto.h
new file mode 100644
index 0000000..84745b3
--- /dev/null
+++ b/dto.h
@@ -0,0 +1,262 @@
+
+#ifndef DTO_H
+#define DTO_H
+
+#define DTO_API_AUTO_ADJUST_KNOBS 1
+#define DTO_API_NO_AUTO_ADJUST_KNOBS 2
+#define DTO_API_WAIT_BUSYPOLL 4
+#define DTO_API_WAIT_UMWAIT 8
+#define DTO_API_WAIT_TPAUSE 16
+#define DTO_API_WAIT_YIELD 32
+#define DTO_API_CACHE_CONTROL 64
+#define DTO_API_NO_CACHE_CONTROL 128
+#define DTO_API_NUMA_AWARE_BUFFER_CENTRIC 256
+#define DTO_API_NUMA_AWARE_CPU_CENTRIC 512
+#define DTO_API_NUMA_AWARE_DISABLED 1024
+#define DTO_API_OVERLAPPING_MEMMOVE_ACTION_DSA 2048
+#define DTO_API_OVERLAPPING_MEMMOVE_ACTION_CPU 4096
+
+#define DTO_DSA_COMPLETE_OFFLOAD 8192
+#define DTO_AUTO_SPLIT 16384
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void(*callback_t)(void*);
+
+enum wait_options {
+	WAIT_BUSYPOLL = 0,
+	WAIT_UMWAIT,
+	WAIT_YIELD,
+	WAIT_TPAUSE
+};
+
+enum numa_aware {
+	NA_NONE = 0,
+	NA_BUFFER_CENTRIC,
+	NA_CPU_CENTRIC,
+	NA_LAST_ENTRY
+};
+
+enum overlapping_memmove_actions {
+	OVERLAPPING_CPU = 0,
+	OVERLAPPING_DSA,
+	OVERLAPPING_LAST_ENTRY
+};
+
+struct dto_call_cfg {
+	unsigned char auto_adjust;
+	unsigned char cache_control;
+	enum wait_options wait_method;
+	enum numa_aware numa_mode;
+	enum overlapping_memmove_actions overlapping_action;
+};
+
+/**
+ * dto_memcpy_default - Copy memory using DTO's default configuration.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to copy.
+ */
+void dto_memcpy_default(void *dest, const void *src, size_t n);
+
+/**
+ * dto_memcpy_cfg - Copy memory using a caller provided configuration.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to copy.
+ * @cfg: DTO configuration to use for this call.
+ * @cb: Optional callback invoked after submission.
+ * @args: Argument passed to the callback.
+ */
+void dto_memcpy_cfg(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args);
+
+/**
+ * dto_memcpy - Copy memory using a configuration derived from @flags.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to copy.
+ * @flags: Bitwise OR of DTO_API_* values that override defaults.
+ * @cb: Optional callback invoked after submission.
+ * @args: Argument passed to the callback.
+ */
+void dto_memcpy(void *dest, const void *src, size_t n,
+		int flags, callback_t cb, void* args);
+
+
+/**
+ * dto_memcpy_crc_default - Copy memory and compute CRC using DTO defaults.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to copy.
+ *
+ * Return: CRC32C value of the copied data.
+ */
+uint32_t dto_memcpy_crc_default(void *dest, const void *src, size_t n);
+
+/**
+ * dto_memcpy_crc_cfg - Copy memory and compute CRC using caller configuration.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to copy.
+ * @cfg: DTO configuration to use for this call.
+ * @cb: Optional callback invoked after submission.
+ * @args: Argument passed to the callback.
+ *
+ * Return: CRC32C value of the copied data.
+ */
+uint32_t dto_memcpy_crc_cfg(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args);
+
+/**
+ * dto_memcpy_crc - Copy memory and compute CRC using @flags derived
+ * configuration.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to copy.
+ * @flags: Bitwise OR of DTO_API_* values that override defaults.
+ * @cb: Optional callback invoked after submission.
+ * @args: Argument passed to the callback.
+ *
+ * Return: CRC32C value of the copied data.
+ */
+uint32_t dto_memcpy_crc(void *dest, const void *src, size_t n,
+		int flags, callback_t cb, void* args);
+
+/**
+ * dto_memmove_default - Move memory using DTO's default configuration.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to move.
+ */
+void dto_memmove_default(void *dest, const void *src, size_t n);
+
+/**
+ * dto_memmove_cfg - Move memory using a caller provided configuration.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to move.
+ * @cfg: DTO configuration to use for this call.
+ * @cb: Optional callback invoked after submission.
+ * @args: Argument passed to the callback.
+ */
+void dto_memmove_cfg(void *dest, const void *src, size_t n,
+		struct dto_call_cfg *cfg, callback_t cb, void* args);
+
+/**
+ * dto_memmove - Move memory using a configuration derived from @flags.
+ * @dest: Destination buffer.
+ * @src: Source buffer.
+ * @n: Number of bytes to move.
+ * @flags: Bitwise OR of DTO_API_* values that override defaults.
+ * @cb: Optional callback invoked after submission.
+ * @args: Argument passed to the callback.
+ */
+void dto_memmove(void *dest, const void *src, size_t n,
+		int flags, callback_t cb, void* args);
+
+/**
+ * dto_memset_default - Set memory using DTO's default configuration.
+ * @s: Destination buffer to fill.
+ * @c: Byte value to set.
+ * @n: Number of bytes to set.
+ */
+void dto_memset_default(void *s, int c, size_t n);
+
+/**
+ * dto_memset_cfg - Set memory using a caller provided configuration.
+ * @s: Destination buffer to fill.
+ * @c: Byte value to set.
+ * @n: Number of bytes to set.
+ * @cfg: DTO configuration to use for this call.
+ */
+void dto_memset_cfg(void *s, int c, size_t n, struct dto_call_cfg *cfg);
+
+/**
+ * dto_memset - Set memory using a configuration derived from @flags.
+ * @s: Destination buffer to fill.
+ * @c: Byte value to set.
+ * @n: Number of bytes to set.
+ * @flags: Bitwise OR of DTO_API_* values that override defaults.
+ */
+void dto_memset(void *s, int c, size_t n, int flags);
+
+/**
+ * dto_memcmp_default - Compare memory using DTO's default configuration.
+ * @s1: First buffer.
+ * @s2: Second buffer.
+ * @n: Number of bytes to compare.
+ * + * Return: < 0, 0 or > 0 if s1 is found to be less than, equal to or greater + * than s2 respectively. + */ +int dto_memcmp_default(const void *s1, const void *s2, size_t n); + +/** + * dto_memcmp_cfg - Compare memory using a caller provided configuration. + * @s1: First buffer. + * @s2: Second buffer. + * @n: Number of bytes to compare. + * @cfg: DTO configuration to use for this call. + * + * Return: Comparison result as in memcmp(). + */ +int dto_memcmp_cfg(const void *s1, const void *s2, size_t n, + struct dto_call_cfg *cfg); + +/** + * dto_memcmp - Compare memory using a configuration derived from @flags. + * @s1: First buffer. + * @s2: Second buffer. + * @n: Number of bytes to compare. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * + * Return: Comparison result as in memcmp(). + */ +int dto_memcmp(const void *s1, const void *s2, size_t n, int flags); + +/** + * dto_crc_default - Compute CRC using DTO's default configuration. + * @src: Buffer to checksum. + * @n: Number of bytes to process. + * + * Return: CRC32C value of the buffer. + */ +uint32_t dto_crc_default(const void *src, size_t n); + +/** + * dto_crc_cfg - Compute CRC using a caller provided configuration. + * @src: Buffer to checksum. + * @n: Number of bytes to process. + * @cfg: DTO configuration to use for this call. + * @cb: Optional callback invoked after submission. + * @args: Argument passed to the callback. + * + * Return: CRC32C value of the buffer. + */ +uint32_t dto_crc_cfg(const void *src, size_t n, struct dto_call_cfg *cfg, + callback_t cb, void* args); + +/** + * dto_crc - Compute CRC using a configuration derived from @flags. + * @src: Buffer to checksum. + * @n: Number of bytes to process. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * @cb: Optional callback invoked after submission. + * @args: Argument passed to the callback. + * + * Return: CRC32C value of the buffer. + */ +uint32_t dto_crc(const void *src, size_t n, int flags, callback_t cb, + void* args); + +#ifdef __cplusplus +} +#endif + +#endif +
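
Usage sketch (illustrative, not part of the patch): the callback parameter lets a caller overlap independent work with an offloaded copy. The buffer sizes and the `on_submitted` helper below are hypothetical; the entry points, flags, and `callback_t` signature come from dto.h above.

```c
#include <stdio.h>
#include <stdint.h>
#include "dto.h"

/* Invoked by DTO after the copy descriptor is submitted to DSA and
 * before the library waits for its completion. */
static void on_submitted(void *arg)
{
	int *flag = arg;
	*flag = 1;	/* a real caller would do useful independent work here */
}

int main(void)
{
	static char src[1 << 20], dst[1 << 20];
	int submitted = 0;

	/* Flags-based call: busy-poll for completion, overlap via callback. */
	dto_memcpy(dst, src, sizeof(src), DTO_API_WAIT_BUSYPOLL,
		   on_submitted, &submitted);

	printf("callback ran: %d\n", submitted);
	return 0;
}
```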