From a622c4742886257617d920e36349b4cd4f6b2761 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 12 Sep 2025 06:26:06 -0700 Subject: [PATCH 1/3] DTO API ------- DTO provides drop-in replacements for standard memory operations. Each function is available in three forms: - *_default uses library defaults. - *_cfg accepts a `struct dto_call_cfg` to override per-call behavior. - The unsuffixed name derives configuration from a bitwise OR of `DTO_API_*` flags. Sample usage ------------- include int main(void) { char src[64] = "example"; char dst[64]; /* Use defaults */ dto_memcpy_default(dst, src, sizeof(src)); /* Per-call configuration */ struct dto_call_cfg cfg = { .auto_adjust = 0, .cache_control = 1, .wait_method = WAIT_BUSYPOLL, .numa_mode = NA_BUFFER_CENTRIC, .overlapping_action = OVERLAPPING_CPU, }; int diff = dto_memcmp_cfg(dst, src, sizeof(src), &cfg); /* Flags-based configuration */ dto_memset(dst, 0, sizeof(dst), DTO_API_WAIT_YIELD); return diff; } --- Makefile | 1 + README.md | 54 +++++++ dto.c | 451 ++++++++++++++++++++++++++++++++++++++++-------------- dto.h | 185 ++++++++++++++++++++++ 4 files changed, 579 insertions(+), 112 deletions(-) create mode 100644 dto.h diff --git a/Makefile b/Makefile index 95af413..6f75c14 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,7 @@ install: cp libdto.so.1.0 /usr/lib64/ ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so.1 ln -sf /usr/lib64/libdto.so.1.0 /usr/lib64/libdto.so + cp dto.h /usr/include/ install-local: ln -sf ./libdto.so.1.0 ./libdto.so.1 diff --git a/README.md b/README.md index 1a746e2..e923361 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,60 @@ can be enabled or disabled using an environment variable DTO_AUTO_ADJUST_KNOBS. DTO can also be used to learn certain application characterics by building histogram of various API types and sizes. The histogram can be built using an environment variable DTO_COLLECT_STATS. 
+Finally, DTO offers an API to allow applications to pass a function pointer to be called while waiting for DSA to complete the operation. This can be used to perform other useful work during the wait. The function signature is:
+
+```c
+dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args);
+```
+where callback_t cb is a function pointer in the calling application. If the callback terminates before DSA completes the operation, the specified wait method is used to complete the waiting.
+
+## DTO API
+
+DTO provides drop-in replacements for standard memory operations. Each function
+is available in three forms:
+
+- `*_default` uses library defaults.
+- `*_cfg` accepts a `struct dto_call_cfg` to override per-call behavior.
+- The unsuffixed name derives configuration from a bitwise OR of `DTO_API_*`
+  flags.
+
+Available entry points include:
+
+- `dto_memcpy_default`, `dto_memcpy_cfg`, `dto_memcpy`
+- `dto_memmove_default`, `dto_memmove_cfg`, `dto_memmove`
+- `dto_memset_default`, `dto_memset_cfg`, `dto_memset`
+- `dto_memcmp_default`, `dto_memcmp_cfg`, `dto_memcmp`
+
+### Sample usage
+
+```c
+#include "dto.h"
+
+int main(void)
+{
+	char src[64] = "example";
+	char dst[64];
+
+	/* Use defaults */
+	dto_memcpy_default(dst, src, sizeof(src));
+
+	/* Per-call configuration */
+	struct dto_call_cfg cfg = {
+		.auto_adjust = 0,
+		.cache_control = 1,
+		.wait_method = WAIT_BUSYPOLL,
+		.numa_mode = NA_BUFFER_CENTRIC,
+		.overlapping_action = OVERLAPPING_CPU,
+	};
+	int diff = dto_memcmp_cfg(dst, src, sizeof(src), &cfg);
+
+	/* Flags-based configuration */
+	dto_memset(dst, 0, sizeof(dst), DTO_API_WAIT_YIELD);
+
+	return diff;
+}
+```
+
 ```bash
 dto.c: DSA Transparent Offload shared library
 dto-test.c: Sample multi-threaded test application
diff --git a/dto.c b/dto.c
index b023fad..6a65c2e 100644
--- a/dto.c
+++ b/dto.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include "dto.h"
 
 #define likely(x) 
__builtin_expect((x), 1) #define unlikely(x) __builtin_expect((x), 0) @@ -47,10 +48,10 @@ #define MAX_WQS 32 #define MAX_NUMA_NODES 32 #define DTO_DEFAULT_MIN_SIZE 65536 +#define DTO_DEFAULT_CPU_FRACTION 0 #define DTO_INITIALIZED 0 #define DTO_INITIALIZING 1 - #define NSEC_PER_SEC (1000000000) #define MSEC_PER_SEC (1000) #define NSEC_PER_MSEC (NSEC_PER_SEC/MSEC_PER_SEC) @@ -83,26 +84,6 @@ struct dto_device { atomic_uchar next_wq; }; -enum wait_options { - WAIT_BUSYPOLL = 0, - WAIT_UMWAIT, - WAIT_YIELD, - WAIT_TPAUSE -}; - -enum numa_aware { - NA_NONE = 0, - NA_BUFFER_CENTRIC, - NA_CPU_CENTRIC, - NA_LAST_ENTRY -}; - -enum overlapping_memmove_actions { - OVERLAPPING_CPU = 0, - OVERLAPPING_DSA, - OVERLAPPING_LAST_ENTRY -}; - static const char * const numa_aware_names[] = { [NA_NONE] = "none", [NA_BUFFER_CENTRIC] = "buffer-centric", @@ -117,17 +98,22 @@ static atomic_uchar next_wq; static atomic_uchar dto_initialized; static atomic_uchar dto_initializing; static uint8_t use_std_lib_calls; -static enum numa_aware is_numa_aware; -static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE; -static int wait_method = WAIT_BUSYPOLL; -static size_t cpu_size_fraction; // range of values is 0 to 99 +static struct dto_call_cfg dto_default_cfg = { + .wait_method = WAIT_BUSYPOLL, + .auto_adjust = 1, + .cache_control = 1, + .numa_mode = NA_NONE, + .overlapping_action = OVERLAPPING_CPU, +}; +static uint64_t wait_time = 100000; //10K nanoseconds + +static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE; //minimum size to offload to DSA +static size_t cpu_size_fraction = DTO_DEFAULT_CPU_FRACTION; //percentage of work done by CPU static uint8_t dto_dsa_memcpy = 1; static uint8_t dto_dsa_memmove = 1; static uint8_t dto_dsa_memset = 1; static uint8_t dto_dsa_memcmp = 1; - -static uint8_t dto_dsa_cc = 1; static bool dto_use_c02 = true; //C02 state is default - //C02 avg exit latency is ~500 ns //and C01 is about ~240 ns on SPR @@ -143,8 +129,6 @@ static uint64_t tpause_wait_time = TPAUSE_C02_DELAY_NS; 
static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT; -static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU; - static uint8_t fork_handler_registered; enum memop { @@ -217,7 +201,7 @@ static struct timespec dto_start_time; } while (0) \ -#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r) \ +#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r, cfg) \ do { \ if (unlikely(cs)) { \ uint64_t t; \ @@ -225,20 +209,20 @@ static struct timespec dto_start_time; t = (((et.tv_sec*1000000000) + et.tv_nsec) - \ ((st.tv_sec*1000000000) + st.tv_nsec)); \ if (unlikely(r != SUCCESS)) \ - update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); \ + update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r, cfg); \ else \ - update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); \ + update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0, cfg); \ } \ } while (0) \ -#define DTO_COLLECT_STATS_CPU_END(cs, st, et, op, n, orig_n) \ +#define DTO_COLLECT_STATS_CPU_END(cs, st, et, op, n, orig_n, cfg) \ do { \ if (unlikely(cs)) { \ uint64_t t; \ clock_gettime(CLOCK_BOOTTIME, &et); \ t = (((et.tv_sec*1000000000) + et.tv_nsec) - \ ((st.tv_sec*1000000000) + st.tv_nsec)); \ - update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \ + update_stats(op, orig_n, false, n, t, STDC_CALL, 0, cfg); \ } \ } while (0) \ @@ -288,7 +272,6 @@ static atomic_ullong adjust_num_waits; /* default waits are for yield because yield is default waiting method */ static double min_avg_waits = MIN_AVG_YIELD_WAITS; static double max_avg_waits = MAX_AVG_YIELD_WAITS; -static uint8_t auto_adjust_knobs = 1; extern char *__progname; @@ -389,9 +372,9 @@ static __always_inline void dsa_wait_umwait(const volatile uint8_t *comp) } } -static __always_inline void __dsa_wait(const volatile uint8_t *comp) +static __always_inline void __dsa_wait(const volatile uint8_t *comp, const struct dto_call_cfg *cfg) { - switch(wait_method) { + switch(cfg->wait_method) { case WAIT_YIELD: 
sched_yield(); break; @@ -406,9 +389,9 @@ static __always_inline void __dsa_wait(const volatile uint8_t *comp) } } -static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) +static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp, const struct dto_call_cfg *cfg) { - switch (wait_method) { + switch (cfg->wait_method) { case WAIT_YIELD: dsa_wait_yield(comp); break; @@ -444,13 +427,13 @@ static __always_inline void dsa_wait_no_adjust(const volatile uint8_t *comp) * - If cpu_size_fraction not too low, decrease it by CSF_STEP_DECREMENT * - else if dsa_min_size not too low, decrease it by DMS_STEP_DECREMENT */ -static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) +static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp, const struct dto_call_cfg *cfg) { uint64_t local_num_waits = 0; if ((++num_descs & DESCS_PER_RUN) != DESCS_PER_RUN) { while (*comp == 0) { - __dsa_wait(comp); + __dsa_wait(comp, cfg); } return; @@ -458,7 +441,7 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) /* Run the heuristics as well as wait for DSA */ while (*comp == 0) { - __dsa_wait(comp); + __dsa_wait(comp, cfg); local_num_waits++; } @@ -494,12 +477,13 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp) } static __always_inline int dsa_wait(struct dto_wq *wq, - struct dsa_hw_desc *hw, volatile uint8_t *comp) + struct dsa_hw_desc *hw, volatile uint8_t *comp, + const struct dto_call_cfg *cfg) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); + if (cfg->auto_adjust) + dsa_wait_and_adjust(comp, cfg); else - dsa_wait_no_adjust(comp); + dsa_wait_no_adjust(comp, cfg); if (likely(*comp == DSA_COMP_SUCCESS)) { thr_bytes_completed += hw->xfer_size; @@ -534,7 +518,8 @@ static __always_inline int dsa_submit(struct dto_wq *wq, } static __always_inline int dsa_execute(struct dto_wq *wq, - struct dsa_hw_desc *hw, volatile uint8_t *comp) + struct dsa_hw_desc 
*hw, volatile uint8_t *comp, + const struct dto_call_cfg *cfg) { int ret; *comp = 0; @@ -552,7 +537,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq, ret = 0; } if (!ret) { - dsa_wait_no_adjust(comp); + dsa_wait_no_adjust(comp, cfg); if (*comp == DSA_COMP_SUCCESS) { thr_bytes_completed += hw->xfer_size; @@ -569,10 +554,10 @@ static __always_inline int dsa_execute(struct dto_wq *wq, #ifdef DTO_STATS_SUPPORT static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed, - uint64_t elapsed_ns, int group, int error_code) + uint64_t elapsed_ns, int group, int error_code, const struct dto_call_cfg *cfg) { // dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call - if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) { + if (op == MEMMOVE && overlapping && cfg->overlapping_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) { return; } @@ -764,10 +749,10 @@ static void correct_devices_list() { } } -static __always_inline int get_numa_node(void* buf) { +static __always_inline int get_numa_node(void* buf, const struct dto_call_cfg *cfg) { int numa_node = -1; - switch (is_numa_aware) { + switch (cfg->numa_mode) { case NA_BUFFER_CENTRIC: { if (buf != NULL) { int status[1] = {-1}; @@ -947,7 +932,7 @@ static int dsa_init_from_wq_list(char *wq_list) close(wqs[num_wqs].wq_fd); } - if (is_numa_aware) { + if (dto_default_cfg.numa_mode != NA_NONE) { struct dto_device* dev = get_dto_device(dev_numa_node); if (dev != NULL && dev->num_wqs < MAX_WQS) { @@ -967,7 +952,7 @@ static int dsa_init_from_wq_list(char *wq_list) goto fail; } - if (is_numa_aware) { + if (dto_default_cfg.numa_mode != NA_NONE) { correct_devices_list(); } @@ -1024,7 +1009,7 @@ static int dsa_init_from_accfg(void) struct dto_device* dev = NULL; - if (is_numa_aware) { + if (dto_default_cfg.numa_mode != NA_NONE) { const int dev_numa_node = 
accfg_device_get_numa_node(device); dev = get_dto_device(dev_numa_node); } @@ -1057,7 +1042,7 @@ static int dsa_init_from_accfg(void) used_devids[num_wqs] = accfg_device_get_id(device); - if (is_numa_aware && + if (dto_default_cfg.numa_mode != NA_NONE && dev != NULL && dev->num_wqs < MAX_WQS) { dev->wqs[dev->num_wqs++] = &wqs[num_wqs]; @@ -1113,7 +1098,7 @@ static int dsa_init_from_accfg(void) } } - if (is_numa_aware) { + if (dto_default_cfg.numa_mode != NA_NONE) { correct_devices_list(); } @@ -1151,12 +1136,12 @@ static int dsa_init(void) env_str = getenv("DTO_WAIT_METHOD"); if (env_str != NULL) { if (!strncmp(env_str, wait_names[WAIT_BUSYPOLL], strlen(wait_names[WAIT_BUSYPOLL]))) { - wait_method = WAIT_BUSYPOLL; + dto_default_cfg.wait_method = WAIT_BUSYPOLL; min_avg_waits = MIN_AVG_POLL_WAITS; max_avg_waits = MAX_AVG_POLL_WAITS; } else if (!strncmp(env_str, wait_names[WAIT_UMWAIT], strlen(wait_names[WAIT_UMWAIT]))) { if (waitpkg_support) { - wait_method = WAIT_UMWAIT; + dto_default_cfg.wait_method = WAIT_UMWAIT; /* Use the same waits as busypoll for now */ min_avg_waits = MIN_AVG_POLL_WAITS; max_avg_waits = MAX_AVG_POLL_WAITS; @@ -1164,10 +1149,10 @@ static int dsa_init(void) LOG_ERROR("umwait not supported. Falling back to default wait method\n"); } else if (!strncmp(env_str, wait_names[WAIT_TPAUSE], strlen(wait_names[WAIT_TPAUSE]))) { if (waitpkg_support) { - wait_method = WAIT_TPAUSE; + dto_default_cfg.wait_method = WAIT_TPAUSE; } else { LOG_ERROR("tpause not supported. 
Falling back to busypoll\n"); - wait_method = WAIT_BUSYPOLL; + dto_default_cfg.wait_method = WAIT_BUSYPOLL; } } } @@ -1251,11 +1236,11 @@ static int init_dto(void) env_str = getenv("DTO_DSA_CC"); if (env_str != NULL) { errno = 0; - dto_dsa_cc = strtoul(env_str, NULL, 10); + dto_default_cfg.cache_control = strtoul(env_str, NULL, 10); if (errno) - dto_dsa_cc = 0; + dto_default_cfg.cache_control = 0; - dto_dsa_cc = !!dto_dsa_cc; + dto_default_cfg.cache_control = !!dto_default_cfg.cache_control; } env_str = getenv("DTO_DSA_MEMMOVE"); @@ -1291,9 +1276,9 @@ static int init_dto(void) env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION"); if (env_str != NULL) { errno = 0; - dto_overlapping_memmove_action = strtoul(env_str, NULL, 10); + dto_default_cfg.overlapping_action = strtoul(env_str, NULL, 10); if (errno) - dto_overlapping_memmove_action = OVERLAPPING_CPU; + dto_default_cfg.overlapping_action = OVERLAPPING_CPU; } #ifdef DTO_STATS_SUPPORT @@ -1365,20 +1350,20 @@ static int init_dto(void) if (env_str != NULL) { errno = 0; - auto_adjust_knobs = strtoul(env_str, NULL, 10); + dto_default_cfg.auto_adjust = strtoul(env_str, NULL, 10); if (errno) - auto_adjust_knobs = 1; + dto_default_cfg.auto_adjust = 1; - auto_adjust_knobs = !!auto_adjust_knobs; + dto_default_cfg.auto_adjust = !!dto_default_cfg.auto_adjust; } if (numa_available() != -1) { env_str = getenv("DTO_IS_NUMA_AWARE"); if (env_str != NULL) { errno = 0; - is_numa_aware = strtoul(env_str, NULL, 10); - if (errno || is_numa_aware >= NA_LAST_ENTRY) { - is_numa_aware = NA_NONE; + dto_default_cfg.numa_mode = strtoul(env_str, NULL, 10); + if (errno || dto_default_cfg.numa_mode >= NA_LAST_ENTRY) { + dto_default_cfg.numa_mode = NA_NONE; } } } @@ -1398,7 +1383,7 @@ static int init_dto(void) } // calculate the wait time for TPAUSE - if (wait_method == WAIT_TPAUSE) { + if (dto_default_cfg.wait_method == WAIT_TPAUSE) { unsigned int num, den, freq; unsigned int empty; unsigned long long tmp; @@ -1420,9 +1405,9 @@ static int 
init_dto(void) // display configuration LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, " - "cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dto_dsa_cc: %d\n", + "init cpu_size_fraction: %.2f, wait_method: %s, auto_adjust: %d, numa_awareness: %s, dto_dsa_cc: %d\n", log_level, collect_stats, use_std_lib_calls, dsa_min_size, - cpu_size_fraction_float, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dto_dsa_cc); + cpu_size_fraction_float, wait_names[dto_default_cfg.wait_method], dto_default_cfg.auto_adjust, numa_aware_names[dto_default_cfg.numa_mode], dto_default_cfg.cache_control); for (int i = 0; i < num_wqs; i++) LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i, wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap); @@ -1454,15 +1439,14 @@ static void cleanup_dto(void) cleanup_devices(); } -static __always_inline struct dto_wq *get_wq(void* buf) +static __always_inline struct dto_wq *get_wq(void* buf, const struct dto_call_cfg *cfg) { struct dto_wq* wq = NULL; - if (is_numa_aware) { + if (cfg->numa_mode != NA_NONE) { int status[1] = {-1}; - // get the numa node for the target DSA device - const int numa_node = get_numa_node(buf); + const int numa_node = get_numa_node(buf, cfg); if (numa_node >= 0 && numa_node < MAX_NUMA_NODES) { struct dto_device* dev = devices[numa_node]; if (dev != NULL && @@ -1479,18 +1463,18 @@ static __always_inline struct dto_wq *get_wq(void* buf) return wq; } -static void dto_memset(void *s, int c, size_t n, int *result) +static void dto_memset_dsa(void *s, int c, size_t n, int *result, const struct dto_call_cfg *cfg) { uint64_t memset_pattern; size_t cpu_size, dsa_size; - struct dto_wq *wq = get_wq(s); + struct dto_wq *wq = get_wq(s, cfg); for (int i = 0; i < 8; ++i) ((uint8_t *) &memset_pattern)[i] = (uint8_t) c; thr_desc.opcode = DSA_OPCODE_MEMFILL; thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; - if (dto_dsa_cc && 
(wq->dsa_gencap & GENCAP_CC_MEMORY)) + if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY)) thr_desc.flags |= IDXD_OP_FLAG_CC; thr_desc.completion_addr = (uint64_t)&thr_comp; thr_desc.pattern = memset_pattern; @@ -1510,11 +1494,11 @@ static void dto_memset(void *s, int c, size_t n, int *result) orig_memset(s, c, cpu_size); thr_bytes_completed = cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); do { @@ -1536,7 +1520,7 @@ static void dto_memset(void *s, int c, size_t n, int *result) orig_memset(s1, c, cpu_size); thr_bytes_completed += cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } if (*result != SUCCESS) @@ -1561,7 +1545,9 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n) return true; } -static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result) +static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, + int *result, callback_t cb, void* args, + const struct dto_call_cfg *cfg) { struct dto_wq *wq; size_t cpu_size, dsa_size; @@ -1580,17 +1566,17 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy // If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and // memmove will perform the copy and correctly attribute statistics to stdlib call group - if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) { + if (is_overlapping && cfg->overlapping_action == OVERLAPPING_CPU) { 
*result = SUCCESS; return true; } dsa_size = n - cpu_size; - wq = get_wq(dest); + wq = get_wq(dest, cfg); thr_desc.opcode = DSA_OPCODE_MEMMOVE; thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; - if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY)) thr_desc.flags |= IDXD_OP_FLAG_CC; thr_desc.completion_addr = (uint64_t)&thr_comp; @@ -1600,7 +1586,7 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy thr_desc.xfer_size = (uint32_t) dsa_size; thr_comp.status = 0; if (is_overlapping) { - *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + *result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg); } else { *result = dsa_submit(wq, &thr_desc); if (*result == SUCCESS) { @@ -1611,18 +1597,20 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy orig_memmove(dest, src, cpu_size); thr_bytes_completed += cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + if (cb) { cb(args); }; + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } } } else { uint32_t threshold; - size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm + size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm if (is_overlapping) { threshold = wq->max_transfer_size; } else { threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); } + bool first_iteration = true; do { size_t len; @@ -1638,7 +1626,7 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy thr_desc.xfer_size = (uint32_t) dsa_size; thr_comp.status = 0; if (is_overlapping){ - *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + *result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg); } else { *result = dsa_submit(wq, &thr_desc); if (*result == SUCCESS) { @@ -1652,9 +1640,11 @@ static bool 
dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy orig_memmove(dest1, src1, cpu_size); thr_bytes_completed += cpu_size; } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); + if (cb && first_iteration) { cb(args); }; + *result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); } } + first_iteration = false; if (*result != SUCCESS) break; @@ -1669,9 +1659,200 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy return is_overlapping; } -static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result) +void init_dto_cfg(struct dto_call_cfg *call_cfg, int flags) { - struct dto_wq *wq = get_wq((void*)s2); + *call_cfg = dto_default_cfg; + + if (flags & DTO_AUTO_SPLIT) { + call_cfg->auto_adjust = 1; + call_cfg->wait_method = WAIT_BUSYPOLL; + call_cfg->numa_mode = NA_BUFFER_CENTRIC; + call_cfg->cache_control = 1; + call_cfg->overlapping_action = OVERLAPPING_CPU; + return; + } + + if (flags & DTO_DSA_COMPLETE_OFFLOAD) { + call_cfg->auto_adjust = 0; + call_cfg->wait_method = WAIT_UMWAIT; + call_cfg->numa_mode = NA_BUFFER_CENTRIC; + call_cfg->cache_control = 0; + call_cfg->overlapping_action = OVERLAPPING_DSA; + return; + } + + if (flags & DTO_API_AUTO_ADJUST_KNOBS) + call_cfg->auto_adjust = 1; + else if (flags & DTO_API_NO_AUTO_ADJUST_KNOBS) + call_cfg->auto_adjust = 0; + + if (flags & DTO_API_WAIT_BUSYPOLL) + call_cfg->wait_method = WAIT_BUSYPOLL; + else if (flags & DTO_API_WAIT_UMWAIT) + call_cfg->wait_method = WAIT_UMWAIT; + else if (flags & DTO_API_WAIT_TPAUSE) + call_cfg->wait_method = WAIT_TPAUSE; + else if (flags & DTO_API_WAIT_YIELD) + call_cfg->wait_method = WAIT_YIELD; + + if (flags & DTO_API_CACHE_CONTROL) + call_cfg->cache_control = 1; + else if (flags & DTO_API_NO_CACHE_CONTROL) + call_cfg->cache_control = 0; + + if (flags & DTO_API_NUMA_AWARE_BUFFER_CENTRIC) + call_cfg->numa_mode = NA_BUFFER_CENTRIC; + else if (flags & DTO_API_NUMA_AWARE_CPU_CENTRIC) + call_cfg->numa_mode = NA_CPU_CENTRIC; + else 
if (flags & DTO_API_NUMA_AWARE_DISABLED) + call_cfg->numa_mode = NA_NONE; + + if (flags & DTO_API_OVERLAPPING_MEMMOVE_ACTION_DSA) + call_cfg->overlapping_action = OVERLAPPING_DSA; + else if (flags & DTO_API_OVERLAPPING_MEMMOVE_ACTION_CPU) + call_cfg->overlapping_action = OVERLAPPING_CPU; +} + +void __dto_memcpy(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args) { + int result = 0; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + dto_memcpymove(dest, src, n, true, &result, cb, args, cfg); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result, cfg); + if (thr_bytes_completed != n) { + n -= thr_bytes_completed; + if (thr_comp.result == 0) { + dest = (void *)((uint64_t)dest + thr_bytes_completed); + src = (const void *)((uint64_t)src + thr_bytes_completed); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + orig_memcpy(dest, src, n); + +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n, cfg); +#endif + } +#endif +} + +__attribute__((visibility("default"))) void dto_memcpy_default(void *dest, const void *src, size_t n) { + __dto_memcpy(dest, src, n, &dto_default_cfg, NULL, NULL); +} + +__attribute__((visibility("default"))) void dto_memcpy(void *dest, const void *src, size_t n, + int flags, callback_t cb, void* args) { + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + __dto_memcpy(dest, src, n, &cfg, cb, args); +} + +__attribute__((visibility("default"))) void dto_memcpy_cfg(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args) { + __dto_memcpy(dest, src, n, cfg, cb, args); +} + +void __dto_memmove(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args) { + int result = 0; + bool is_overlapping = false; +#ifdef 
DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + is_overlapping = dto_memcpymove(dest, src, n, false, &result, cb, args, cfg); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result, cfg); + if (thr_bytes_completed != n) { + /* fallback to std call if job is only partially completed */ + n -= thr_bytes_completed; + if (thr_comp.result == 0) { + dest = (void *)((uint64_t)dest + thr_bytes_completed); + src = (const void *)((uint64_t)src + thr_bytes_completed); + } +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + + orig_memmove(dest, src, n); + +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMMOVE, n, orig_n, cfg); +#endif + } +#endif +} + +__attribute__((visibility("default"))) void dto_memmove_default(void *dest, const void *src, size_t n) { + __dto_memmove(dest, src, n, &dto_default_cfg, NULL, NULL); +} + +__attribute__((visibility("default"))) void dto_memmove(void *dest, const void *src, size_t n, + int flags, callback_t cb, void* args) { + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + __dto_memmove(dest, src, n, &cfg, cb, args); +} + +__attribute__((visibility("default"))) void dto_memmove_cfg(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args) { + __dto_memmove(dest, src, n, cfg, cb, args); +} + +void __dto_memset(void *s, int c, size_t n, struct dto_call_cfg *cfg) +{ + int result = 0; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + dto_memset_dsa(s, c, n, &result, cfg); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result, cfg); + if (thr_bytes_completed != n) { +#endif + n -= thr_bytes_completed; + s = (void *)((uint64_t)s + thr_bytes_completed); 
+#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + orig_memset(s, c, n); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMSET, n, orig_n, cfg); + } +#endif +} + +__attribute__((visibility("default"))) void dto_memset_default(void *s, int c, size_t n) +{ + __dto_memset(s, c, n, &dto_default_cfg); +} + +__attribute__((visibility("default"))) void dto_memset(void *s, int c, size_t n, int flags) +{ + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + __dto_memset(s, c, n, &cfg); +} + +__attribute__((visibility("default"))) void dto_memset_cfg(void *s, int c, size_t n, + struct dto_call_cfg *cfg) +{ + __dto_memset(s, c, n, cfg); +} + +static int dto_memcmp_dsa(const void *s1, const void *s2, size_t n, int *result, const struct dto_call_cfg *cfg) +{ + struct dto_wq *wq = get_wq((void*)s2, cfg); int cmp_result = 0; size_t orig_n = n; @@ -1686,7 +1867,7 @@ static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result) thr_desc.src_addr = (uint64_t) s1; thr_desc.src2_addr = (uint64_t) s2; thr_desc.xfer_size = (uint32_t) n; - *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + *result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg); } else { do { size_t len; @@ -1696,7 +1877,7 @@ static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result) thr_desc.src_addr = (uint64_t) s1 + thr_bytes_completed; thr_desc.src2_addr = (uint64_t) s2 + thr_bytes_completed; thr_desc.xfer_size = (uint32_t) len; - *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + *result = dsa_execute(wq, &thr_desc, &thr_comp.status, cfg); if (*result != SUCCESS || thr_comp.result) break; @@ -1719,8 +1900,54 @@ static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result) * we didn't process all the bytes */ thr_bytes_completed = orig_n; - } - return cmp_result; + } + return cmp_result; +} + +int __dto_memcmp(const void *s1, const void *s2, size_t n, struct dto_call_cfg *cfg) +{ + int 
result = 0; + int ret; +#ifdef DTO_STATS_SUPPORT + struct timespec st, et; + size_t orig_n = n; + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + ret = dto_memcmp_dsa(s1, s2, n, &result, cfg); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result, cfg); + if (thr_bytes_completed != n) { +#endif + n -= thr_bytes_completed; + s1 = (const void *)((uint64_t)s1 + thr_bytes_completed); + s2 = (const void *)((uint64_t)s2 + thr_bytes_completed); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_START(collect_stats, st); +#endif + ret = orig_memcmp(s1, s2, n); +#ifdef DTO_STATS_SUPPORT + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCMP, n, orig_n, cfg); + } +#endif + return ret; +} + +__attribute__((visibility("default"))) int dto_memcmp_default(const void *s1, const void *s2, size_t n) +{ + return __dto_memcmp(s1, s2, n, &dto_default_cfg); +} + +__attribute__((visibility("default"))) int dto_memcmp(const void *s1, const void *s2, size_t n, int flags) +{ + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + return __dto_memcmp(s1, s2, n, &cfg); +} + +__attribute__((visibility("default"))) int dto_memcmp_cfg(const void *s1, const void *s2, size_t n, + struct dto_call_cfg *cfg) +{ + return __dto_memcmp(s1, s2, n, cfg); } /* The dto_internal_mem* APIs are used only when mem* APIs are @@ -1793,10 +2020,10 @@ void *memset(void *s1, int c, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - dto_memset(s1, c, n, &result); + dto_memset_dsa(s1, c, n, &result, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1814,7 +2041,7 @@ void *memset(void *s1, int c, 
size_t n) orig_memset(s1, c, n); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMSET, n, orig_n); + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMSET, n, orig_n, &dto_default_cfg); #endif } return ret; @@ -1843,10 +2070,10 @@ void *memcpy(void *dest, const void *src, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - dto_memcpymove(dest, src, n, 1, &result); + dto_memcpymove(dest, src, n, true, &result, NULL, NULL, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1867,7 +2094,7 @@ void *memcpy(void *dest, const void *src, size_t n) orig_memcpy(dest, src, n); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n); + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCOPY, n, orig_n, &dto_default_cfg); #endif } return ret; @@ -1897,10 +2124,10 @@ void *memmove(void *dest, const void *src, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - is_overlapping = dto_memcpymove(dest, src, n, 0, &result); + is_overlapping = dto_memcpymove(dest, src, n, false, &result, NULL, NULL, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1921,7 +2148,7 @@ void *memmove(void *dest, const void *src, size_t n) orig_memmove(dest, src, n); #ifdef DTO_STATS_SUPPORT - 
DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMMOVE, n, orig_n); + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMMOVE, n, orig_n, &dto_default_cfg); #endif } return ret; @@ -1950,10 +2177,10 @@ int memcmp(const void *s1, const void *s2, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - ret = dto_memcmp(s1, s2, n, &result); + ret = dto_memcmp_dsa(s1, s2, n, &result, &dto_default_cfg); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result, &dto_default_cfg); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1972,7 +2199,7 @@ int memcmp(const void *s1, const void *s2, size_t n) ret = orig_memcmp(s1, s2, n); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCMP, n, orig_n); + DTO_COLLECT_STATS_CPU_END(collect_stats, st, et, MEMCMP, n, orig_n, &dto_default_cfg); #endif } return ret; diff --git a/dto.h b/dto.h new file mode 100644 index 0000000..cac8518 --- /dev/null +++ b/dto.h @@ -0,0 +1,185 @@ + +#ifndef DTO_H +#define DTO_H + +#define DTO_API_AUTO_ADJUST_KNOBS 1 +#define DTO_API_NO_AUTO_ADJUST_KNOBS 2 +#define DTO_API_WAIT_BUSYPOLL 4 +#define DTO_API_WAIT_UMWAIT 8 +#define DTO_API_WAIT_TPAUSE 16 +#define DTO_API_WAIT_YIELD 32 +#define DTO_API_CACHE_CONTROL 64 +#define DTO_API_NO_CACHE_CONTROL 128 +#define DTO_API_NUMA_AWARE_BUFFER_CENTRIC 256 +#define DTO_API_NUMA_AWARE_CPU_CENTRIC 512 +#define DTO_API_NUMA_AWARE_DISABLED 1024 +#define DTO_API_OVERLAPPING_MEMMOVE_ACTION_DSA 2048 +#define DTO_API_OVERLAPPING_MEMMOVE_ACTION_CPU 4096 + +#define DTO_DSA_COMPLETE_OFFLOAD 8192 +#define DTO_AUTO_SPLIT 16384 + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(*callback_t)(void*); + +enum wait_options { + WAIT_BUSYPOLL = 0, + WAIT_UMWAIT, + WAIT_YIELD, + WAIT_TPAUSE +}; + 
+enum numa_aware { + NA_NONE = 0, + NA_BUFFER_CENTRIC, + NA_CPU_CENTRIC, + NA_LAST_ENTRY +}; + +enum overlapping_memmove_actions { + OVERLAPPING_CPU = 0, + OVERLAPPING_DSA, + OVERLAPPING_LAST_ENTRY +}; + +struct dto_call_cfg { + unsigned char auto_adjust; + unsigned char cache_control; + enum wait_options wait_method; + enum numa_aware numa_mode; + enum overlapping_memmove_actions overlapping_action; +}; + +/** + * dto_memcpy_default - Copy memory using DTO's default configuration. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to copy. + */ +void dto_memcpy_default(void *dest, const void *src, size_t n); + +/** + * dto_memcpy_cfg - Copy memory using a caller provided configuration. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to copy. + * @cfg: DTO configuration to use for this call. + * @cb: Optional callback invoked after completion. + * @args: Argument passed to the callback. + */ +void dto_memcpy_cfg(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args); + +/** + * dto_memcpy - Copy memory using a configuration derived from @flags. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to copy. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * @cb: Optional callback invoked after completion. + * @args: Argument passed to the callback. + */ +void dto_memcpy(void *dest, const void *src, size_t n, + int flags, callback_t cb, void* args); + +/** + * dto_memmove_default - Move memory using DTO's default configuration. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to move. + */ +void dto_memmove_default(void *dest, const void *src, size_t n); + +/** + * dto_memmove_cfg - Move memory using a caller provided configuration. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to move. + * @cfg: DTO configuration to use for this call. 
+ * @cb: Optional callback invoked after completion. + * @args: Argument passed to the callback. + */ +void dto_memmove_cfg(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args); + +/** + * dto_memmove - Move memory using a configuration derived from @flags. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to move. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * @cb: Optional callback invoked after completion. + * @args: Argument passed to the callback. + */ +void dto_memmove(void *dest, const void *src, size_t n, + int flags, callback_t cb, void* args); + +/** + * dto_memset_default - Set memory using DTO's default configuration. + * @s: Destination buffer to fill. + * @c: Byte value to set. + * @n: Number of bytes to set. + */ +void dto_memset_default(void *s, int c, size_t n); + +/** + * dto_memset_cfg - Set memory using a caller provided configuration. + * @s: Destination buffer to fill. + * @c: Byte value to set. + * @n: Number of bytes to set. + * @cfg: DTO configuration to use for this call. + */ +void dto_memset_cfg(void *s, int c, size_t n, struct dto_call_cfg *cfg); + +/** + * dto_memset - Set memory using a configuration derived from @flags. + * @s: Destination buffer to fill. + * @c: Byte value to set. + * @n: Number of bytes to set. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + */ +void dto_memset(void *s, int c, size_t n, int flags); + +/** + * dto_memcmp_default - Compare memory using DTO's default configuration. + * @s1: First buffer. + * @s2: Second buffer. + * @n: Number of bytes to compare. + * + * Return: < 0, 0 or > 0 if s1 is found to be less than, equal to or greater + * than s2 respectively. + */ +int dto_memcmp_default(const void *s1, const void *s2, size_t n); + +/** + * dto_memcmp_cfg - Compare memory using a caller provided configuration. + * @s1: First buffer. + * @s2: Second buffer. + * @n: Number of bytes to compare. 
+ * @cfg: DTO configuration to use for this call. + * + * Return: Comparison result as in memcmp(). + */ +int dto_memcmp_cfg(const void *s1, const void *s2, size_t n, + struct dto_call_cfg *cfg); + +/** + * dto_memcmp - Compare memory using a configuration derived from @flags. + * @s1: First buffer. + * @s2: Second buffer. + * @n: Number of bytes to compare. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * + * Return: Comparison result as in memcmp(). + */ +int dto_memcmp(const void *s1, const void *s2, size_t n, int flags); + +#ifdef __cplusplus +} +#endif + +#endif + From dd3198d317b5fc8904e5e3efaac7ee70f889d2c3 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Fri, 12 Sep 2025 08:46:20 -0700 Subject: [PATCH 2/3] adds CRC --- Makefile | 4 +- README.md | 13 +++++ accelConfig.sh | 8 ++- dto.c | 146 +++++++++++++++++++++++++++++++++++++++++++++++++ dto.h | 77 ++++++++++++++++++++++++++ 5 files changed, 245 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 6f75c14..a79ef47 100644 --- a/Makefile +++ b/Makefile @@ -7,10 +7,10 @@ all: libdto dto-test-wodto DML_LIB_CXX=-D_GNU_SOURCE libdto: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg + gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -msse4.2 -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg libdto_nostats: dto.c - gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg + gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -msse4.2 -o libdto.so.1.0 -laccel-config -ldl -lnuma -mwaitpkg install: cp libdto.so.1.0 /usr/lib64/ diff --git a/README.md b/README.md index e923361..f7ab205 100644 --- a/README.md +++ b/README.md @@ -58,16 +58,23 @@ is available in three forms: - The unsuffixed name derives configuration from a bitwise OR of `DTO_API_*` flags. 
+We also support CRC32C offload via `dto_memcpy_crc` and `dto_crc` functions. `dto_memcpy_crc` +offloads both memcpy and CRC32C computation to DSA, while `dto_crc` offloads only CRC32C computation to DSA. + Available entry points include: - `dto_memcpy_default`, `dto_memcpy_cfg`, `dto_memcpy` +- `dto_memcpy_async` - `dto_memmove_default`, `dto_memmove_cfg`, `dto_memmove` - `dto_memset_default`, `dto_memset_cfg`, `dto_memset` - `dto_memcmp_default`, `dto_memcmp_cfg`, `dto_memcmp` +- `dto_memcpy_crc_default`, `dto_memcpy_crc_cfg`, `dto_memcpy_crc` +- `dto_crc_default`, `dto_crc_cfg`, `dto_crc` ### Sample usage ```c +#include #include "dto.h" int main(void) @@ -91,6 +98,12 @@ int main(void) /* Flags-based configuration */ dto_memset(dst, 0, sizeof(dst), DTO_API_WAIT_YIELD); + /* Copy while computing CRC */ + uint32_t c1 = dto_memcpy_crc_default(dst, src, sizeof(src)); + + /* Compute CRC with flags-based configuration */ + uint32_t c2 = dto_crc(src, sizeof(src), DTO_API_WAIT_YIELD, NULL, NULL); + return diff; } ``` diff --git a/accelConfig.sh b/accelConfig.sh index dd8b4d5..d134306 100755 --- a/accelConfig.sh +++ b/accelConfig.sh @@ -4,6 +4,7 @@ echo "OPTIONAL Arg-1: DSA device id. Default: 0" echo "OPTIONAL Arg-2: Enable/Disable DSA device. Default: yes" echo "OPTIONAL Arg-3: SHARED WQ id. Default: 1" echo "OPTIONAL Arg-4: ENGINE count. Default: 4" +echo "OPTIONAL Arg-5: Block on fault. Default: no, but this is needed for CRC32!" if [ "$#" -ge 5 ]; then echo "ERROR: Incorrect argument count. 
Expected arg count <= 4" @@ -14,6 +15,7 @@ DEVID=${1:-0} ENABLE=${2:-yes} SWQID=${3:-1} NENGS=${4:-4} +BLOCKONFAULT=${5:-no} DEV=dsa${DEVID} SWQ=${DEV}/wq${DEVID}.${SWQID} @@ -41,7 +43,11 @@ accel-config config-wq ${SWQ} --priority=1 accel-config config-wq ${SWQ} --wq-size=128 accel-config config-wq ${SWQ} --max-batch-size=1024 accel-config config-wq ${SWQ} --max-transfer-size=2147483648 -accel-config config-wq ${SWQ} --block-on-fault=0 +if [ "${BLOCKONFAULT}" == "yes" ]; then + accel-config config-wq ${SWQ} --block-on-fault=1 +else + accel-config config-wq ${SWQ} --block-on-fault=0 +fi accel-config config-wq ${SWQ} --type=user accel-config config-wq ${SWQ} --name="dsa-test" accel-config config-wq ${SWQ} --mode=shared diff --git a/dto.c b/dto.c index 6a65c2e..82650ab 100644 --- a/dto.c +++ b/dto.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "dto.h" #define likely(x) __builtin_expect((x), 1) @@ -73,6 +74,7 @@ struct dto_wq { uint64_t dsa_gencap; int wq_size; uint32_t max_transfer_size; + int block_on_fault; int wq_fd; void *wq_portal; bool wq_mmapped; @@ -275,6 +277,29 @@ static double max_avg_waits = MAX_AVG_YIELD_WAITS; extern char *__progname; +static uint32_t crc32c_hw(const uint8_t *data, size_t len) +{ + uint32_t crc = 0; + + while (len >= sizeof(uint64_t)) { + crc = _mm_crc32_u64(crc, *(const uint64_t *)data); + data += sizeof(uint64_t); + len -= sizeof(uint64_t); + } + + while (len >= sizeof(uint32_t)) { + crc = _mm_crc32_u32(crc, *(const uint32_t *)data); + data += sizeof(uint32_t); + len -= sizeof(uint32_t); + } + + while (len--) { + crc = _mm_crc32_u8(crc, *data++); + } + + return crc; +} + static void dto_log(int req_log_level, const char *fmt, ...) 
{ char buf[512]; @@ -881,6 +906,7 @@ static int dsa_init_from_wq_list(char *wq_list) close(dir_fd); goto fail_wq; } + wqs[num_wqs].block_on_fault = dto_get_param_ullong(dir_fd, "block_on_fault", &rc); dto_get_param_string(dir_fd, "mode", wq_mode); @@ -1036,6 +1062,7 @@ static int dsa_init_from_accfg(void) wqs[num_wqs].wq_size = accfg_wq_get_size(wq); wqs[num_wqs].max_transfer_size = accfg_wq_get_max_transfer_size(wq); + wqs[num_wqs].block_on_fault = accfg_wq_get_block_on_fault(wq); wqs[num_wqs].acc_wq = wq; wqs[num_wqs].dsa_gencap = accfg_device_get_gen_cap(device); @@ -1545,6 +1572,90 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n) return true; } +static uint32_t __dto_crc(const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void *args) +{ + struct dto_wq *wq = get_wq((void *)src, cfg); + if (wq->block_on_fault == 0) { + LOG_ERROR("DSA WQ %s does not support block on fault. Falling back to CPU crc32c\n", wq->wq_path); + /* no data copy is needed here; compute CRC32C directly on the CPU */ + if (cb) + cb(args); + return crc32c_hw(src, n); + } + int result; + + thr_bytes_completed = 0; + + thr_desc.opcode = DSA_OPCODE_CRCGEN; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF; + if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_desc.src_addr = (uint64_t)src; + thr_desc.dst_addr = 0; + thr_desc.xfer_size = (uint32_t)n; + thr_desc.crc_seed = 0; + thr_desc.rsvd = 0; + thr_comp.status = 0; + + result = dsa_submit(wq, &thr_desc); + if (result == SUCCESS) { + if (cb) + cb(args); + result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); + } + + if (result != SUCCESS || thr_bytes_completed < n) + return 0; + + return (uint32_t)thr_comp.crc_val; +} + +static uint32_t __dto_memcpy_crc(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void *args) +{ + int use_orig = USE_ORIG_FUNC(n, dto_dsa_memcpy); 
+ if (use_orig) { + orig_memcpy(dest, src, n); + if (cb) + cb(args); + return crc32c_hw(src, n); + } + + struct dto_wq *wq = get_wq(dest, cfg); + int result; + + thr_bytes_completed = 0; + + thr_desc.opcode = DSA_OPCODE_COPY_CRC; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_BOF; + if (cfg->cache_control && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; + + thr_desc.src_addr = (uint64_t)src; + thr_desc.dst_addr = (uint64_t)dest; + thr_desc.xfer_size = (uint32_t)n; + thr_desc.crc_seed = 0; + thr_desc.rsvd = 0; + thr_comp.status = 0; + + result = dsa_submit(wq, &thr_desc); + if (result == SUCCESS) { + if (cb) + cb(args); + result = dsa_wait(wq, &thr_desc, &thr_comp.status, cfg); + } + + if (result != SUCCESS || thr_bytes_completed < n) + return 0; + + return (uint32_t)thr_comp.crc_val; +} + static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result, callback_t cb, void* args, const struct dto_call_cfg *cfg) @@ -1759,6 +1870,22 @@ __attribute__((visibility("default"))) void dto_memcpy_cfg(void *dest, const voi __dto_memcpy(dest, src, n, cfg, cb, args); } +__attribute__((visibility("default"))) uint32_t dto_memcpy_crc_default(void *dest, const void *src, size_t n) { + return __dto_memcpy_crc(dest, src, n, &dto_default_cfg, NULL, NULL); +} + +__attribute__((visibility("default"))) uint32_t dto_memcpy_crc(void *dest, const void *src, size_t n, + int flags, callback_t cb, void* args) { + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + return __dto_memcpy_crc(dest, src, n, &cfg, cb, args); +} + +__attribute__((visibility("default"))) uint32_t dto_memcpy_crc_cfg(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args) { + return __dto_memcpy_crc(dest, src, n, cfg, cb, args); +} + void __dto_memmove(void *dest, const void *src, size_t n, struct dto_call_cfg *cfg, callback_t cb, void* args) { int 
result = 0; @@ -1950,6 +2077,25 @@ __attribute__((visibility("default"))) int dto_memcmp_cfg(const void *s1, const return __dto_memcmp(s1, s2, n, cfg); } +__attribute__((visibility("default"))) uint32_t dto_crc_default(const void *src, size_t n) +{ + return __dto_crc(src, n, &dto_default_cfg, NULL, NULL); +} + +__attribute__((visibility("default"))) uint32_t dto_crc(const void *src, size_t n, + int flags, callback_t cb, void *args) +{ + struct dto_call_cfg cfg; + init_dto_cfg(&cfg, flags); + return __dto_crc(src, n, &cfg, cb, args); +} + +__attribute__((visibility("default"))) uint32_t dto_crc_cfg(const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void *args) +{ + return __dto_crc(src, n, cfg, cb, args); +} + /* The dto_internal_mem* APIs are used only when mem* APIs are * called before DTO is properly initialized. So these * implementations dont have to be performant diff --git a/dto.h b/dto.h index cac8518..84745b3 100644 --- a/dto.h +++ b/dto.h @@ -19,6 +19,8 @@ #define DTO_DSA_COMPLETE_OFFLOAD 8192 #define DTO_AUTO_SPLIT 16384 +#include + #ifdef __cplusplus extern "C" { #endif @@ -85,6 +87,46 @@ void dto_memcpy_cfg(void *dest, const void *src, size_t n, void dto_memcpy(void *dest, const void *src, size_t n, int flags, callback_t cb, void* args); + +/** + * dto_memcpy_crc_default - Copy memory and compute CRC using DTO defaults. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to copy. + * + * Return: CRC32C value of the copied data. + */ +uint32_t dto_memcpy_crc_default(void *dest, const void *src, size_t n); + +/** + * dto_memcpy_crc_cfg - Copy memory and compute CRC using caller configuration. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to copy. + * @cfg: DTO configuration to use for this call. + * @cb: Optional callback invoked after submission. + * @args: Argument passed to the callback. + * + * Return: CRC32C value of the copied data. 
+ */ +uint32_t dto_memcpy_crc_cfg(void *dest, const void *src, size_t n, + struct dto_call_cfg *cfg, callback_t cb, void* args); + +/** + * dto_memcpy_crc - Copy memory and compute CRC using @flags derived + * configuration. + * @dest: Destination buffer. + * @src: Source buffer. + * @n: Number of bytes to copy. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * @cb: Optional callback invoked after submission. + * @args: Argument passed to the callback. + * + * Return: CRC32C value of the copied data. + */ +uint32_t dto_memcpy_crc(void *dest, const void *src, size_t n, + int flags, callback_t cb, void* args); + /** * dto_memmove_default - Move memory using DTO's default configuration. * @dest: Destination buffer. @@ -177,6 +219,41 @@ int dto_memcmp_cfg(const void *s1, const void *s2, size_t n, */ int dto_memcmp(const void *s1, const void *s2, size_t n, int flags); +/** + * dto_crc_default - Compute CRC using DTO's default configuration. + * @src: Buffer to checksum. + * @n: Number of bytes to process. + * + * Return: CRC32C value of the buffer. + */ +uint32_t dto_crc_default(const void *src, size_t n); + +/** + * dto_crc_cfg - Compute CRC using a caller provided configuration. + * @src: Buffer to checksum. + * @n: Number of bytes to process. + * @cfg: DTO configuration to use for this call. + * @cb: Optional callback invoked after submission. + * @args: Argument passed to the callback. + * + * Return: CRC32C value of the buffer. + */ +uint32_t dto_crc_cfg(const void *src, size_t n, struct dto_call_cfg *cfg, + callback_t cb, void* args); + +/** + * dto_crc - Compute CRC using a configuration derived from @flags. + * @src: Buffer to checksum. + * @n: Number of bytes to process. + * @flags: Bitwise OR of DTO_API_* values that override defaults. + * @cb: Optional callback invoked after submission. + * @args: Argument passed to the callback. + * + * Return: CRC32C value of the buffer. 
+ */ +uint32_t dto_crc(const void *src, size_t n, int flags, callback_t cb, + void* args); + #ifdef __cplusplus } #endif From 9521e33cd1edd47146d4591b0ee9bf67fb4982f9 Mon Sep 17 00:00:00 2001 From: Daniel Byrne Date: Mon, 15 Sep 2025 05:53:39 -0700 Subject: [PATCH 3/3] fix readme --- README.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/README.md b/README.md index f7ab205..870455d 100644 --- a/README.md +++ b/README.md @@ -41,13 +41,6 @@ can be enabled or disabled using an environment variable DTO_AUTO_ADJUST_KNOBS. DTO can also be used to learn certain application characterics by building histogram of various API types and sizes. The histogram can be built using an environment variable DTO_COLLECT_STATS. -Finally, DTO offers an API to allow applications to pass a function pointer to be called while waiting for DSA to complete the operation. This can be used to perform other work while waiting for DSA to complete the operation. The function signature is: - -```bash -dto_memcpy_async(void *dest, const void *src, size_t n, callback_t cb, void* args); -``` -where callback_t cb is a function pointer in the calling application. If the callback terminates before DSA completes the operation, the specified wait method is used to complete the waiting. - ## DTO API DTO provides drop-in replacements for standard memory operations. Each function @@ -64,7 +57,6 @@ offloads both memcpy and CRC32C computation to DSA, while `dto_crc` offloads onl Available entry points include: - `dto_memcpy_default`, `dto_memcpy_cfg`, `dto_memcpy` -- `dto_memcpy_async` - `dto_memmove_default`, `dto_memmove_cfg`, `dto_memmove` - `dto_memset_default`, `dto_memset_cfg`, `dto_memset` - `dto_memcmp_default`, `dto_memcmp_cfg`, `dto_memcmp`