From f01d9b1c4d2b7b30104a97501e36ac7806447860 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Fri, 1 Mar 2024 11:42:29 +0100
Subject: [PATCH 01/23] Binaries added to ignore list

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 .gitignore
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..221b15a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+dto-test*
+libdto.so*

From c9fbe216f511a3a5085fb2effd4e9af6ddbcb45d Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Fri, 1 Mar 2024 11:43:41 +0100
Subject: [PATCH 02/23] Numa awareness implementation

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 Makefile  |   4 +-
 README.md |   6 +-
 dto.c     | 161 ++++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 152 insertions(+), 19 deletions(-)

diff --git a/Makefile b/Makefile
index b58c56a..92665ce 100644
--- a/Makefile
+++ b/Makefile
@@ -7,10 +7,10 @@ all: libdto dto-test-wodto
 DML_LIB_CXX=-D_GNU_SOURCE
 
 libdto: dto.c
-	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl
+	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -DDTO_STATS_SUPPORT -o libdto.so.1.0 -laccel-config -ldl -lnuma
 
 libdto_nostats: dto.c
-	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl
+	gcc -shared -fPIC -Wl,-soname,libdto.so dto.c $(DML_LIB_CXX) -o libdto.so.1.0 -laccel-config -ldl -lnuma
 
 install:
 	cp libdto.so.1.0 /usr/lib64/
diff --git a/README.md b/README.md
index 54024bf..ef334f9 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,9 @@ DTO library falls back to using standard APIs on CPU under following scenarios:
 To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" execution using following steps.
 1) After intercepting the API call, DTO splits the API job into two parts; 1) CPU job and 2) DSA job. For example, a 64 KB memcpy may
    be split into 20 KB CPU job and 44 KB DSA job. The split fraction can be configured using an environment variable DTO_CPU_SIZE_FRACTION. 
-2) DTO submits the DSA portion of the job to DSA.
+2) DTO submits the DSA portion of the job to DSA. 
+   If DTO_IS_NUMA_AWARE=1 DTO uses works queues of DSA device located on the same numa node as 
+   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method. This reduces UPI traffic.
 3) In parallel, DTO performs the CPU portion of the job using std library on CPU.
 4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD.
 
@@ -44,6 +46,7 @@ Following environment variables control the behavior of DTO library:
 	DTO_MIN_BYTES=xxxx (specifies minimum size of API call needed for DSA operation execution, default is 8192 bytes)
 	DTO_CPU_SIZE_FRACTION=0.xx (specifies fraction of job performed by CPU, in parallel to DSA). Default is 0.00
 	DTO_AUTO_ADJUST_KNOBS=0/1 (disables/enables auto tuning of cpu_size_fraction and dsa_min_bytes parameters. 0 -- disable, 1 -- enable (default))
+   DTO_IS_NUMA_AWARE=0/1 (disables/enables numa awareness. 0 -- disable (default), 1 -- enable)
 	DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
 				If not specified, DTO will try to auto-discover and use all available WQs.
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
@@ -89,6 +92,7 @@ make dto-test-wodto
             export DTO_CPU_SIZE_FRACTION=0.33
             export DTO_AUTO_ADJUST_KNOBS=1
             export DTO_WQ_LIST="wq0.0;wq2.0;wq4.0;wq6.0"
+            export DTO_IS_NUMA_AWARE=1
 	iii. Run the application - (CacheBench example below)
     3b. Using LD_PRELOAD method (doesn not require recompiling the application)
 	i. setup LD_PRELOAD environment variable to point to DTO library
diff --git a/dto.c b/dto.c
index 3d16ffb..cef139d 100644
--- a/dto.c
+++ b/dto.c
@@ -21,6 +21,8 @@
 #include <pthread.h>
 #include <dlfcn.h>
 #include <accel-config/libaccel_config.h>
+#include <numaif.h>
+#include <numa.h>
 
 #define likely(x)       __builtin_expect((x), 1)
 #define unlikely(x)     __builtin_expect((x), 0)
@@ -43,6 +45,8 @@
  * a hang (e.g., memset --> malloc --> alloc library calls memset --> memset)
  */
 #define MAX_WQS 16
+#define MAX_NUMA_NODES 16
+#define MAX_WQS_IN_DSA_DEVICE 8
 #define DTO_DEFAULT_MIN_SIZE 8192
 #define DTO_INITIALIZED 0
 #define DTO_INITIALIZING 1
@@ -68,6 +72,12 @@ struct dto_wq {
 	void *wq_portal;
 };
 
+struct dto_device {
+	struct dto_wq* wqs[MAX_WQS_IN_DSA_DEVICE];
+	uint8_t num_wqs;
+	atomic_uchar next_wq;
+};
+
 enum wait_options {
 	WAIT_BUSYPOLL = 0,
 	WAIT_UMWAIT,
@@ -76,11 +86,13 @@ enum wait_options {
 
 // global workqueue variables
 static struct dto_wq wqs[MAX_WQS];
+static struct dto_device* devices[MAX_NUMA_NODES];
 static uint8_t num_wqs;
-static uint8_t next_wq;
+static atomic_uchar next_wq;
 static uint8_t dto_initialized;
 static uint8_t dto_initializing;
 static uint8_t use_std_lib_calls;
+static uint8_t is_numa_aware;
 static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
 static int wait_method = WAIT_YIELD;
 static double cpu_size_fraction;
@@ -650,6 +662,36 @@ static unsigned long long dto_get_param_ullong(int dir_fd, char *path, int *err)
 	return val;
 }
 
+static void correct_devices_list() {
+/* 	Fill NULL gaps in devices list.
+	For SNC configurations there are less DSA devices then numa nodes
+	ex. SNC4: 8 numa nodes, 2 DSA devices:
+	dsa0 device has numa_node = 0, dsa2 device has numa_node = 4
+	Then we should use dsa0 device for numa_nodes = 0,1,2,3 and dsa2 device for numa_nodes = 4,5,6,7
+	and model the same in devices list.	
+ */	
+ 	struct dto_device* dev = NULL;
+	for (uint8_t i = 0; i < numa_num_configured_nodes(); i++) {
+		if (devices[i] != NULL) {
+			dev = devices[i];
+			continue;
+		} else {
+			devices[i] = dev;
+		}
+	}
+}
+
+static void cleanup_devices() {
+	struct dto_device* dev = NULL;
+	for (uint i = 0; i < MAX_NUMA_NODES; i++) {
+		if (devices[i] != dev) {
+			dev = devices[i];
+			free(devices[i]);
+		}
+		devices[i] = NULL;
+	}
+}
+ 
 static int dsa_init_from_wq_list(char *wq_list)
 {
 	char *wq;
@@ -680,10 +722,18 @@ static int dsa_init_from_wq_list(char *wq_list)
 		}
 
 		wqs[num_wqs].dsa_gencap = dto_get_param_ullong(dir_fd, "gen_cap", &rc);
-		close(dir_fd);
+		if (rc) {
+			close(dir_fd);
+			goto fail_wq;
+		}
 
-		if (rc)
+		const uint8_t dev_numa_node = (uint8_t)dto_get_param_ullong(dir_fd, "numa_node", &rc);
+		if (rc) {
+			close(dir_fd);
 			goto fail_wq;
+		}
+
+		close(dir_fd);
 
 		snprintf(file_path, PATH_MAX, "/sys/bus/dsa/devices/%s", wq);
 
@@ -735,6 +785,17 @@ static int dsa_init_from_wq_list(char *wq_list)
 			goto fail_wq;
 		}
 
+		if (is_numa_aware) {
+			struct dto_device* dev = devices[dev_numa_node] == NULL ? calloc(1, sizeof(struct dto_device)) : devices[dev_numa_node];
+			if (dev != NULL && 
+				dev->num_wqs < MAX_WQS_IN_DSA_DEVICE) {
+					if (dev->num_wqs == 0) {
+						devices[dev_numa_node] = dev;
+					}
+					dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
+			}
+		}
+
 		++num_wqs;
 		if (num_wqs == MAX_WQS)
 			break;
@@ -746,12 +807,20 @@ static int dsa_init_from_wq_list(char *wq_list)
 		rc = -EINVAL;
 		goto fail;
 	}
+
+	if (is_numa_aware) {
+		correct_devices_list();
+	}
+
 	return 0;
 
 fail_wq:
 	for (int j = 0; j < num_wqs; j++)
 		munmap(wqs[j].wq_portal, 0x1000);
 	num_wqs = 0;
+
+	cleanup_devices();
+
 fail:
 	return rc;
 }
@@ -776,7 +845,11 @@ static int dsa_init_from_accfg(void)
 	num_wqs = 0;
 
 	accfg_device_foreach(dto_ctx, device) {
-		enum accfg_device_state dstate;
+		enum accfg_device_state dstate;		
+
+    	/* use dsa devices only*/
+		if (strncmp(accfg_device_get_devname(device), "dsa", 3)!= 0)
+			continue;
 
 		/* Make sure that the device is enabled */
 		dstate = accfg_device_get_state(device);
@@ -790,6 +863,14 @@ static int dsa_init_from_accfg(void)
 		if (i != num_wqs)
 			continue;
 
+		struct dto_device* dev = NULL;
+		uint8_t dev_numa_node = 0;
+
+		if (is_numa_aware) {
+			dev = calloc(1, sizeof(struct dto_device));
+			dev_numa_node = accfg_device_get_numa_node(device);
+		}
+
 		accfg_wq_foreach(device, wq) {
 			enum accfg_wq_state wstate;
 			enum accfg_wq_mode mode;
@@ -818,9 +899,20 @@ static int dsa_init_from_accfg(void)
 
 			used_devids[num_wqs] = accfg_device_get_id(device);
 
+			if (is_numa_aware && 
+				dev->num_wqs < MAX_WQS_IN_DSA_DEVICE && 
+				dev != NULL) {
+				dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
+			}
+
 			num_wqs++;
-			break;
+			// break;
 		}
+
+		if (is_numa_aware) {
+			devices[dev_numa_node] = dev;
+		}
+
 		if (num_wqs == MAX_WQS)
 			break;
 	}
@@ -858,6 +950,10 @@ static int dsa_init_from_accfg(void)
 		}
 	}
 
+	if (is_numa_aware) {
+		correct_devices_list();
+	}
+
 	accfg_unref(dto_ctx);
 	return 0;
 
@@ -865,6 +961,8 @@ static int dsa_init_from_accfg(void)
 	for (int j = 0; j < i; j++)
 		munmap(wqs[j].wq_portal, 0x1000);
 	num_wqs = 0;
+
+	cleanup_devices();
 fail:
 	accfg_unref(dto_ctx);
 	return rc;
@@ -970,6 +1068,17 @@ static int init_dto(void)
 			use_std_lib_calls = !!use_std_lib_calls;
 		}
 
+		env_str = getenv("DTO_IS_NUMA_AWARE");
+		if (env_str != NULL) {
+			errno = 0;
+			is_numa_aware = strtoul(env_str, NULL, 10);
+			if (errno)
+				is_numa_aware = 0;
+
+			is_numa_aware = !!is_numa_aware &&
+							numa_available() != -1;
+		}
+
 #ifdef DTO_STATS_SUPPORT
 		env_str = getenv("DTO_COLLECT_STATS");
 		if (env_str != NULL) {
@@ -1053,9 +1162,9 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f wait_method: %s, auto_adjust_knobs %d\n",
+				"cpu_size_fraction: %.2f wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %d\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs);
+				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, is_numa_aware);
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap);
@@ -1080,23 +1189,43 @@ static void cleanup_dto(void)
 #endif
 	if (log_fd != -1)
 		close(log_fd);
+	
+	cleanup_devices();
 }
 
-static __always_inline  struct dto_wq *get_wq(void)
+static __always_inline  struct dto_wq *get_wq(void* buf)
 {
-	/* No need to have strict round robin wq usage
-	 * in order to avoid using locked instructions
-	 */
-	int wq_idx = next_wq++ % num_wqs;
+	struct dto_wq* wq = NULL;
+
+	if (is_numa_aware &&
+		buf != NULL) {
+		int status[1] = {-1};
+
+		// get numa node of memory pointed by buf
+		if (move_pages(0, 1, &buf, NULL, status, 0) == 0) {
+			int buf_numa_node = status[0];
+			if (buf_numa_node < MAX_NUMA_NODES) {
+				struct dto_device* dev = devices[buf_numa_node];
+				if (dev != NULL && 
+					dev->num_wqs > 0) {
+					wq = dev->wqs[dev->next_wq++ % dev->num_wqs];
+				}			
+			}
+		}
+	}
 
-	return &wqs[wq_idx];
+	if (wq == NULL) {
+		wq = &wqs[next_wq++ % num_wqs];
+	}
+	
+	return wq;
 }
 
 static void dto_memset(void *s, int c, size_t n, int *result)
 {
 	uint64_t memset_pattern;
 	size_t cpu_size, dsa_size;
-	struct dto_wq *wq = get_wq();
+	struct dto_wq *wq = get_wq(s);
 
 	for (int i = 0; i < 8; ++i)
 		((uint8_t *) &memset_pattern)[i] = (uint8_t) c;
@@ -1177,7 +1306,7 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
 
 static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
 {
-	struct dto_wq *wq = get_wq();
+	struct dto_wq *wq = get_wq(dest);
 	size_t cpu_size, dsa_size;
 
 	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
@@ -1262,7 +1391,7 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
 
 static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result)
 {
-	struct dto_wq *wq = get_wq();
+	struct dto_wq *wq = get_wq((void*)s2);
 	int cmp_result = 0;
 	size_t orig_n = n;
 

From 4a040a0b563961b40784a81c87a148c07949e757 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 11 Mar 2024 10:28:47 +0100
Subject: [PATCH 03/23] dsa_init_from_wq_list: when dedicated wq found skip it
 and continue processing of wq list

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/dto.c b/dto.c
index cef139d..315289f 100644
--- a/dto.c
+++ b/dto.c
@@ -752,10 +752,14 @@ static int dsa_init_from_wq_list(char *wq_list)
 
 		dto_get_param_string(dir_fd, "mode", wq_mode);
 
-		if (strcmp(wq_mode, "dedicated") == 0) {
+        if (wq_mode[0] == '\0') {
 			close(dir_fd);
 			rc = -ENOTSUP;
-			goto fail_wq;
+			goto fail_wq;			
+		}
+
+		if (strcmp(wq_mode, "shared") != 0) {
+			continue;
 		}
 
 		wqs[num_wqs].wq_size = dto_get_param_ullong(dir_fd, "size", &rc);

From e811f206db346546410cd3d11a70cff724768be5 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 11 Mar 2024 14:34:40 +0100
Subject: [PATCH 04/23] Static code analysis with clangd

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/dto.c b/dto.c
index 315289f..524c0f5 100644
--- a/dto.c
+++ b/dto.c
@@ -89,8 +89,8 @@ static struct dto_wq wqs[MAX_WQS];
 static struct dto_device* devices[MAX_NUMA_NODES];
 static uint8_t num_wqs;
 static atomic_uchar next_wq;
-static uint8_t dto_initialized;
-static uint8_t dto_initializing;
+static atomic_uchar dto_initialized;
+static atomic_uchar dto_initializing;
 static uint8_t use_std_lib_calls;
 static uint8_t is_numa_aware;
 static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
@@ -287,7 +287,7 @@ static void child (void)
 	init_dto();
 }
 
-static __always_inline inline unsigned char enqcmd(struct dsa_hw_desc *desc, volatile void *reg)
+static __always_inline unsigned char enqcmd(struct dsa_hw_desc *desc, volatile void *reg)
 {
 	unsigned char retry;
 
@@ -297,18 +297,18 @@ static __always_inline inline unsigned char enqcmd(struct dsa_hw_desc *desc, vol
 	return retry;
 }
 
-static __always_inline inline void movdir64b(struct dsa_hw_desc *desc, volatile void *reg)
+static __always_inline void movdir64b(struct dsa_hw_desc *desc, volatile void *reg)
 {
 	asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02\t\n"
 		: : "a" (reg), "d" (desc));
 }
 
-static __always_inline inline void umonitor(const volatile void *addr)
+static __always_inline void umonitor(const volatile void *addr)
 {
 	asm volatile(".byte 0xf3, 0x48, 0x0f, 0xae, 0xf0" : : "a"(addr));
 }
 
-static __always_inline inline int umwait(unsigned long timeout, unsigned int state)
+static __always_inline int umwait(unsigned long timeout, unsigned int state)
 {
 	uint8_t r;
 	uint32_t timeout_low = (uint32_t)timeout;
@@ -411,7 +411,7 @@ static __always_inline void dsa_wait_and_adjust(const volatile uint8_t *comp)
 	adjust_num_waits += local_num_waits;
 
 	if (adjust_num_descs >= NUM_DESCS) {
-		atomic_ullong temp = adjust_num_descs;
+		unsigned long long temp = adjust_num_descs;
 
 		if (temp && atomic_compare_exchange_strong(&adjust_num_descs, &temp, 0)) {
 			double avg_num_waits = (double)adjust_num_waits / temp;

From faf59a31cd1cf28a36df73a3b39133c4715ade89 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 18 Mar 2024 10:18:08 +0100
Subject: [PATCH 05/23] Code review changes

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md |  4 ++--
 dto.c     | 54 ++++++++++++++++++++++++++++--------------------------
 2 files changed, 30 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index ef334f9..3e56d26 100644
--- a/README.md
+++ b/README.md
@@ -20,8 +20,8 @@ To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" ex
 1) After intercepting the API call, DTO splits the API job into two parts; 1) CPU job and 2) DSA job. For example, a 64 KB memcpy may
    be split into 20 KB CPU job and 44 KB DSA job. The split fraction can be configured using an environment variable DTO_CPU_SIZE_FRACTION. 
 2) DTO submits the DSA portion of the job to DSA. 
-   If DTO_IS_NUMA_AWARE=1 DTO uses works queues of DSA device located on the same numa node as 
-   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method. This reduces UPI traffic.
+   If DTO_IS_NUMA_AWARE=1 DTO uses work queues of DSA device located on the same numa node as 
+   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method. This reduces cross-socket traffic.
 3) In parallel, DTO performs the CPU portion of the job using std library on CPU.
 4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD.
 
diff --git a/dto.c b/dto.c
index 524c0f5..b2ae418 100644
--- a/dto.c
+++ b/dto.c
@@ -44,9 +44,8 @@
  * Allocating memory dynamically may create cyclic dependency and may cause
  * a hang (e.g., memset --> malloc --> alloc library calls memset --> memset)
  */
-#define MAX_WQS 16
-#define MAX_NUMA_NODES 16
-#define MAX_WQS_IN_DSA_DEVICE 8
+#define MAX_WQS 32
+#define MAX_NUMA_NODES 32
 #define DTO_DEFAULT_MIN_SIZE 8192
 #define DTO_INITIALIZED 0
 #define DTO_INITIALIZING 1
@@ -73,7 +72,7 @@ struct dto_wq {
 };
 
 struct dto_device {
-	struct dto_wq* wqs[MAX_WQS_IN_DSA_DEVICE];
+	struct dto_wq* wqs[MAX_WQS];
 	uint8_t num_wqs;
 	atomic_uchar next_wq;
 };
@@ -662,6 +661,19 @@ static unsigned long long dto_get_param_ullong(int dir_fd, char *path, int *err)
 	return val;
 }
 
+/* Return the device record for dev_numa_node, allocating it on first use.
+ * Returns NULL when the node is outside [0, MAX_NUMA_NODES) or calloc fails. */
+static struct dto_device* get_dto_device(int dev_numa_node) {
+	/* A negative or oversized node id would index outside devices[]. */
+	if (dev_numa_node < 0 || dev_numa_node >= MAX_NUMA_NODES)
+		return NULL;
+
+	if (devices[dev_numa_node] == NULL)
+		devices[dev_numa_node] = calloc(1, sizeof(struct dto_device));
+
+	return devices[dev_numa_node];
+}
+
 static void correct_devices_list() {
 /* 	Fill NULL gaps in devices list.
 	For SNC configurations there are less DSA devices then numa nodes
@@ -671,10 +683,9 @@ static void correct_devices_list() {
 	and model the same in devices list.	
  */	
  	struct dto_device* dev = NULL;
-	for (uint8_t i = 0; i < numa_num_configured_nodes(); i++) {
+	for (uint8_t i = 0; i < MAX_NUMA_NODES; i++) {
 		if (devices[i] != NULL) {
 			dev = devices[i];
-			continue;
 		} else {
 			devices[i] = dev;
 		}
@@ -727,7 +738,7 @@ static int dsa_init_from_wq_list(char *wq_list)
 			goto fail_wq;
 		}
 
-		const uint8_t dev_numa_node = (uint8_t)dto_get_param_ullong(dir_fd, "numa_node", &rc);
+		const int dev_numa_node = (int)dto_get_param_ullong(dir_fd, "numa_node", &rc);
 		if (rc) {
 			close(dir_fd);
 			goto fail_wq;
@@ -790,13 +801,10 @@ static int dsa_init_from_wq_list(char *wq_list)
 		}
 
 		if (is_numa_aware) {
-			struct dto_device* dev = devices[dev_numa_node] == NULL ? calloc(1, sizeof(struct dto_device)) : devices[dev_numa_node];
+			struct dto_device* dev = get_dto_device(dev_numa_node);
 			if (dev != NULL && 
-				dev->num_wqs < MAX_WQS_IN_DSA_DEVICE) {
-					if (dev->num_wqs == 0) {
-						devices[dev_numa_node] = dev;
-					}
-					dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
+				dev->num_wqs < MAX_WQS) {
+				dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
 			}
 		}
 
@@ -868,12 +876,11 @@ static int dsa_init_from_accfg(void)
 			continue;
 
 		struct dto_device* dev = NULL;
-		uint8_t dev_numa_node = 0;
 
 		if (is_numa_aware) {
-			dev = calloc(1, sizeof(struct dto_device));
-			dev_numa_node = accfg_device_get_numa_node(device);
-		}
+			const int dev_numa_node = accfg_device_get_numa_node(device);
+			dev = get_dto_device(dev_numa_node);
+		}	
 
 		accfg_wq_foreach(device, wq) {
 			enum accfg_wq_state wstate;
@@ -903,18 +910,13 @@ static int dsa_init_from_accfg(void)
 
 			used_devids[num_wqs] = accfg_device_get_id(device);
 
-			if (is_numa_aware && 
-				dev->num_wqs < MAX_WQS_IN_DSA_DEVICE && 
-				dev != NULL) {
+			if (is_numa_aware &&
+				dev != NULL &&
+				dev->num_wqs < MAX_WQS) {
 				dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
 			}
 
 			num_wqs++;
-			// break;
-		}
-
-		if (is_numa_aware) {
-			devices[dev_numa_node] = dev;
 		}
 
 		if (num_wqs == MAX_WQS)
@@ -1166,7 +1168,7 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %d\n",
+				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %d\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
 				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, is_numa_aware);
 			for (int i = 0; i < num_wqs; i++)

From 8e7901787f4e884a9ad74556ad800f126101e100 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Wed, 20 Mar 2024 16:12:30 +0100
Subject: [PATCH 06/23] Required packages list updated

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3e56d26..dc62394 100644
--- a/README.md
+++ b/README.md
@@ -59,9 +59,9 @@ Pre-requisite packages:
 
 Should use glibc version 2.36 or later for best DTO performance. You can use "ldd --version" command to check glibc version on your system. glibc versions less than 2.36 have a bug which reduces DTO performance.
 
-On Fedora/CentOS/Rhel: kernel-headers, accel-config-devel, libuuid-devel
+On Fedora/CentOS/Rhel: kernel-headers, accel-config-devel, libuuid-devel, libnuma-devel
 
-On Ubuntu/Debian: linux-libc-dev, libaccel-config-dev, uuid-dev
+On Ubuntu/Debian: linux-libc-dev, libaccel-config-dev, uuid-dev, libnuma-dev
 
 ```bash
 make libdto

From af28b39f71d0dfc98a0c591eef67c7003a34c039 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 25 Mar 2024 08:49:31 +0100
Subject: [PATCH 07/23] DSA dedicated mode added (shared mode is default)

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md |   3 +-
 dto.c     | 114 ++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 78 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index dc62394..0dbeefd 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,8 @@ Following environment variables control the behavior of DTO library:
 	DTO_CPU_SIZE_FRACTION=0.xx (specifies fraction of job performed by CPU, in parallel to DSA). Default is 0.00
 	DTO_AUTO_ADJUST_KNOBS=0/1 (disables/enables auto tuning of cpu_size_fraction and dsa_min_bytes parameters. 0 -- disable, 1 -- enable (default))
    DTO_IS_NUMA_AWARE=0/1 (disables/enables numa awareness. 0 -- disable (default), 1 -- enable)
-	DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
+   DTO_DSA_MODE=0/1 (shared/dedicated uses shared/dedicated DSA mode, 0 -- shared (default), 1 -- dedicated)
+   DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
 				If not specified, DTO will try to auto-discover and use all available WQs.
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
 	DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0).
diff --git a/dto.c b/dto.c
index b2ae418..a7f53d1 100644
--- a/dto.c
+++ b/dto.c
@@ -239,6 +239,7 @@ static atomic_ullong adjust_num_waits;
 static double min_avg_waits = MIN_AVG_YIELD_WAITS;
 static double max_avg_waits = MAX_AVG_YIELD_WAITS;
 static uint8_t auto_adjust_knobs = 1;
+static uint8_t dsa_mode = 0; // 0: shared, 1: dedicated
 
 extern char *__progname;
 
@@ -453,44 +454,70 @@ static __always_inline int dsa_wait(struct dto_wq *wq,
 static __always_inline int dsa_submit(struct dto_wq *wq,
 	struct dsa_hw_desc *hw)
 {
-	int retry;
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
 	__builtin_ia32_sfence();
-	for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
-		retry = enqcmd(hw, wq->wq_portal);
-		if (!retry)
+	switch (dsa_mode) {
+		case 0: { //shared
+			int retry;
+			for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+				retry = enqcmd(hw, wq->wq_portal);
+				if (!retry)
+					return SUCCESS;
+			}
+			return RETRIES;
+		}
+		case 1: { //dedicated
+			movdir64b(hw, wq->wq_portal);
 			return SUCCESS;
+		} 
 	}
-	return RETRIES;
+	return FAIL_OTHERS;
 }
 
 static __always_inline int dsa_execute(struct dto_wq *wq,
 	struct dsa_hw_desc *hw, volatile uint8_t *comp)
 {
-	int retry;
 	*comp = 0;
+	bool do_wait = false;
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
 	__builtin_ia32_sfence();
-	for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
-		retry = enqcmd(hw, wq->wq_portal);
-		if (!retry) {
-			if (auto_adjust_knobs)
-				dsa_wait_and_adjust(comp);
-			else
-				dsa_wait_no_adjust(comp);
-
-			if (*comp == DSA_COMP_SUCCESS) {
-				thr_bytes_completed += hw->xfer_size;
-				return SUCCESS;
-			} else if ((*comp & DSA_COMP_STATUS_MASK) == DSA_COMP_PAGE_FAULT_NOBOF) {
-				thr_bytes_completed += thr_comp.bytes_completed;
-				return PAGE_FAULT;
+	switch (dsa_mode) {
+		case 0: { //shared
+			int retry;
+			for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+				retry = enqcmd(hw, wq->wq_portal);
+				if (!retry) {
+					do_wait = true;
+					break;
+				}
 			}
-			LOG_ERROR("failed status %x xfersz %x\n", *comp, hw->xfer_size);
-			return FAIL_OTHERS;
+			if (!do_wait)
+				return RETRIES;
+		}
+		case 1: { //dedicated
+			movdir64b(hw, wq->wq_portal);
+			do_wait = true;
 		}
 	}
-	return RETRIES;
+
+	if (do_wait) {
+		if (auto_adjust_knobs)
+			dsa_wait_and_adjust(comp);
+		else
+			dsa_wait_no_adjust(comp);
+
+		if (*comp == DSA_COMP_SUCCESS) {
+			thr_bytes_completed += hw->xfer_size;
+			return SUCCESS;
+		} else if ((*comp & DSA_COMP_STATUS_MASK) == DSA_COMP_PAGE_FAULT_NOBOF) {
+			thr_bytes_completed += thr_comp.bytes_completed;
+			return PAGE_FAULT;
+		}
+		LOG_ERROR("failed status %x xfersz %x\n", *comp, hw->xfer_size);
+		return FAIL_OTHERS;
+	}
+
+	return FAIL_OTHERS;
 }
 
 #ifdef DTO_STATS_SUPPORT
@@ -769,7 +796,8 @@ static int dsa_init_from_wq_list(char *wq_list)
 			goto fail_wq;			
 		}
 
-		if (strcmp(wq_mode, "shared") != 0) {
+		if ((dsa_mode == 0 && strcmp(wq_mode, "shared") != 0) || 
+		    (dsa_mode == 1 && strcmp(wq_mode, "dedicated") != 0)) {
 			continue;
 		}
 
@@ -899,7 +927,8 @@ static int dsa_init_from_accfg(void)
 
 			/* the wq mode should be shared work queue */
 			mode = accfg_wq_get_mode(wq);
-			if (mode != ACCFG_WQ_SHARED)
+			if ((dsa_mode == 0 && mode == ACCFG_WQ_DEDICATED) ||
+			    (dsa_mode == 1 && mode == ACCFG_WQ_SHARED))
 				continue;
 
 			wqs[num_wqs].wq_size = accfg_wq_get_size(wq);
@@ -1074,17 +1103,6 @@ static int init_dto(void)
 			use_std_lib_calls = !!use_std_lib_calls;
 		}
 
-		env_str = getenv("DTO_IS_NUMA_AWARE");
-		if (env_str != NULL) {
-			errno = 0;
-			is_numa_aware = strtoul(env_str, NULL, 10);
-			if (errno)
-				is_numa_aware = 0;
-
-			is_numa_aware = !!is_numa_aware &&
-							numa_available() != -1;
-		}
-
 #ifdef DTO_STATS_SUPPORT
 		env_str = getenv("DTO_COLLECT_STATS");
 		if (env_str != NULL) {
@@ -1161,6 +1179,26 @@ static int init_dto(void)
 				auto_adjust_knobs = !!auto_adjust_knobs;
 			}
 
+			env_str = getenv("DTO_IS_NUMA_AWARE");
+			if (env_str != NULL) {
+				errno = 0;
+				is_numa_aware = strtoul(env_str, NULL, 10);
+				if (errno)
+					is_numa_aware = 0;
+
+				is_numa_aware = !!is_numa_aware && numa_available() != -1;
+			}
+
+			env_str = getenv("DTO_DSA_MODE");
+			if (env_str != NULL) {
+				errno = 0;
+				dsa_mode = strtoul(env_str, NULL, 10);
+				if (errno)
+					dsa_mode = 0;
+
+				dsa_mode = !!dsa_mode;
+			}
+
 			if (dsa_init()) {
 				LOG_ERROR("Didn't find any usable DSAs. Falling back to using CPUs.\n");
 				use_std_lib_calls = 1;
@@ -1168,9 +1206,9 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %d\n",
+				"cpu_size_fraction: %.2f wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %d, dsa_mode: %s\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, is_numa_aware);
+				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, is_numa_aware, dsa_mode == 0 ? "shared" : "dedicated");
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap);

From 729250e3fd3505b95dc80820deb2ccd1586e5577 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Fri, 5 Apr 2024 11:49:59 +0200
Subject: [PATCH 08/23] Buffer centric and cpu centric numa awareness added:
 DTO_IS_NUMA_AWARE=0,1,2

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md |  6 ++--
 dto.c     | 92 ++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index dc62394..89aa049 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,9 @@ To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" ex
    be split into 20 KB CPU job and 44 KB DSA job. The split fraction can be configured using an environment variable DTO_CPU_SIZE_FRACTION. 
 2) DTO submits the DSA portion of the job to DSA. 
    If DTO_IS_NUMA_AWARE=1 DTO uses work queues of DSA device located on the same numa node as 
-   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method. This reduces cross-socket traffic.
+   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method - buffer centric numa awareness.
+   If DTO_IS_NUMA_AWARE=2 DTO uses work queues of DSA device located on the same numa node as 
+   calling thread cpu - cpu centric numa awareness.
 3) In parallel, DTO performs the CPU portion of the job using std library on CPU.
 4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD.
 
@@ -46,7 +48,7 @@ Following environment variables control the behavior of DTO library:
 	DTO_MIN_BYTES=xxxx (specifies minimum size of API call needed for DSA operation execution, default is 8192 bytes)
 	DTO_CPU_SIZE_FRACTION=0.xx (specifies fraction of job performed by CPU, in parallel to DSA). Default is 0.00
 	DTO_AUTO_ADJUST_KNOBS=0/1 (disables/enables auto tuning of cpu_size_fraction and dsa_min_bytes parameters. 0 -- disable, 1 -- enable (default))
-   DTO_IS_NUMA_AWARE=0/1 (disables/enables numa awareness. 0 -- disable (default), 1 -- enable)
+   DTO_IS_NUMA_AWARE=0/1/2 (disables/buffer centric/cpu centric numa awareness. 0 -- disable (default), 1 -- buffer centric, 2 - cpu centric)
 	DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
 				If not specified, DTO will try to auto-discover and use all available WQs.
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
diff --git a/dto.c b/dto.c
index b2ae418..3b7bd65 100644
--- a/dto.c
+++ b/dto.c
@@ -83,6 +83,19 @@ enum wait_options {
 	WAIT_YIELD
 };
 
+enum numa_aware {
+	NA_NONE = 0,
+	NA_BUFFER_CENTRIC,
+	NA_CPU_CENTRIC,
+	NA_LAST_ENTRY
+};
+
+static const char * const numa_aware_names[] = {
+	[NA_NONE] = "none",
+	[NA_BUFFER_CENTRIC] = "buffer centric",
+	[NA_CPU_CENTRIC] = "cpu centric",
+};
+
 // global workqueue variables
 static struct dto_wq wqs[MAX_WQS];
 static struct dto_device* devices[MAX_NUMA_NODES];
@@ -91,7 +104,7 @@ static atomic_uchar next_wq;
 static atomic_uchar dto_initialized;
 static atomic_uchar dto_initializing;
 static uint8_t use_std_lib_calls;
-static uint8_t is_numa_aware;
+static enum numa_aware is_numa_aware;
 static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
 static int wait_method = WAIT_YIELD;
 static double cpu_size_fraction;
@@ -692,6 +705,45 @@ static void correct_devices_list() {
 	}
 }
 
+static __always_inline  int get_numa_node(void* buf) {
+	int numa_node = -1;
+	
+	switch (is_numa_aware) {
+        case NA_BUFFER_CENTRIC: {
+			if (buf != NULL) {
+				int status[1] = {-1};
+
+				// get numa node of memory pointed by buf
+				if (move_pages(0, 1, &buf, NULL, status, 0) == 0) {
+					numa_node = status[0];	
+				} else {
+					LOG_ERROR("move_pages call error: %d - %s", errno, strerror(errno));
+				}
+
+				// alternatively get_mempolicy can be used
+				// if (get_mempolicy(&numa_node, NULL, 0, (void *)buf, MPOL_F_NODE | MPOL_F_ADDR) != 0) {
+				// 	LOG_ERROR("get_mempolicy call error: %d - %s", errno, strerror(errno));
+				// }
+			}
+		}
+		break;
+		case NA_CPU_CENTRIC: {
+			const int cpu = sched_getcpu();
+			if (cpu != -1) {
+				numa_node = numa_node_of_cpu(sched_getcpu());
+			}
+			else {
+				LOG_ERROR("sched_getcpu call error: %d - %s", errno, strerror(errno));
+			}
+		}
+		break;
+        default:
+		break;
+        }
+
+        return numa_node;
+}
+
 static void cleanup_devices() {
 	struct dto_device* dev = NULL;
 	for (uint i = 0; i < MAX_NUMA_NODES; i++) {
@@ -1074,15 +1126,15 @@ static int init_dto(void)
 			use_std_lib_calls = !!use_std_lib_calls;
 		}
 
-		env_str = getenv("DTO_IS_NUMA_AWARE");
-		if (env_str != NULL) {
-			errno = 0;
-			is_numa_aware = strtoul(env_str, NULL, 10);
-			if (errno)
-				is_numa_aware = 0;
-
-			is_numa_aware = !!is_numa_aware &&
-							numa_available() != -1;
+		if (numa_available() != -1) {
+			env_str = getenv("DTO_IS_NUMA_AWARE");
+			if (env_str != NULL) {
+				errno = 0;
+				is_numa_aware = strtoul(env_str, NULL, 10);
+				if (errno || is_numa_aware >= NA_LAST_ENTRY) {
+					is_numa_aware = NA_NONE;
+				}
+			}
 		}
 
 #ifdef DTO_STATS_SUPPORT
@@ -1168,9 +1220,9 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %d\n",
+				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %s\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, is_numa_aware);
+				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware]);
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap);
@@ -1208,15 +1260,13 @@ static __always_inline  struct dto_wq *get_wq(void* buf)
 		int status[1] = {-1};
 
 		// get numa node of memory pointed by buf
-		if (move_pages(0, 1, &buf, NULL, status, 0) == 0) {
-			int buf_numa_node = status[0];
-			if (buf_numa_node < MAX_NUMA_NODES) {
-				struct dto_device* dev = devices[buf_numa_node];
-				if (dev != NULL && 
-					dev->num_wqs > 0) {
-					wq = dev->wqs[dev->next_wq++ % dev->num_wqs];
-				}			
-			}
+		const int numa_node = get_numa_node(buf);
+		if (numa_node >= 0 && numa_node < MAX_NUMA_NODES) {
+			struct dto_device* dev = devices[numa_node];
+			if (dev != NULL && 
+				dev->num_wqs > 0) {
+				wq = dev->wqs[dev->next_wq++ % dev->num_wqs];
+			}			
 		}
 	}
 

From 60a7247c9896b8b385fbf62bd30de09bbd20fb0d Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 8 Apr 2024 07:36:04 +0200
Subject: [PATCH 09/23] buffer centric/cpu centric changed to
 buffer-centric/cpu-centric

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md | 6 +++---
 dto.c     | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 89aa049..ebb60f4 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,9 @@ To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" ex
    be split into 20 KB CPU job and 44 KB DSA job. The split fraction can be configured using an environment variable DTO_CPU_SIZE_FRACTION. 
 2) DTO submits the DSA portion of the job to DSA. 
    If DTO_IS_NUMA_AWARE=1 DTO uses work queues of DSA device located on the same numa node as 
-   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method - buffer centric numa awareness.
+   buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method - buffer-centric numa awareness.
    If DTO_IS_NUMA_AWARE=2 DTO uses work queues of DSA device located on the same numa node as 
-   calling thread cpu - cpu centric numa awareness.
+   calling thread cpu - cpu-centric numa awareness.
 3) In parallel, DTO performs the CPU portion of the job using std library on CPU.
 4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD.
 
@@ -48,7 +48,7 @@ Following environment variables control the behavior of DTO library:
 	DTO_MIN_BYTES=xxxx (specifies minimum size of API call needed for DSA operation execution, default is 8192 bytes)
 	DTO_CPU_SIZE_FRACTION=0.xx (specifies fraction of job performed by CPU, in parallel to DSA). Default is 0.00
 	DTO_AUTO_ADJUST_KNOBS=0/1 (disables/enables auto tuning of cpu_size_fraction and dsa_min_bytes parameters. 0 -- disable, 1 -- enable (default))
-   DTO_IS_NUMA_AWARE=0/1/2 (disables/buffer centric/cpu centric numa awareness. 0 -- disable (default), 1 -- buffer centric, 2 - cpu centric)
+   DTO_IS_NUMA_AWARE=0/1/2 (disables/buffer-centric/cpu-centric numa awareness. 0 -- disable (default), 1 -- buffer-centric, 2 - cpu-centric)
 	DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
 				If not specified, DTO will try to auto-discover and use all available WQs.
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
diff --git a/dto.c b/dto.c
index 3b7bd65..64e4360 100644
--- a/dto.c
+++ b/dto.c
@@ -92,8 +92,8 @@ enum numa_aware {
 
 static const char * const numa_aware_names[] = {
 	[NA_NONE] = "none",
-	[NA_BUFFER_CENTRIC] = "buffer centric",
-	[NA_CPU_CENTRIC] = "cpu centric",
+	[NA_BUFFER_CENTRIC] = "buffer-centric",
+	[NA_CPU_CENTRIC] = "cpu-centric"
 };
 
 // global workqueue variables

From dee2e3db6218a5aafa01c6254e8e3544ca1bddf2 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Tue, 9 Apr 2024 07:40:41 +0200
Subject: [PATCH 10/23] Comment corrected

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dto.c b/dto.c
index 64e4360..09bb7d2 100644
--- a/dto.c
+++ b/dto.c
@@ -1259,7 +1259,7 @@ static __always_inline  struct dto_wq *get_wq(void* buf)
 		buf != NULL) {
 		int status[1] = {-1};
 
-		// get numa node of memory pointed by buf
+		// get the numa node for the target DSA device
 		const int numa_node = get_numa_node(buf);
 		if (numa_node >= 0 && numa_node < MAX_NUMA_NODES) {
 			struct dto_device* dev = devices[numa_node];

From c1a204fc6805b2a26b10d15ec9a7f10523087b3f Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Tue, 9 Apr 2024 13:58:19 +0200
Subject: [PATCH 11/23] 'is_numa_aware:' changed to: 'numa_awareness'

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dto.c b/dto.c
index 09bb7d2..71714b2 100644
--- a/dto.c
+++ b/dto.c
@@ -1220,7 +1220,7 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, is_numa_aware: %s\n",
+				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
 				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware]);
 			for (int i = 0; i < num_wqs; i++)

From a4ce39c79980b5689191c4de5b86ddc9c34238b6 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Tue, 9 Apr 2024 14:23:55 +0200
Subject: [PATCH 12/23] enum DSA_MODE introduced

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/dto.c b/dto.c
index 369c00e..63a0d6a 100644
--- a/dto.c
+++ b/dto.c
@@ -96,6 +96,16 @@ static const char * const numa_aware_names[] = {
 	[NA_CPU_CENTRIC] = "cpu-centric"
 };
 
+enum dsa_mode {
+	DM_SHARED = 0,
+	DM_DEDICATED
+};
+
+static const char * const dsa_mode_names[] = {
+	[DM_SHARED] = "shared",
+	[DM_DEDICATED] = "dedicated"
+};
+
 // global workqueue variables
 static struct dto_wq wqs[MAX_WQS];
 static struct dto_device* devices[MAX_NUMA_NODES];
@@ -252,7 +262,7 @@ static atomic_ullong adjust_num_waits;
 static double min_avg_waits = MIN_AVG_YIELD_WAITS;
 static double max_avg_waits = MAX_AVG_YIELD_WAITS;
 static uint8_t auto_adjust_knobs = 1;
-static uint8_t dsa_mode = 0; // 0: shared, 1: dedicated
+static enum dsa_mode dsa_mode = DM_SHARED; 
 
 extern char *__progname;
 
@@ -470,7 +480,7 @@ static __always_inline int dsa_submit(struct dto_wq *wq,
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
 	__builtin_ia32_sfence();
 	switch (dsa_mode) {
-		case 0: { //shared
+		case DM_SHARED: {
 			int retry;
 			for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
 				retry = enqcmd(hw, wq->wq_portal);
@@ -479,7 +489,7 @@ static __always_inline int dsa_submit(struct dto_wq *wq,
 			}
 			return RETRIES;
 		}
-		case 1: { //dedicated
+		case DM_DEDICATED: {
 			movdir64b(hw, wq->wq_portal);
 			return SUCCESS;
 		} 
@@ -495,7 +505,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
 	__builtin_ia32_sfence();
 	switch (dsa_mode) {
-		case 0: { //shared
+		case DM_SHARED: {
 			int retry;
 			for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
 				retry = enqcmd(hw, wq->wq_portal);
@@ -507,7 +517,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 			if (!do_wait)
 				return RETRIES;
 		}
-		case 1: { //dedicated
+		case DM_DEDICATED: {
 			movdir64b(hw, wq->wq_portal);
 			do_wait = true;
 		}
@@ -848,8 +858,8 @@ static int dsa_init_from_wq_list(char *wq_list)
 			goto fail_wq;			
 		}
 
-		if ((dsa_mode == 0 && strcmp(wq_mode, "shared") != 0) || 
-		    (dsa_mode == 1 && strcmp(wq_mode, "dedicated") != 0)) {
+		if ((dsa_mode == DM_SHARED && strcmp(wq_mode, "shared") != 0) || 
+		    (dsa_mode == DM_DEDICATED && strcmp(wq_mode, "dedicated") != 0)) {
 			continue;
 		}
 
@@ -979,8 +989,8 @@ static int dsa_init_from_accfg(void)
 
 			/* the wq mode should be shared work queue */
 			mode = accfg_wq_get_mode(wq);
-			if ((dsa_mode == 0 && mode == ACCFG_WQ_DEDICATED) ||
-			    (dsa_mode == 1 && mode == ACCFG_WQ_SHARED))
+			if ((dsa_mode == DM_SHARED && mode != ACCFG_WQ_SHARED) ||
+			    (dsa_mode == DM_DEDICATED && mode != ACCFG_WQ_DEDICATED))
 				continue;
 
 			wqs[num_wqs].wq_size = accfg_wq_get_size(wq);
@@ -1246,10 +1256,8 @@ static int init_dto(void)
 			if (env_str != NULL) {
 				errno = 0;
 				dsa_mode = strtoul(env_str, NULL, 10);
-				if (errno)
-					dsa_mode = 0;
-
-				dsa_mode = !!dsa_mode;
+				if (errno || dsa_mode > DM_DEDICATED)
+					dsa_mode = DM_SHARED;
 			}
 
 			if (dsa_init()) {
@@ -1261,7 +1269,7 @@ static int init_dto(void)
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
 				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dsa_mode: %s\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dsa_mode == 0 ? "shared" : "dedicated");
+				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dsa_mode_names[dsa_mode]);
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap);

From 11ccd8c7d020d03f67abfe82b7eceb7730e0fec8 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Tue, 9 Apr 2024 18:57:52 +0200
Subject: [PATCH 13/23] Redundant buf != NULL check removed

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dto.c b/dto.c
index 71714b2..0ca83a3 100644
--- a/dto.c
+++ b/dto.c
@@ -724,6 +724,8 @@ static __always_inline  int get_numa_node(void* buf) {
 				// if (get_mempolicy(&numa_node, NULL, 0, (void *)buf, MPOL_F_NODE | MPOL_F_ADDR) != 0) {
 				// 	LOG_ERROR("get_mempolicy call error: %d - %s", errno, strerror(errno));
 				// }
+			} else {
+				LOG_ERROR("NULL buffer delivered. Unable to detect numa node");
 			}
 		}
 		break;
@@ -1255,8 +1257,7 @@ static __always_inline  struct dto_wq *get_wq(void* buf)
 {
 	struct dto_wq* wq = NULL;
 
-	if (is_numa_aware &&
-		buf != NULL) {
+	if (is_numa_aware) {
 		int status[1] = {-1};
 
 		// get the numa node for the target DSA device

From 8a483cf2ba7e78ba762c9a331e4d65ad5b002298 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Wed, 10 Apr 2024 07:23:39 +0200
Subject: [PATCH 14/23] Whitespace removed

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dto.c b/dto.c
index 0ca83a3..ff37d2f 100644
--- a/dto.c
+++ b/dto.c
@@ -1249,7 +1249,7 @@ static void cleanup_dto(void)
 #endif
 	if (log_fd != -1)
 		close(log_fd);
-	
+
 	cleanup_devices();
 }
 
@@ -1264,17 +1264,17 @@ static __always_inline  struct dto_wq *get_wq(void* buf)
 		const int numa_node = get_numa_node(buf);
 		if (numa_node >= 0 && numa_node < MAX_NUMA_NODES) {
 			struct dto_device* dev = devices[numa_node];
-			if (dev != NULL && 
+			if (dev != NULL &&
 				dev->num_wqs > 0) {
 				wq = dev->wqs[dev->next_wq++ % dev->num_wqs];
-			}			
+			}
 		}
 	}
 
 	if (wq == NULL) {
 		wq = &wqs[next_wq++ % num_wqs];
 	}
-	
+
 	return wq;
 }
 

From fb41ad5f01a4e987660710fe1881e8a7ba45c6a9 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Wed, 10 Apr 2024 19:53:33 +0200
Subject: [PATCH 15/23] More whitespace removed

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/dto.c b/dto.c
index ff37d2f..ea067e7 100644
--- a/dto.c
+++ b/dto.c
@@ -693,9 +693,9 @@ static void correct_devices_list() {
 	ex. SNC4: 8 numa nodes, 2 DSA devices:
 	dsa0 device has numa_node = 0, dsa2 device has numa_node = 4
 	Then we should use dsa0 device for numa_nodes = 0,1,2,3 and dsa2 device for numa_nodes = 4,5,6,7
-	and model the same in devices list.	
- */	
- 	struct dto_device* dev = NULL;
+	and model the same in devices list.
+ */
+	struct dto_device* dev = NULL;
 	for (uint8_t i = 0; i < MAX_NUMA_NODES; i++) {
 		if (devices[i] != NULL) {
 			dev = devices[i];
@@ -707,7 +707,7 @@ static void correct_devices_list() {
 
 static __always_inline  int get_numa_node(void* buf) {
 	int numa_node = -1;
-	
+
 	switch (is_numa_aware) {
         case NA_BUFFER_CENTRIC: {
 			if (buf != NULL) {
@@ -715,7 +715,7 @@ static __always_inline  int get_numa_node(void* buf) {
 
 				// get numa node of memory pointed by buf
 				if (move_pages(0, 1, &buf, NULL, status, 0) == 0) {
-					numa_node = status[0];	
+					numa_node = status[0];
 				} else {
 					LOG_ERROR("move_pages call error: %d - %s", errno, strerror(errno));
 				}
@@ -756,7 +756,7 @@ static void cleanup_devices() {
 		devices[i] = NULL;
 	}
 }
- 
+
 static int dsa_init_from_wq_list(char *wq_list)
 {
 	char *wq;
@@ -820,7 +820,7 @@ static int dsa_init_from_wq_list(char *wq_list)
         if (wq_mode[0] == '\0') {
 			close(dir_fd);
 			rc = -ENOTSUP;
-			goto fail_wq;			
+			goto fail_wq;
 		}
 
 		if (strcmp(wq_mode, "shared") != 0) {
@@ -856,7 +856,7 @@ static int dsa_init_from_wq_list(char *wq_list)
 
 		if (is_numa_aware) {
 			struct dto_device* dev = get_dto_device(dev_numa_node);
-			if (dev != NULL && 
+			if (dev != NULL &&
 				dev->num_wqs < MAX_WQS) {
 				dev->wqs[dev->num_wqs++] = &wqs[num_wqs];
 			}
@@ -911,9 +911,9 @@ static int dsa_init_from_accfg(void)
 	num_wqs = 0;
 
 	accfg_device_foreach(dto_ctx, device) {
-		enum accfg_device_state dstate;		
+		enum accfg_device_state dstate;
 
-    	/* use dsa devices only*/
+		/* use dsa devices only*/
 		if (strncmp(accfg_device_get_devname(device), "dsa", 3)!= 0)
 			continue;
 
@@ -934,7 +934,7 @@ static int dsa_init_from_accfg(void)
 		if (is_numa_aware) {
 			const int dev_numa_node = accfg_device_get_numa_node(device);
 			dev = get_dto_device(dev_numa_node);
-		}	
+		}
 
 		accfg_wq_foreach(device, wq) {
 			enum accfg_wq_state wstate;

From 078093f3bb8810c789e1138bb4b0c55ca6ae8344 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Wed, 10 Apr 2024 20:13:44 +0200
Subject: [PATCH 16/23] Removed unneeded comment

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/dto.c b/dto.c
index ea067e7..7e462cd 100644
--- a/dto.c
+++ b/dto.c
@@ -688,13 +688,6 @@ static struct dto_device* get_dto_device(int dev_numa_node) {
 }
 
 static void correct_devices_list() {
-/* 	Fill NULL gaps in devices list.
-	For SNC configurations there are less DSA devices then numa nodes
-	ex. SNC4: 8 numa nodes, 2 DSA devices:
-	dsa0 device has numa_node = 0, dsa2 device has numa_node = 4
-	Then we should use dsa0 device for numa_nodes = 0,1,2,3 and dsa2 device for numa_nodes = 4,5,6,7
-	and model the same in devices list.
- */
 	struct dto_device* dev = NULL;
 	for (uint8_t i = 0; i < MAX_NUMA_NODES; i++) {
 		if (devices[i] != NULL) {

From 3aaea91cab6871cbe71b09c7051dd97638763d72 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Thu, 11 Apr 2024 08:48:39 +0200
Subject: [PATCH 17/23] Whitespace removed

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/dto.c b/dto.c
index 89c51e9..9db10bc 100644
--- a/dto.c
+++ b/dto.c
@@ -262,7 +262,7 @@ static atomic_ullong adjust_num_waits;
 static double min_avg_waits = MIN_AVG_YIELD_WAITS;
 static double max_avg_waits = MAX_AVG_YIELD_WAITS;
 static uint8_t auto_adjust_knobs = 1;
-static enum dsa_mode dsa_mode = DM_SHARED; 
+static enum dsa_mode dsa_mode = DM_SHARED;
 
 extern char *__progname;
 
@@ -492,7 +492,7 @@ static __always_inline int dsa_submit(struct dto_wq *wq,
 		case DM_DEDICATED: {
 			movdir64b(hw, wq->wq_portal);
 			return SUCCESS;
-		} 
+		}
 	}
 	return FAIL_OTHERS;
 }
@@ -739,7 +739,7 @@ static __always_inline  int get_numa_node(void* buf) {
 	int numa_node = -1;
 
 	switch (is_numa_aware) {
-        case NA_BUFFER_CENTRIC: {
+		case NA_BUFFER_CENTRIC: {
 			if (buf != NULL) {
 				int status[1] = {-1};
 
@@ -769,11 +769,11 @@ static __always_inline  int get_numa_node(void* buf) {
 			}
 		}
 		break;
-        default:
+		default:
 		break;
-        }
+		}
 
-        return numa_node;
+	return numa_node;
 }
 
 static void cleanup_devices() {
@@ -847,14 +847,14 @@ static int dsa_init_from_wq_list(char *wq_list)
 
 		dto_get_param_string(dir_fd, "mode", wq_mode);
 
-        if (wq_mode[0] == '\0') {
+		if (wq_mode[0] == '\0') {
 			close(dir_fd);
 			rc = -ENOTSUP;
 			goto fail_wq;
 		}
 
-		if ((dsa_mode == DM_SHARED && strcmp(wq_mode, "shared") != 0) || 
-		    (dsa_mode == DM_DEDICATED && strcmp(wq_mode, "dedicated") != 0)) {
+		if ((dsa_mode == DM_SHARED && strcmp(wq_mode, "shared") != 0) ||
+			(dsa_mode == DM_DEDICATED && strcmp(wq_mode, "dedicated") != 0)) {
 			continue;
 		}
 
@@ -985,7 +985,7 @@ static int dsa_init_from_accfg(void)
 			/* the wq mode should be shared work queue */
 			mode = accfg_wq_get_mode(wq);
 			if ((dsa_mode == DM_SHARED && mode != ACCFG_WQ_SHARED) ||
-			    (dsa_mode == DM_DEDICATED && mode != ACCFG_WQ_DEDICATED))
+				(dsa_mode == DM_DEDICATED && mode != ACCFG_WQ_DEDICATED))
 				continue;
 
 			wqs[num_wqs].wq_size = accfg_wq_get_size(wq);

From a2340e102ce4546ce511bb825b45624b480d7f6b Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Fri, 12 Apr 2024 11:11:43 +0200
Subject: [PATCH 18/23] Removed unneeded code

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/dto.c b/dto.c
index 9db10bc..7e413e4 100644
--- a/dto.c
+++ b/dto.c
@@ -740,23 +740,19 @@ static __always_inline  int get_numa_node(void* buf) {
 
 	switch (is_numa_aware) {
 		case NA_BUFFER_CENTRIC: {
-			if (buf != NULL) {
-				int status[1] = {-1};
-
-				// get numa node of memory pointed by buf
-				if (move_pages(0, 1, &buf, NULL, status, 0) == 0) {
-					numa_node = status[0];
-				} else {
-					LOG_ERROR("move_pages call error: %d - %s", errno, strerror(errno));
-				}
+			int status[1] = {-1};
 
-				// alternatively get_mempolicy can be used
-				// if (get_mempolicy(&numa_node, NULL, 0, (void *)buf, MPOL_F_NODE | MPOL_F_ADDR) != 0) {
-				// 	LOG_ERROR("get_mempolicy call error: %d - %s", errno, strerror(errno));
-				// }
+			// get numa node of memory pointed by buf
+			if (move_pages(0, 1, &buf, NULL, status, 0) == 0) {
+				numa_node = status[0];
 			} else {
-				LOG_ERROR("NULL buffer delivered. Unable to detect numa node");
+				LOG_ERROR("move_pages call error: %d - %s", errno, strerror(errno));
 			}
+
+			// alternatively get_mempolicy can be used
+			// if (get_mempolicy(&numa_node, NULL, 0, (void *)buf, MPOL_F_NODE | MPOL_F_ADDR) != 0) {
+			// 	LOG_ERROR("get_mempolicy call error: %d - %s", errno, strerror(errno));
+			// }
 		}
 		break;
 		case NA_CPU_CENTRIC: {
@@ -1298,8 +1294,6 @@ static __always_inline  struct dto_wq *get_wq(void* buf)
 	struct dto_wq* wq = NULL;
 
 	if (is_numa_aware) {
-		int status[1] = {-1};
-
 		// get the numa node for the target DSA device
 		const int numa_node = get_numa_node(buf);
 		if (numa_node >= 0 && numa_node < MAX_NUMA_NODES) {

From c8d1ed71c64547e22a7a2eb49f9b82752f7dd0b0 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 29 Apr 2024 12:45:00 +0200
Subject: [PATCH 19/23] Outstanding descriptors updated for dedicated mode

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md |  19 ++++-----
 dto.c     | 125 ++++++++++++++++++++++++++++++------------------------
 2 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/README.md b/README.md
index e22bb52..66fb927 100644
--- a/README.md
+++ b/README.md
@@ -6,8 +6,8 @@ The library intercepts standard memcpy, memmove, memset, and memcmp standard API
 and transparently uses DSA to perform those operations using DSA's memory move, fill, and compare operations. DTO is limited to
 synchronous offload model since these APIs have synchronous semantics.
 
-DTO library works with DSA's Shared Work Queues (SWQs). DTO also works with multiple DSAs and uses them in round robin manner.
-During initialization, DTO library can either auto-discover all configured SWQs (potentially on multile DSAs), or a list of specific SWQs that is 
+DTO library works with DSA's Shared and Dedicated Work Queues (SWQs). DTO also works with multiple DSAs and uses them in round robin manner.
+During initialization, DTO library can either auto-discover all configured SWQs (potentially on multile DSAs), or a list of specific SWQs that is
 specified using an environment variable DTO_WQ_LIST.
 
 DTO library falls back to using standard APIs on CPU under following scenarios:
@@ -18,18 +18,18 @@ DTO library falls back to using standard APIs on CPU under following scenarios:
 
 To improve throughput for synchronous offload, DTO uses "pseudo asynchronous" execution using following steps.
 1) After intercepting the API call, DTO splits the API job into two parts; 1) CPU job and 2) DSA job. For example, a 64 KB memcpy may
-   be split into 20 KB CPU job and 44 KB DSA job. The split fraction can be configured using an environment variable DTO_CPU_SIZE_FRACTION. 
-2) DTO submits the DSA portion of the job to DSA. 
-   If DTO_IS_NUMA_AWARE=1 DTO uses work queues of DSA device located on the same numa node as 
+   be split into 20 KB CPU job and 44 KB DSA job. The split fraction can be configured using an environment variable DTO_CPU_SIZE_FRACTION.
+2) DTO submits the DSA portion of the job to DSA.
+   If DTO_IS_NUMA_AWARE=1 DTO uses work queues of DSA device located on the same numa node as
    buffer (memcpy/memmove - dest buffer, memcmp - ptr2) delivered to method - buffer-centric numa awareness.
-   If DTO_IS_NUMA_AWARE=2 DTO uses work queues of DSA device located on the same numa node as 
+   If DTO_IS_NUMA_AWARE=2 DTO uses work queues of DSA device located on the same numa node as
    calling thread cpu - cpu-centric numa awareness.
 3) In parallel, DTO performs the CPU portion of the job using std library on CPU.
 4) DTO waits for DSA to complete (if it hasn't completed already). The wait method can be configured using an environment variable DTO_WAIT_METHOD.
 
 DTO also implements a heuristic to auto tune dsa_min_bytes and cpu_size_fraction parameters based on current DSA load. For example, if DSA is heavily loaded,
 DTO tries to reduce the DSA load by increasing cpu_size_fraction and dsa_min_bytes. Conversely, if DSA is lightly loaded, DTO tries to increase the DSA load by
-decreasing cpu_size_fraction and dsa_min_bytes. The goal of the heuristic is to minimize the wait time in step 4 above while maximizing throughput. The auto-tuning 
+decreasing cpu_size_fraction and dsa_min_bytes. The goal of the heuristic is to minimize the wait time in step 4 above while maximizing throughput. The auto-tuning
 can be enabled or disabled using an environment variable DTO_AUTO_ADJUST_KNOBS.
 
 DTO can also be used to learn certain application characterics by building histogram of various API types and sizes. The histogram can be built using an environment variable DTO_COLLECT_STATS.
@@ -48,8 +48,7 @@ Following environment variables control the behavior of DTO library:
 	DTO_MIN_BYTES=xxxx (specifies minimum size of API call needed for DSA operation execution, default is 8192 bytes)
 	DTO_CPU_SIZE_FRACTION=0.xx (specifies fraction of job performed by CPU, in parallel to DSA). Default is 0.00
 	DTO_AUTO_ADJUST_KNOBS=0/1 (disables/enables auto tuning of cpu_size_fraction and dsa_min_bytes parameters. 0 -- disable, 1 -- enable (default))
-  DTO_IS_NUMA_AWARE=0/1/2 (disables/buffer-centric/cpu-centric numa awareness. 0 -- disable (default), 1 -- buffer-centric, 2 - cpu-centric)
-  DTO_DSA_MODE=0/1 (shared/dedicated uses shared/dedicated DSA mode, 0 -- shared (default), 1 -- dedicated)
+   DTO_IS_NUMA_AWARE=0/1/2 (disables/buffer-centric/cpu-centric numa awareness. 0 -- disable (default), 1 -- buffer-centric, 2 - cpu-centric)
 	DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
 				If not specified, DTO will try to auto-discover and use all available WQs.
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
@@ -139,7 +138,7 @@ Byte Range        -- set      cpy      mov      cmp      bytes        set      c
    >=2093056      -- 0        1        0        0        1975911      0        0        0        0        0            0        1        0        0        973209       0      1      0
 
 ******** Average Memory Operation Latency (us)  ********
-                     <******** stdc calls    ********> <******** dsa (success) ********> <******** dsa (failed)  ********> 
+                     <******** stdc calls    ********> <******** dsa (success) ********> <******** dsa (failed)  ********>
 Byte Range        -- set      cpy      mov      cmp      set      cpy      mov      cmp      set      cpy      mov      cmp
        0-4095     -- 0.01     0.02     0.01     0.04     0        0        0        0        0        0        0        0
     4096-8191     -- 0.07     0.42     0.47     0        0        0        0        0        0        0        0        0
diff --git a/dto.c b/dto.c
index 7e413e4..53d3eca 100644
--- a/dto.c
+++ b/dto.c
@@ -23,6 +23,7 @@
 #include <accel-config/libaccel_config.h>
 #include <numaif.h>
 #include <numa.h>
+#include <linux/limits.h>
 
 #define likely(x)       __builtin_expect((x), 1)
 #define unlikely(x)     __builtin_expect((x), 0)
@@ -69,6 +70,8 @@ struct dto_wq {
 	uint32_t max_transfer_size;
 	int wq_fd;
 	void *wq_portal;
+	enum accfg_wq_mode wq_mode;
+	atomic_int dwq_desc_outstanding;
 };
 
 struct dto_device {
@@ -96,14 +99,10 @@ static const char * const numa_aware_names[] = {
 	[NA_CPU_CENTRIC] = "cpu-centric"
 };
 
-enum dsa_mode {
-	DM_SHARED = 0,
-	DM_DEDICATED
-};
-
-static const char * const dsa_mode_names[] = {
-	[DM_SHARED] = "shared",
-	[DM_DEDICATED] = "dedicated"
+static const char * const wq_mode_names[] = {
+	[ACCFG_WQ_SHARED] = "shared",
+	[ACCFG_WQ_DEDICATED] = "dedicated",
+	[ACCFG_WQ_MODE_UNKNOWN] = "unknown"
 };
 
 // global workqueue variables
@@ -262,7 +261,6 @@ static atomic_ullong adjust_num_waits;
 static double min_avg_waits = MIN_AVG_YIELD_WAITS;
 static double max_avg_waits = MAX_AVG_YIELD_WAITS;
 static uint8_t auto_adjust_knobs = 1;
-static enum dsa_mode dsa_mode = DM_SHARED;
 
 extern char *__progname;
 
@@ -463,6 +461,10 @@ static __always_inline int dsa_wait(struct dto_wq *wq,
 	else
 		dsa_wait_no_adjust(comp);
 
+	if (wq->wq_mode == ACCFG_WQ_DEDICATED) {
+		wq->dwq_desc_outstanding--;
+	}
+
 	if (likely(*comp == DSA_COMP_SUCCESS)) {
 		thr_bytes_completed += hw->xfer_size;
 		return SUCCESS;
@@ -478,57 +480,74 @@ static __always_inline int dsa_submit(struct dto_wq *wq,
 	struct dsa_hw_desc *hw)
 {
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
-	__builtin_ia32_sfence();
-	switch (dsa_mode) {
-		case DM_SHARED: {
-			int retry;
-			for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+	int retry = 0;
+	for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+		switch (wq->wq_mode) {
+			case ACCFG_WQ_SHARED: {
+				__builtin_ia32_sfence();
 				retry = enqcmd(hw, wq->wq_portal);
-				if (!retry)
-					return SUCCESS;
+				break;
+			}
+			case ACCFG_WQ_DEDICATED: {
+				int old_dwq_desc_outstanding = wq->dwq_desc_outstanding;
+				/* for dedicated wq, the atomic_compare_exchange_strong provides the required store fence */
+				if (old_dwq_desc_outstanding < wq->wq_size &&
+					atomic_compare_exchange_strong(&wq->dwq_desc_outstanding, &old_dwq_desc_outstanding, old_dwq_desc_outstanding + 1)) {
+					movdir64b(hw, wq->wq_portal);
+					retry = 0;
+				} else {
+					retry = 1;
+				}
+				break;
 			}
-			return RETRIES;
 		}
-		case DM_DEDICATED: {
-			movdir64b(hw, wq->wq_portal);
+
+		if (!retry)
 			return SUCCESS;
-		}
 	}
-	return FAIL_OTHERS;
+
+	return RETRIES;
 }
 
 static __always_inline int dsa_execute(struct dto_wq *wq,
 	struct dsa_hw_desc *hw, volatile uint8_t *comp)
 {
-	*comp = 0;
-	bool do_wait = false;
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
-	__builtin_ia32_sfence();
-	switch (dsa_mode) {
-		case DM_SHARED: {
-			int retry;
-			for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+	*comp = 0;
+
+	int retry = 0;
+	for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+		switch (wq->wq_mode) {
+			case ACCFG_WQ_SHARED: {
+				__builtin_ia32_sfence();
 				retry = enqcmd(hw, wq->wq_portal);
-				if (!retry) {
-					do_wait = true;
-					break;
+				break;
+			}
+			case ACCFG_WQ_DEDICATED: {
+				int old_dwq_desc_outstanding = wq->dwq_desc_outstanding;
+				/* for dedicated wq, the atomic_compare_exchange_strong provides the required store fence */
+				if (old_dwq_desc_outstanding < wq->wq_size &&
+					atomic_compare_exchange_strong(&wq->dwq_desc_outstanding, &old_dwq_desc_outstanding, old_dwq_desc_outstanding + 1)) {
+					movdir64b(hw, wq->wq_portal);
+					retry = 0;
+				} else {
+					retry = 1;
 				}
+				break;
 			}
-			if (!do_wait)
-				return RETRIES;
-		}
-		case DM_DEDICATED: {
-			movdir64b(hw, wq->wq_portal);
-			do_wait = true;
 		}
 	}
 
-	if (do_wait) {
+	if (!retry) {
 		if (auto_adjust_knobs)
 			dsa_wait_and_adjust(comp);
 		else
 			dsa_wait_no_adjust(comp);
 
+		if (wq->wq_mode == ACCFG_WQ_DEDICATED) {
+			wq->dwq_desc_outstanding--;
+		}
+
 		if (*comp == DSA_COMP_SUCCESS) {
 			thr_bytes_completed += hw->xfer_size;
 			return SUCCESS;
@@ -540,7 +559,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 		return FAIL_OTHERS;
 	}
 
-	return FAIL_OTHERS;
+	return RETRIES;
 }
 
 #ifdef DTO_STATS_SUPPORT
@@ -849,8 +868,11 @@ static int dsa_init_from_wq_list(char *wq_list)
 			goto fail_wq;
 		}
 
-		if ((dsa_mode == DM_SHARED && strcmp(wq_mode, "shared") != 0) ||
-			(dsa_mode == DM_DEDICATED && strcmp(wq_mode, "dedicated") != 0)) {
+		if (strcmp(wq_mode, "shared") == 0) {
+			wqs[num_wqs].wq_mode = ACCFG_WQ_SHARED;
+		} else if (strcmp(wq_mode, "dedicated") == 0) {
+			wqs[num_wqs].wq_mode = ACCFG_WQ_DEDICATED;
+		} else {
 			continue;
 		}
 
@@ -980,10 +1002,11 @@ static int dsa_init_from_accfg(void)
 
 			/* the wq mode should be shared work queue */
 			mode = accfg_wq_get_mode(wq);
-			if ((dsa_mode == DM_SHARED && mode != ACCFG_WQ_SHARED) ||
-				(dsa_mode == DM_DEDICATED && mode != ACCFG_WQ_DEDICATED))
+			if (mode == ACCFG_WQ_MODE_UNKNOWN) {
 				continue;
+			}
 
+			wqs[num_wqs].wq_mode = mode;
 			wqs[num_wqs].wq_size = accfg_wq_get_size(wq);
 			wqs[num_wqs].max_transfer_size = accfg_wq_get_max_transfer_size(wq);
 
@@ -1243,14 +1266,6 @@ static int init_dto(void)
 				}
 			}
 
-			env_str = getenv("DTO_DSA_MODE");
-			if (env_str != NULL) {
-				errno = 0;
-				dsa_mode = strtoul(env_str, NULL, 10);
-				if (errno || dsa_mode > DM_DEDICATED)
-					dsa_mode = DM_SHARED;
-			}
-
 			if (dsa_init()) {
 				LOG_ERROR("Didn't find any usable DSAs. Falling back to using CPUs.\n");
 				use_std_lib_calls = 1;
@@ -1258,12 +1273,12 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s, dsa_mode: %s\n",
+				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware], dsa_mode_names[dsa_mode]);
+				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware]);
 			for (int i = 0; i < num_wqs; i++)
-				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, dsa_cap: %lx\n", i,
-					wqs[i].wq_path, wqs[i].wq_size, wqs[i].dsa_gencap);
+				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, wq_mode: %s, dsa_cap: %lx\n", i,
+					wqs[i].wq_path, wqs[i].wq_size, wq_mode_names[wqs[i].wq_mode], wqs[i].dsa_gencap);
 		}
 		dto_initialized = 1;
 

From d17120e668365c2287e532047c756a14647bb322 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 29 Apr 2024 14:22:53 +0200
Subject: [PATCH 20/23] Added:    DTO_DSA_MEMCPY=0/1, 1 (default) - DTO uses
 DSA to process memcpy, 0 - DTO uses system memcpy    DTO_DSA_MEMMOVE=0/1, 1
 (default) - DTO uses DSA to process memmove, 0 - DTO uses system memmove   
 DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO uses
 system memset    DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process
 memcmp, 0 - DTO uses system memcmp    DTO_ENQCMD_MAX_RETRIES=xxxx defines
 maximal number of retries for enqueuing command into DSA queue, default is 3   
 DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible
 value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000

---
 README.md |  6 ++++
 dto.c     | 92 +++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 66fb927..d781fa4 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,12 @@ Following environment variables control the behavior of DTO library:
    DTO_IS_NUMA_AWARE=0/1/2 (disables/buffer-centric/cpu-centric numa awareness. 0 -- disable (default), 1 -- buffer-centric, 2 - cpu-centric)
 	DTO_WQ_LIST="semi-colon(;) separated list of DSA WQs to use". The WQ names should match their names in /dev/dsa/ directory (see example below).
 				If not specified, DTO will try to auto-discover and use all available WQs.
+   DTO_DSA_MEMCPY=0/1, 1 (default) - DTO uses DSA to process memcpy, 0 - DTO uses system memcpy
+   DTO_DSA_MEMMOVE=0/1, 1 (default) - DTO uses DSA to process memmove, 0 - DTO uses system memmove
+   DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO use system memset
+   DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO use system memcmp
+   DTO_ENQCMD_MAX_RETRIES - defines maximal number of retries for enquing command into DSA queue, default is 3
+   DTO_UMWAIT_DELAY - defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
 	DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0).
 ```
diff --git a/dto.c b/dto.c
index 53d3eca..54cdec6 100644
--- a/dto.c
+++ b/dto.c
@@ -13,6 +13,7 @@
 #include <sys/mman.h>
 #include <cpuid.h>
 #include <linux/idxd.h>
+#include <sys/types.h>
 #include <x86intrin.h>
 #include <sched.h>
 #include <sys/stat.h>
@@ -31,13 +32,13 @@
 // DSA capabilities
 #define GENCAP_CC_MEMORY  0x4
 
-#define ENQCMD_MAX_RETRIES 3
+#define ENQCMD_MAX_RETRIES_DEFAULT 3
 
-#define UMWAIT_DELAY 100000
+#define UMWAIT_DELAY_DEFAULT 100000
 /* C0.1 state */
 #define UMWAIT_STATE 1
 
-#define USE_ORIG_FUNC(n) (use_std_lib_calls == 1 || n < dsa_min_size)
+#define USE_ORIG_FUNC(n, use_dsa) (use_std_lib_calls == 1 || !use_dsa || n < dsa_min_size)
 #define TS_NS(s, e) (((e.tv_sec*1000000000) + e.tv_nsec) - ((s.tv_sec*1000000000) + s.tv_nsec))
 
 /* Maximum WQs that DTO will use. It is rather an arbitrary limit
@@ -118,6 +119,14 @@ static size_t dsa_min_size = DTO_DEFAULT_MIN_SIZE;
 static int wait_method = WAIT_YIELD;
 static double cpu_size_fraction;
 
+static uint8_t dto_dsa_memcpy = 1;
+static uint8_t dto_dsa_memmove = 1;
+static uint8_t dto_dsa_memset = 1;
+static uint8_t dto_dsa_memcmp = 1;
+
+static uint16_t dto_enqcmd_max_retries = ENQCMD_MAX_RETRIES_DEFAULT;
+static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;
+
 static uint8_t fork_handler_registered;
 
 enum memop {
@@ -361,7 +370,7 @@ static __always_inline void __dsa_wait_umwait(const volatile uint8_t *comp)
 	// Hardware never writes 0 to this field. Software should initialize this field to 0
 	// so it can detect when the completion record has been written
 	if (*comp == 0) {
-		uint64_t delay = __rdtsc() + UMWAIT_DELAY;
+		uint64_t delay = __rdtsc() + dto_umwait_delay;
 
 		umwait(delay, UMWAIT_STATE);
 	}
@@ -481,7 +490,7 @@ static __always_inline int dsa_submit(struct dto_wq *wq,
 {
 	//LOG_TRACE("desc flags: 0x%x, opcode: 0x%x\n", hw->flags, hw->opcode);
 	int retry = 0;
-	for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+	for (int r = 0; r < dto_enqcmd_max_retries; ++r) {
 		switch (wq->wq_mode) {
 			case ACCFG_WQ_SHARED: {
 				__builtin_ia32_sfence();
@@ -516,7 +525,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 	*comp = 0;
 
 	int retry = 0;
-	for (int r = 0; r < ENQCMD_MAX_RETRIES; ++r) {
+	for (int r = 0; r < dto_enqcmd_max_retries && (r == 0 || retry); ++r) { /* stop once the descriptor is accepted; looping again would submit it a second time */
 		switch (wq->wq_mode) {
 			case ACCFG_WQ_SHARED: {
 				__builtin_ia32_sfence();
@@ -1179,6 +1188,47 @@ static int init_dto(void)
 			use_std_lib_calls = !!use_std_lib_calls;
 		}
 
+		env_str = getenv("DTO_DSA_MEMCPY");
+		if (env_str != NULL) {
+			errno = 0;
+			dto_dsa_memcpy = strtoul(env_str, NULL, 10);
+			if (errno)
+				dto_dsa_memcpy = 0;
+
+			dto_dsa_memcpy = !!dto_dsa_memcpy;
+		}
+
+		env_str = getenv("DTO_DSA_MEMMOVE");
+		if (env_str != NULL) {
+			errno = 0;
+			dto_dsa_memmove = strtoul(env_str, NULL, 10);
+			if (errno)
+				dto_dsa_memmove = 0;
+
+			dto_dsa_memmove = !!dto_dsa_memmove;
+		}
+
+		env_str = getenv("DTO_DSA_MEMSET");
+		if (env_str != NULL) {
+			errno = 0;
+			dto_dsa_memset = strtoul(env_str, NULL, 10);
+			if (errno)
+				dto_dsa_memset = 0;
+
+			dto_dsa_memset = !!dto_dsa_memset;
+		}
+
+		env_str = getenv("DTO_DSA_MEMCMP");
+		if (env_str != NULL) {
+			errno = 0;
+			dto_dsa_memcmp = strtoul(env_str, NULL, 10);
+			if (errno)
+				dto_dsa_memcmp = 0;
+
+			dto_dsa_memcmp = !!dto_dsa_memcmp;
+		}
+
+
 #ifdef DTO_STATS_SUPPORT
 		env_str = getenv("DTO_COLLECT_STATS");
 		if (env_str != NULL) {
@@ -1266,6 +1316,24 @@ static int init_dto(void)
 				}
 			}
 
+			env_str = getenv("DTO_ENQCMD_MAX_RETRIES");
+
+			if (env_str != NULL) {
+				errno = 0;
+				dto_enqcmd_max_retries = strtoul(env_str, NULL, 10);
+				if (errno || dto_enqcmd_max_retries == 0)
+					dto_enqcmd_max_retries = ENQCMD_MAX_RETRIES_DEFAULT;
+			}
+
+			env_str = getenv("DTO_UMWAIT_DELAY");
+
+			if (env_str != NULL) {
+				errno = 0;
+				dto_umwait_delay = strtoul(env_str, NULL, 10);
+				if (errno || dto_umwait_delay == 0)
+					dto_umwait_delay = UMWAIT_DELAY_DEFAULT;
+			}
+
 			if (dsa_init()) {
 				LOG_ERROR("Didn't find any usable DSAs. Falling back to using CPUs.\n");
 				use_std_lib_calls = 1;
@@ -1273,9 +1341,9 @@ static int init_dto(void)
 
 			// display configuration
 			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, numa_awareness: %s\n",
+				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, dto_enqcmd_max_retries: %d, dto_umwait_delay=%d, numa_awareness: %s, dto_dsa_memcpy=%d, dto_dsa_memmove=%d, dto_dsa_memset=%d, dto_dsa_memcmp=%d\n",
 				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, numa_aware_names[is_numa_aware]);
+				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, dto_enqcmd_max_retries, dto_umwait_delay, numa_aware_names[is_numa_aware], dto_dsa_memcpy, dto_dsa_memmove, dto_dsa_memset, dto_dsa_memcmp);
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, wq_mode: %s, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wq_mode_names[wqs[i].wq_mode], wqs[i].dsa_gencap);
@@ -1600,7 +1668,7 @@ void *memset(void *s1, int c, size_t n)
 {
 	int result = 0;
 	void *ret = s1;
-	int use_orig_func = USE_ORIG_FUNC(n);
+	int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memset);
 #ifdef DTO_STATS_SUPPORT
 	struct timespec st, et;
 	size_t orig_n = n;
@@ -1650,7 +1718,7 @@ void *memcpy(void *dest, const void *src, size_t n)
 {
 	int result = 0;
 	void *ret = dest;
-	int use_orig_func = USE_ORIG_FUNC(n);
+	int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memcpy);
 #ifdef DTO_STATS_SUPPORT
 	struct timespec st, et;
 	size_t orig_n = n;
@@ -1703,7 +1771,7 @@ void *memmove(void *dest, const void *src, size_t n)
 {
 	int result = 0;
 	void *ret = dest;
-	int use_orig_func = USE_ORIG_FUNC(n);
+	int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove);
 #ifdef DTO_STATS_SUPPORT
 	struct timespec st, et;
 	size_t orig_n = n;
@@ -1756,7 +1824,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
 {
 	int result = 0;
 	int ret;
-	int use_orig_func = USE_ORIG_FUNC(n);
+	int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memcmp);
 #ifdef DTO_STATS_SUPPORT
 	struct timespec st, et;
 	size_t orig_n = n;

From 817e678bcd0fc2afab4699ac4e6b338f5cff0370 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 29 Apr 2024 14:25:36 +0200
Subject: [PATCH 21/23] Updated description

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d781fa4..3b10404 100644
--- a/README.md
+++ b/README.md
@@ -55,8 +55,8 @@ Following environment variables control the behavior of DTO library:
    DTO_DSA_MEMMOVE=0/1, 1 (default) - DTO uses DSA to process memmove, 0 - DTO uses system memmove
    DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO use system memset
    DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO use system memcmp
-   DTO_ENQCMD_MAX_RETRIES - defines maximal number of retries for enquing command into DSA queue, default is 3
-   DTO_UMWAIT_DELAY - defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
+   DTO_ENQCMD_MAX_RETRIES=xxxx defines maximal number of retries for enqueuing command into DSA queue, default is 3
+   DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
 	DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0).
 ```

From 1130de009130f153a3c6426473380212fa33a441 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 29 Apr 2024 15:23:23 +0200
Subject: [PATCH 22/23] LOG_TRACE formatted

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/dto.c b/dto.c
index 54cdec6..2e9bcb0 100644
--- a/dto.c
+++ b/dto.c
@@ -1340,10 +1340,12 @@ static int init_dto(void)
 			}
 
 			// display configuration
-			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, "
-				"cpu_size_fraction: %.2f, wait_method: %s, auto_adjust_knobs: %d, dto_enqcmd_max_retries: %d, dto_umwait_delay=%d, numa_awareness: %s, dto_dsa_memcpy=%d, dto_dsa_memmove=%d, dto_dsa_memset=%d, dto_dsa_memcmp=%d\n",
-				log_level, collect_stats, use_std_lib_calls, dsa_min_size,
-				cpu_size_fraction, wait_names[wait_method], auto_adjust_knobs, dto_enqcmd_max_retries, dto_umwait_delay, numa_aware_names[is_numa_aware], dto_dsa_memcpy, dto_dsa_memmove, dto_dsa_memset, dto_dsa_memcmp);
+			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, cpu_size_fraction: %.2f, \
+							wait_method: %s, auto_adjust_knobs: %d, dto_enqcmd_max_retries: %d, dto_umwait_delay=%d, numa_awareness: %s, \
+							dto_dsa_memcpy=%d, dto_dsa_memmove=%d, dto_dsa_memset=%d, dto_dsa_memcmp=%d\n",
+							log_level, collect_stats, use_std_lib_calls, dsa_min_size, cpu_size_fraction, wait_names[wait_method],
+							auto_adjust_knobs, dto_enqcmd_max_retries, dto_umwait_delay, numa_aware_names[is_numa_aware],
+							dto_dsa_memcpy, dto_dsa_memmove, dto_dsa_memset, dto_dsa_memcmp);
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, wq_mode: %s, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wq_mode_names[wqs[i].wq_mode], wqs[i].dsa_gencap);

From d48a129aeb31ece14dec3a3f1fa4af99fee55256 Mon Sep 17 00:00:00 2001
From: Grzegorz Rys <grzegorz.rys@intel.com>
Date: Mon, 29 Apr 2024 15:26:49 +0200
Subject: [PATCH 23/23] LOG_TRACE formatted more

Signed-off-by: Grzegorz Rys <grzegorz.rys@intel.com>
---
 dto.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dto.c b/dto.c
index 2e9bcb0..d3b3ac4 100644
--- a/dto.c
+++ b/dto.c
@@ -1340,12 +1340,12 @@ static int init_dto(void)
 			}
 
 			// display configuration
-			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, cpu_size_fraction: %.2f, \
-							wait_method: %s, auto_adjust_knobs: %d, dto_enqcmd_max_retries: %d, dto_umwait_delay=%d, numa_awareness: %s, \
-							dto_dsa_memcpy=%d, dto_dsa_memmove=%d, dto_dsa_memset=%d, dto_dsa_memcmp=%d\n",
-							log_level, collect_stats, use_std_lib_calls, dsa_min_size, cpu_size_fraction, wait_names[wait_method],
-							auto_adjust_knobs, dto_enqcmd_max_retries, dto_umwait_delay, numa_aware_names[is_numa_aware],
-							dto_dsa_memcpy, dto_dsa_memmove, dto_dsa_memset, dto_dsa_memcmp);
+			LOG_TRACE("log_level: %d, collect_stats: %d, use_std_lib_calls: %d, dsa_min_size: %lu, cpu_size_fraction: %.2f, "
+						"wait_method: %s, auto_adjust_knobs: %d, dto_enqcmd_max_retries: %d, dto_umwait_delay=%lu, numa_awareness: %s, "
+						"dto_dsa_memcpy=%d, dto_dsa_memmove=%d, dto_dsa_memset=%d, dto_dsa_memcmp=%d\n",
+						log_level, collect_stats, use_std_lib_calls, dsa_min_size, cpu_size_fraction,
+						wait_names[wait_method], auto_adjust_knobs, dto_enqcmd_max_retries, dto_umwait_delay /* unsigned long, hence %lu */, numa_aware_names[is_numa_aware],
+						dto_dsa_memcpy, dto_dsa_memmove, dto_dsa_memset, dto_dsa_memcmp);
 			for (int i = 0; i < num_wqs; i++)
 				LOG_TRACE("[%d] wq_path: %s, wq_size: %d, wq_mode: %s, dsa_cap: %lx\n", i,
 					wqs[i].wq_path, wqs[i].wq_size, wq_mode_names[wqs[i].wq_mode], wqs[i].dsa_gencap);