 #include <vector>
 #include <cstring>
 #include <cstdlib>
+#include <cstdint>

 using std::vector;

@@ -34,6 +35,12 @@ using std::vector;
 #include <omp.h>
 #endif

+#if NUMA_AWARE
+#include <sys/mman.h>
+#include <unistd.h>
+#include <numaif.h>
+#include <numa.h>
+#endif // NUMA_AWARE


 /*
@@ -105,23 +112,69 @@ int cpu_getCurrentNumThreads() {
  * MEMORY ALLOCATION
  */

+#if NUMA_AWARE
+unsigned long get_page_size() {
+    static unsigned long page_size = 0;
+    if (!page_size) {
+        page_size = sysconf(_SC_PAGESIZE);
+        if (page_size == ~0UL) {
+            perror("Failed to get the page size");
+        }
+    }
+    return page_size;
+}
+
+unsigned long get_numa_nodes() {
+    static int n_nodes = 0;
+    if (!n_nodes) {
+        n_nodes = numa_num_configured_nodes();
+        if (n_nodes < 1) {
+            perror("Failed to get the numa node count");
+        }
+    }
+    return n_nodes;
+}
+#endif

 qcomp* cpu_allocArray(qindex length) {
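+    // calloc (rather than malloc) so the allocation fails immediately if memory
+    // is unavailable; caller must handle a nullptr result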
+    return (qcomp*) calloc(length, sizeof(qcomp));
+}

-    /// @todo
-    /// here, we calloc the entire array in a serial setting, rather than one malloc
-    /// followed by threads subsequently memset'ing their own partitions. The latter
-    /// approach would distribute the array pages across NUMA nodes, accelerating
-    /// their subsequent access by the same threads (via NUMA's first-touch policy).
-    /// We have so far foregone this optimisation since a thread's memory-access pattern
-    /// in many of the QuEST functions is non-trivial, and likely to be inconsistent
-    /// with the memset pattern. As such, I expect the benefit is totally occluded
-    /// and only introduces potential new bugs - but this should be tested and confirmed!
-
-    // we call calloc over malloc in order to fail immediately if mem isn't available;
-    // caller must handle nullptr result

-    return (qcomp*) calloc(length, sizeof(qcomp));
+qcomp* cpu_allocNumaArray(qindex length) {
+#if !NUMA_AWARE
+    return cpu_allocArray(length);
+#else
+    unsigned long page_size = get_page_size();
+    int n_nodes = get_numa_nodes();
+
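+    // round the allocation up to a whole number of pages, since mmap and mbind
+    // operate on page-granular regions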
+    qindex size = length * sizeof(qcomp);
+    int pages = (size + page_size - 1) / page_size;
+    void *addr = mmap(NULL, pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+    if (addr == MAP_FAILED) {
+        // mmap reports failure via MAP_FAILED rather than nullptr; normalise so callers can check for nullptr
+        return nullptr;
+    }
+    if (n_nodes == 1) {
+        return reinterpret_cast<qcomp*>(addr);
+    }
+
+    // distribution strategy: give every node floor_pages pages, then spread the
+    // remaining spread_pages pages across the nodes as evenly as possible
+    int floor_pages = pages / n_nodes;
+    int spread_pages = pages % n_nodes;
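+    // e.g. 10 pages across 4 nodes gives floor_pages = 2 and spread_pages = 2;
+    // the loop below then assigns 2, 3, 2, 3 pages to nodes 0..3 respectively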
+
+    uintptr_t pos = (uintptr_t) addr;
+    for (int node = 0, shift = n_nodes; node < n_nodes; ++node) {
+        shift -= spread_pages;
+        int node_pages = floor_pages + (shift <= 0);
+
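+        // bind this contiguous range of pages to the current node only
+        // (MPOL_BIND with a single-bit node mask)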
+        unsigned long node_mask = 1UL << node;
+        mbind((void *)pos, node_pages * page_size, MPOL_BIND, &node_mask, sizeof(node_mask) * 8, 0);
+
+        pos += node_pages * page_size;
+        if (shift <= 0) {
+            shift += n_nodes;
+        }
+    }
+
+    return reinterpret_cast<qcomp*>(addr);
+#endif // NUMA_AWARE
 }


@@ -132,6 +185,23 @@ void cpu_deallocArray(qcomp* arr) {
 }


+void cpu_deallocNumaArray(qcomp* arr, qindex length) {
+    if (arr == nullptr) {
+        return;
+    }
+
+#if !NUMA_AWARE
+    return cpu_deallocArray(arr);
+#else
+    unsigned long page_size = get_page_size();
+    qindex size = length * sizeof(qcomp);
+    int pages = (size + page_size - 1) / page_size;
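+    // length must match the length originally passed to cpu_allocNumaArray, so the
+    // recomputed page count covers the full mapping handed to munmap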
+
+    munmap(arr, pages * page_size);
+#endif // NUMA_AWARE
+}
+
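+// usage sketch (illustrative only): pair the NUMA-aware allocator and deallocator
+// with the same length, e.g.
+//     qcomp* amps = cpu_allocNumaArray(numAmps);   // numAmps is a hypothetical caller variable
+//     if (amps == nullptr) { /* caller handles allocation failure */ }
+//     ...
+//     cpu_deallocNumaArray(amps, numAmps);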
+
 qcomp** cpu_allocAndInitMatrixWrapper(qcomp* arr, qindex dim) {

     // do not allocate if arr alloc failed (caller will handle)