Skip to content

Commit 0eb96f9

Browse files
committed
apply recent changes
Signed-off-by: Alexander Myskov <[email protected]>
2 parents 6682d8d + 0b1f74e commit 0eb96f9

File tree

27 files changed

+769
-431
lines changed

27 files changed

+769
-431
lines changed

AI-and-Analytics/Getting-Started-Samples/IntelModin_GettingStarted/IntelModin_GettingStarted.ipynb

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,60 @@
4747
"import time"
4848
]
4949
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"# ****** Do not change the code in this cell! It verifies that the notebook is being run correctly! ******\n",
57+
"\n",
58+
"def verify_and_print_times(pandas_time, modin_time):\n",
59+
" if modin_time < pandas_time:\n",
60+
" print(f\"Modin was {pandas_time / modin_time:.2f}X faster than stock pandas!\")\n",
61+
" return\n",
62+
" print(\n",
63+
" f\"Oops, stock pandas appears to be {modin_time / pandas_time:.2f}X faster than Modin in this case. \"\n",
64+
" \"This is unlikely but could happen sometimes on certain machines/environments/datasets. \"\n",
65+
" \"One of the most probable reasons is the excessive amount of partitions being assigned to a single worker. \"\n",
66+
" \"You may visit Modin's optimization guide in order to learn more about such cases and how to fix them: \"\n",
67+
" \"\\nhttps://modin.readthedocs.io/en/latest/usage_guide/optimization_notes/index.html\\n\\n\"\n",
68+
" \"But first, verify that you're using the latest Modin version, also, try to use different executions, \"\n",
69+
" \"for basic usage we recommend non-experimental 'PandasOnRay'.\\n\"\n",
70+
" \"Current configuration is:\"\n",
71+
" )\n",
72+
" try:\n",
73+
" from modin.utils import get_current_execution\n",
74+
"\n",
75+
" execution = get_current_execution()\n",
76+
" except ImportError:\n",
77+
" # for modin version < 0.12.0\n",
78+
" try:\n",
79+
" from modin.utils import get_current_backend\n",
80+
"\n",
81+
" execution = get_current_backend()\n",
82+
" except ImportError:\n",
83+
" # for modin versions < 0.8.1\n",
84+
" execution = (\n",
85+
" \"Can't deduce the current execution, your Modin version is too old!\"\n",
86+
" )\n",
87+
" print(f\"\\tExecution: {execution}\")\n",
88+
" try:\n",
89+
" import modin.config as cfg\n",
90+
"\n",
91+
" print(\n",
92+
" f\"\\tIs experimental: {cfg.IsExperimental.get()}\\n\"\n",
93+
" f\"\\tNumber of CPUs to utilize by Modin (check that Modin uses all CPUs on your machine): {cfg.CpuCount.get()}\\n\"\n",
94+
" f\"\\tIs in debug mode (debug mode may perform slower): {cfg.IsDebug.get()}\"\n",
95+
" )\n",
96+
" except (ImportError, AttributeError):\n",
97+
" # for modin versions < 0.8.2\n",
98+
" print(\"\\tCan't deduce Modin configuration, your Modin version is too old!\")\n",
99+
" import modin\n",
100+
"\n",
101+
" print(f\"\\tModin version: {modin.__version__}\")"
102+
]
103+
},
50104
{
51105
"cell_type": "markdown",
52106
"metadata": {},
@@ -58,7 +112,7 @@
58112
"cell_type": "markdown",
59113
"metadata": {},
60114
"source": [
61-
"We will also be importing **stock Pandas** and **Modin as pd to show differentiation**. You can see importing Modin is simple and **does not require any additional steps.**"
115+
"We will also be importing **stock Pandas as pandas** and **Modin as pd to show differentiation**. You can see importing Modin is simple and **does not require any additional steps.**"
62116
]
63117
},
64118
{
@@ -612,6 +666,7 @@
612666
"modin_time = time.time() - t1\n",
613667
"\n",
614668
"print(\"Pandas Time(seconds):\",pandas_time,\"\\nModin Time(seconds):\",modin_time)\n",
669+
"verify_and_print_times(pandas_time, modin_time)\n",
615670
"outputDict={\"Pandas\":pandas_time,\"Modin\":modin_time}\n",
616671
"plotter(outputDict)"
617672
]
@@ -693,7 +748,7 @@
693748
}
694749
],
695750
"source": [
696-
"print(\"Modin was {}X faster than stock Pandas!\".format(round(pandas_time/modin_time, 2)))"
751+
"verify_and_print_times(pandas_time, modin_time)"
697752
]
698753
},
699754
{
@@ -852,7 +907,7 @@
852907
}
853908
],
854909
"source": [
855-
"print(\"Modin was {}X faster than stock Pandas!\".format(round(pandas_time/modin_time, 2)))"
910+
"verify_and_print_times(pandas_time, modin_time)"
856911
]
857912
},
858913
{
@@ -1000,7 +1055,7 @@
10001055
}
10011056
],
10021057
"source": [
1003-
"print(\"Modin was {}X faster than stock Pandas!\".format(round(pandas_time/modin_time, 2)))"
1058+
"verify_and_print_times(pandas_time, modin_time)"
10041059
]
10051060
},
10061061
{

AI-and-Analytics/Getting-Started-Samples/IntelModin_GettingStarted/sample.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
"conda activate intel-aikit-modin",
2020
"pip install -r requirements.txt # Installing notebook's dependencies",
2121
"pip install runipy # Installing 'runipy' for extended abilities to execute the notebook",
22-
"runipy IntelModin_GettingStarted.ipynb"
22+
"runipy IntelModin_GettingStarted.ipynb # Test 'Modin is faster than pandas' case",
23+
"MODIN_CPUS=1 runipy IntelModin_GettingStarted.ipynb # Test 'Modin is slower than pandas' case"
2324
]
2425
}
2526
]

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/README.md

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,7 @@ The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programmi
1616
_Notice: This example design is only officially supported for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX)_
1717

1818
**Performance**
19-
In this design, we accelerate four database queries as *offload accelerators*. In an offload accelerator scheme, the queries are performed by transferring the relevant data from the CPU host to the FPGA, starting the query kernel on the FPGA, and copying the results back. This means that the relevant performance number is the latency (i.e., the wall clock time) from when the query is requested to the time the output data is accessible by the host. This includes the time to transfer data between the CPU and FPGA over PCIe (with an approximate read and write bandwidth of 6877 and 6582 MB/s, respectively). As shown in the table below, most of the total query time is spent transferring the data between the CPU and FPGA, and the query kernels themselves are a small portion of the total latency.
20-
21-
The performance data below was gathered using the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) with a database scale factor (SF) of 1. Please see the [Database files](#database-files) section for more information on generating data for a scale factor of 1.
22-
23-
| Query | Approximate Data Transfer Time (ms) | Measured Total Query Processing Time (ms)
24-
|:--- |:--- |:---
25-
| 1 | 35 | 39
26-
| 9 | 37 | 43
27-
| 11 | 5 | 11
28-
| 12 | 16 | 26
19+
In this design, we accelerate four database queries as *offload accelerators*. In an offload accelerator scheme, the queries are performed by transferring the relevant data from the CPU host to the FPGA, starting the query kernel on the FPGA, and copying the results back. This means that the relevant performance number is the processing time (i.e., the wall clock time) from when the query is requested to the time the output data is accessible by the host. This includes the time to transfer data between the CPU and FPGA over PCIe (with an approximate read and write bandwidth of 6877 and 6582 MB/s, respectively). Most of the total query time is spent transferring the data between the CPU and FPGA, and the query kernels themselves are a small portion of the total latency.
2920

3021
## Purpose
3122
The database in this tutorial has 8 tables and a set of 21 business-oriented queries with broad industry-wide relevance. This reference design shows how four queries can be accelerated using the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) and oneAPI. To do so, we create a set of common database operators (found in the `src/db_utils/` directory) that are combined in different ways to build the four queries.
@@ -232,7 +223,9 @@ You should see the following output in the console:
232223
Validating query 1 test results
233224
Running Q1 within 90 days of 1998-12-1
234225
Validating query 1 test results
235-
Processing time: 40.2986 ms
226+
Total processing time: 34.389 ms
227+
Kernel processing time: 3.16621 ms
228+
Throughput: 315.835 queries/s
236229
PASSED
237230
```
238231
NOTE: the scale factor 1 (SF=1) database files (`../data/sf1`) are **not** shipped with this reference design. Please refer to the [Database files](#database-files) section for information on how to generate these files yourself.

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/sample.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@
145145
"cd ../..",
146146
"mkdir build-q1",
147147
"cd build-q1",
148-
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=1",
148+
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=1 -DSF_SMALL=1",
149149
"nmake report"
150150
]
151151
},
@@ -156,7 +156,7 @@
156156
"cd ../..",
157157
"mkdir build-q11",
158158
"cd build-q11",
159-
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=11",
159+
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=11 -DSF_SMALL=1",
160160
"nmake report"
161161
]
162162
},
@@ -167,7 +167,7 @@
167167
"cd ../..",
168168
"mkdir build-q12",
169169
"cd build-q12",
170-
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=12",
170+
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=12 -DSF_SMALL=1",
171171
"nmake report"
172172
]
173173
}

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/src/db.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,15 @@ int main(int argc, char* argv[]) {
268268
std::accumulate(total_latency.begin() + 1, total_latency.end(), 0.0) /
269269
(double)(runs - 1);
270270

271+
double kernel_latency_avg =
272+
std::accumulate(kernel_latency.begin() + 1, kernel_latency.end(), 0.0) /
273+
(double)(runs - 1);
274+
271275
// print the performance results
272276
std::cout << "Processing time: " << total_latency_avg << " ms\n";
277+
std::cout << "Kernel time: " << kernel_latency_avg << " ms\n";
278+
std::cout << "Throughput: " << ((1 / kernel_latency_avg) * 1e3)
279+
<< " queries/s\n";
273280
#endif
274281

275282
std::cout << "PASSED\n";
@@ -325,7 +332,7 @@ bool DoQuery1(queue& q, Database& dbinfo, std::string& db_root_dir,
325332
unsigned int low_date_compact = low_date.ToCompact();
326333

327334
std::cout << "Running Q1 within " << DELTA << " days of " << date.year << "-"
328-
<< date.month << "-" << date.day << "\n";
335+
<< date.month << "-" << date.day << std::endl;
329336

330337
// the query output data
331338
std::array<DBDecimal, kQuery1OutSize> sum_qty = {0}, sum_base_price = {0},
@@ -378,7 +385,7 @@ bool DoQuery9(queue& q, Database& dbinfo, std::string& db_root_dir,
378385
// convert the colour regex to uppercase characters (convention)
379386
transform(colour.begin(), colour.end(), colour.begin(), ::toupper);
380387

381-
std::cout << "Running Q9 with colour regex: " << colour << "\n";
388+
std::cout << "Running Q9 with colour regex: " << colour << std::endl;
382389

383390
// the output of the query
384391
std::array<DBDecimal, 25 * 2020> sum_profit;
@@ -424,7 +431,8 @@ bool DoQuery11(queue& q, Database& dbinfo, std::string& db_root_dir,
424431
transform(nation.begin(), nation.end(), nation.begin(), ::toupper);
425432

426433
std::cout << "Running Q11 for nation " << nation.c_str()
427-
<< " (key=" << (int)(dbinfo.n.name_key_map[nation]) << ")\n";
434+
<< " (key=" << (int)(dbinfo.n.name_key_map[nation]) << ")"
435+
<< std::endl;
428436

429437
// the query output
430438
std::vector<DBIdentifier> partkeys(kPartTableSize);
@@ -492,7 +500,7 @@ bool DoQuery12(queue& q, Database& dbinfo, std::string& db_root_dir,
492500

493501
std::cout << "Running Q12 between years " << low_date.year << " and "
494502
<< high_date.year << " for SHIPMODES " << shipmode1 << " and "
495-
<< shipmode2 << "\n";
503+
<< shipmode2 << std::endl;
496504

497505
// the output of the query
498506
std::array<DBDecimal, 2> high_line_count, low_line_count;

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/src/db_utils/Accumulator.hpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -89,13 +89,13 @@ class BRAMAccumulator {
8989
// initialize the memory entries
9090
void Init() {
9191
// initialize the memory entries
92-
for (IndexType i = 0; i < size; i++) {
92+
for (int i = 0; i < size; i++) {
9393
mem[i] = 0;
9494
}
9595

9696
// initialize the cache
9797
#pragma unroll
98-
for (IndexType i = 0; i < cache_size + 1; i++) {
98+
for (int i = 0; i < cache_size + 1; i++) {
9999
cache_value[i] = 0;
100100
cache_tag[i] = 0;
101101
}
@@ -104,34 +104,33 @@ class BRAMAccumulator {
104104
// accumulate 'value' into register 'index' (i.e. registers[index] += value)
105105
void Accumulate(IndexType index, StorageType value) {
106106
// get value from memory
107-
StorageType currVal = mem[index];
107+
StorageType curr_val = mem[index];
108108

109109
// check if value is in cache
110110
#pragma unroll
111-
for (IndexType i = 0; i < cache_size + 1; i++) {
111+
for (int i = 0; i < cache_size + 1; i++) {
112112
if (cache_tag[i] == index) {
113-
currVal = cache_value[i];
113+
curr_val = cache_value[i];
114114
}
115115
}
116116

117117
// write the new value to both the shift register cache and the local mem
118-
const StorageType newVal = currVal + value;
119-
mem[index] = cache_value[cache_size] = newVal;
118+
StorageType new_val = curr_val + value;
119+
mem[index] = new_val;
120+
cache_value[cache_size] = new_val;
120121
cache_tag[cache_size] = index;
121122

122123
// Cache is just a shift register, so shift it
123124
// pushing into back of the shift register done above
124125
#pragma unroll
125-
for (IndexType i = 0; i < cache_size; i++) {
126+
for (int i = 0; i < cache_size; i++) {
126127
cache_value[i] = cache_value[i + 1];
127128
cache_tag[i] = cache_tag[i + 1];
128129
}
129130
}
130131

131132
// get the value of memory at 'index'
132-
StorageType Get(IndexType index) {
133-
return mem[index];
134-
}
133+
StorageType Get(IndexType index) { return mem[index]; }
135134

136135
// internal storage
137136
StorageType mem[size];
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#ifndef __CACHED_MEMORY_HPP__
2+
#define __CACHED_MEMORY_HPP__
3+
4+
template <typename StorageType, int n, int cache_n,
5+
typename IndexType = int>
6+
class CachedMemory {
7+
// static asserts
8+
static_assert(n > 0);
9+
static_assert(cache_n >= 0);
10+
static_assert(std::is_arithmetic<StorageType>::value,
11+
"StorageType must be arithmetic to support accumulation");
12+
static_assert(std::is_integral<IndexType>::value,
13+
"IndexType must be an integral type");
14+
static_assert(std::numeric_limits<IndexType>::max() >= (n - 1),
15+
"IndexType must be large enough to index the entire array");
16+
17+
public:
18+
CachedMemory() {}
19+
20+
void Init(StorageType init_val = 0) {
21+
for (int i = 0; i < n; i++) {
22+
mem[i] = init_val;
23+
}
24+
#pragma unroll
25+
for (int i = 0; i < cache_n + 1; i++) {
26+
cache_value[i] = init_val;
27+
cache_tag[i] = 0;
28+
}
29+
}
30+
31+
auto Get(IndexType idx) {
32+
// grab the value from memory
33+
StorageType ret = mem[idx];
34+
35+
// check for this value in the cache as well
36+
#pragma unroll
37+
for (int i = 0; i < cache_n + 1; i++) {
38+
if (cache_tag[i] == idx) {
39+
ret = cache_value[i];
40+
}
41+
}
42+
43+
return ret;
44+
}
45+
46+
void Set(IndexType idx, StorageType val) {
47+
// store the new value in the actual memory, and the start of the shift
48+
// register cache
49+
mem[idx] = val;
50+
cache_value[cache_n] = val;
51+
cache_tag[cache_n] = idx;
52+
53+
// shift the shift register cache
54+
#pragma unroll
55+
for (int i = 0; i < cache_n; i++) {
56+
cache_value[i] = cache_value[i + 1];
57+
cache_tag[i] = cache_tag[i + 1];
58+
}
59+
}
60+
61+
private:
62+
// internal storage
63+
StorageType mem[n];
64+
65+
// internal cache for hiding write latency
66+
[[intel::fpga_register]]
67+
StorageType cache_value[cache_n + 1];
68+
69+
[[intel::fpga_register]]
70+
int cache_tag[cache_n + 1];
71+
};
72+
73+
#endif /* __CACHED_MEMORY_HPP__ */

0 commit comments

Comments
 (0)