Skip to content

Commit 0eb96f9

Browse files
committed
apply recent changes
Signed-off-by: Alexander Myskov <[email protected]>
2 parents 6682d8d + 0b1f74e commit 0eb96f9

File tree

27 files changed

+769
-431
lines changed

27 files changed

+769
-431
lines changed

AI-and-Analytics/Getting-Started-Samples/IntelModin_GettingStarted/IntelModin_GettingStarted.ipynb

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,60 @@
4747
"import time"
4848
]
4949
},
50+
{
51+
"cell_type": "code",
52+
"execution_count": null,
53+
"metadata": {},
54+
"outputs": [],
55+
"source": [
56+
"# ****** Do not change the code in this cell! It verifies that the notebook is being run correctly! ******\n",
57+
"\n",
58+
"def verify_and_print_times(pandas_time, modin_time):\n",
59+
" if modin_time < pandas_time:\n",
60+
" print(f\"Modin was {pandas_time / modin_time:.2f}X faster than stock pandas!\")\n",
61+
" return\n",
62+
" print(\n",
63+
" f\"Oops, stock pandas appears to be {modin_time / pandas_time:.2f}X faster than Modin in this case. \"\n",
64+
" \"This is unlikely but could happen sometimes on certain machines/environments/datasets. \"\n",
65+
" \"One of the most probable reasons is the excessive amount of partitions being assigned to a single worker. \"\n",
66+
" \"You may visit Modin's optimization guide in order to learn more about such cases and how to fix them: \"\n",
67+
" \"\\nhttps://modin.readthedocs.io/en/latest/usage_guide/optimization_notes/index.html\\n\\n\"\n",
68+
" \"But first, verify that you're using the latest Modin version, also, try to use different executions, \"\n",
69+
" \"for basic usage we recommend non-experimental 'PandasOnRay'.\\n\"\n",
70+
" \"Current configuration is:\"\n",
71+
" )\n",
72+
" try:\n",
73+
" from modin.utils import get_current_execution\n",
74+
"\n",
75+
" execution = get_current_execution()\n",
76+
" except ImportError:\n",
77+
" # for modin version < 0.12.0\n",
78+
" try:\n",
79+
" from modin.utils import get_current_backend\n",
80+
"\n",
81+
" execution = get_current_backend()\n",
82+
" except ImportError:\n",
83+
" # for modin versions < 0.8.1\n",
84+
" execution = (\n",
85+
" \"Can't deduce the current execution, your Modin version is too old!\"\n",
86+
" )\n",
87+
" print(f\"\\tExecution: {execution}\")\n",
88+
" try:\n",
89+
" import modin.config as cfg\n",
90+
"\n",
91+
" print(\n",
92+
" f\"\\tIs experimental: {cfg.IsExperimental.get()}\\n\"\n",
93+
" f\"\\tNumber of CPUs to utilize by Modin (check that Modin uses all CPUs on your machine): {cfg.CpuCount.get()}\\n\"\n",
94+
" f\"\\tIs in debug mode (debug mode may perform slower): {cfg.IsDebug.get()}\"\n",
95+
" )\n",
96+
" except (ImportError, AttributeError):\n",
97+
" # for modin versions < 0.8.2\n",
98+
" print(\"\\tCan't deduce Modin configuration, your Modin version is too old!\")\n",
99+
" import modin\n",
100+
"\n",
101+
" print(f\"\\tModin version: {modin.__version__}\")"
102+
]
103+
},
50104
{
51105
"cell_type": "markdown",
52106
"metadata": {},
@@ -58,7 +112,7 @@
58112
"cell_type": "markdown",
59113
"metadata": {},
60114
"source": [
61-
"We will also be importing **stock Pandas** and **Modin as pd to show differentiation**. You can see importing Modin is simple and **does not require any additional steps.**"
115+
"We will also be importing **stock Pandas as pandas** and **Modin as pd to show differentiation**. You can see importing Modin is simple and **does not require any additional steps.**"
62116
]
63117
},
64118
{
@@ -612,6 +666,7 @@
612666
"modin_time = time.time() - t1\n",
613667
"\n",
614668
"print(\"Pandas Time(seconds):\",pandas_time,\"\\nModin Time(seconds):\",modin_time)\n",
669+
"verify_and_print_times(pandas_time, modin_time)\n",
615670
"outputDict={\"Pandas\":pandas_time,\"Modin\":modin_time}\n",
616671
"plotter(outputDict)"
617672
]
@@ -693,7 +748,7 @@
693748
}
694749
],
695750
"source": [
696-
"print(\"Modin was {}X faster than stock Pandas!\".format(round(pandas_time/modin_time, 2)))"
751+
"verify_and_print_times(pandas_time, modin_time)"
697752
]
698753
},
699754
{
@@ -852,7 +907,7 @@
852907
}
853908
],
854909
"source": [
855-
"print(\"Modin was {}X faster than stock Pandas!\".format(round(pandas_time/modin_time, 2)))"
910+
"verify_and_print_times(pandas_time, modin_time)"
856911
]
857912
},
858913
{
@@ -1000,7 +1055,7 @@
10001055
}
10011056
],
10021057
"source": [
1003-
"print(\"Modin was {}X faster than stock Pandas!\".format(round(pandas_time/modin_time, 2)))"
1058+
"verify_and_print_times(pandas_time, modin_time)"
10041059
]
10051060
},
10061061
{

AI-and-Analytics/Getting-Started-Samples/IntelModin_GettingStarted/sample.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
"conda activate intel-aikit-modin",
2020
"pip install -r requirements.txt # Installing notebook's dependencies",
2121
"pip install runipy # Installing 'runipy' for extended abilities to execute the notebook",
22-
"runipy IntelModin_GettingStarted.ipynb"
22+
"runipy IntelModin_GettingStarted.ipynb # Test 'Modin is faster than pandas' case",
23+
"MODIN_CPUS=1 runipy IntelModin_GettingStarted.ipynb # Test 'Modin is slower than pandas' case"
2324
]
2425
}
2526
]

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/README.md

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,7 @@ The [oneAPI Programming Guide](https://software.intel.com/en-us/oneapi-programmi
1616
_Notice: This example design is only officially supported for the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX)_
1717

1818
**Performance**
19-
In this design, we accelerate four database queries as *offload accelerators*. In an offload accelerator scheme, the queries are performed by transferring the relevant data from the CPU host to the FPGA, starting the query kernel on the FPGA, and copying the results back. This means that the relevant performance number is the latency (i.e., the wall clock time) from when the query is requested to the time the output data is accessible by the host. This includes the time to transfer data between the CPU and FPGA over PCIe (with an approximate read and write bandwidth of 6877 and 6582 MB/s, respectively). As shown in the table below, most of the total query time is spent transferring the data between the CPU and FPGA, and the query kernels themselves are a small portion of the total latency.
20-
21-
The performance data below was gathered using the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) with a database scale factor (SF) of 1. Please see the [Database files](#database-files) section for more information on generating data for a scale factor of 1.
22-
23-
| Query | Approximate Data Transfer Time (ms) | Measured Total Query Processing Time (ms)
24-
|:--- |:--- |:---
25-
| 1 | 35 | 39
26-
| 9 | 37 | 43
27-
| 11 | 5 | 11
28-
| 12 | 16 | 26
19+
In this design, we accelerate four database queries as *offload accelerators*. In an offload accelerator scheme, the queries are performed by transferring the relevant data from the CPU host to the FPGA, starting the query kernel on the FPGA, and copying the results back. This means that the relevant performance number is the processing time (i.e., the wall clock time) from when the query is requested to the time the output data is accessible by the host. This includes the time to transfer data between the CPU and FPGA over PCIe (with an approximate read and write bandwidth of 6877 and 6582 MB/s, respectively). Most of the total query time is spent transferring the data between the CPU and FPGA, and the query kernels themselves are a small portion of the total latency.
2920

3021
## Purpose
3122
The database in this tutorial has 8 tables and a set of 21 business-oriented queries with broad industry-wide relevance. This reference design shows how four queries can be accelerated using the Intel® FPGA PAC D5005 (with Intel Stratix® 10 SX) and oneAPI. To do so, we create a set of common database operators (found in the `src/db_utils/` directory) that are combined in different ways to build the four queries.
@@ -232,7 +223,9 @@ You should see the following output in the console:
232223
Validating query 1 test results
233224
Running Q1 within 90 days of 1998-12-1
234225
Validating query 1 test results
235-
Processing time: 40.2986 ms
226+
Total processing time: 34.389 ms
227+
Kernel processing time: 3.16621 ms
228+
Throughput: 315.835 queries/s
236229
PASSED
237230
```
238231
NOTE: the scale factor 1 (SF=1) database files (`../data/sf1`) are **not** shipped with this reference design. Please refer to the [Database files](#database-files) section for information on how to generate these files yourself.

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/sample.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@
145145
"cd ../..",
146146
"mkdir build-q1",
147147
"cd build-q1",
148-
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=1",
148+
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=1 -DSF_SMALL=1",
149149
"nmake report"
150150
]
151151
},
@@ -156,7 +156,7 @@
156156
"cd ../..",
157157
"mkdir build-q11",
158158
"cd build-q11",
159-
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=11",
159+
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=11 -DSF_SMALL=1",
160160
"nmake report"
161161
]
162162
},
@@ -167,7 +167,7 @@
167167
"cd ../..",
168168
"mkdir build-q12",
169169
"cd build-q12",
170-
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=12",
170+
"cmake -G \"NMake Makefiles\" ../ReferenceDesigns/db -DQUERY=12 -DSF_SMALL=1",
171171
"nmake report"
172172
]
173173
}

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/src/db.cpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -268,8 +268,15 @@ int main(int argc, char* argv[]) {
268268
std::accumulate(total_latency.begin() + 1, total_latency.end(), 0.0) /
269269
(double)(runs - 1);
270270

271+
double kernel_latency_avg =
272+
std::accumulate(kernel_latency.begin() + 1, kernel_latency.end(), 0.0) /
273+
(double)(runs - 1);
274+
271275
// print the performance results
272276
std::cout << "Processing time: " << total_latency_avg << " ms\n";
277+
std::cout << "Kernel time: " << kernel_latency_avg << " ms\n";
278+
std::cout << "Throughput: " << ((1 / kernel_latency_avg) * 1e3)
279+
<< " queries/s\n";
273280
#endif
274281

275282
std::cout << "PASSED\n";
@@ -325,7 +332,7 @@ bool DoQuery1(queue& q, Database& dbinfo, std::string& db_root_dir,
325332
unsigned int low_date_compact = low_date.ToCompact();
326333

327334
std::cout << "Running Q1 within " << DELTA << " days of " << date.year << "-"
328-
<< date.month << "-" << date.day << "\n";
335+
<< date.month << "-" << date.day << std::endl;
329336

330337
// the query output data
331338
std::array<DBDecimal, kQuery1OutSize> sum_qty = {0}, sum_base_price = {0},
@@ -378,7 +385,7 @@ bool DoQuery9(queue& q, Database& dbinfo, std::string& db_root_dir,
378385
// convert the colour regex to uppercase characters (convention)
379386
transform(colour.begin(), colour.end(), colour.begin(), ::toupper);
380387

381-
std::cout << "Running Q9 with colour regex: " << colour << "\n";
388+
std::cout << "Running Q9 with colour regex: " << colour << std::endl;
382389

383390
// the output of the query
384391
std::array<DBDecimal, 25 * 2020> sum_profit;
@@ -424,7 +431,8 @@ bool DoQuery11(queue& q, Database& dbinfo, std::string& db_root_dir,
424431
transform(nation.begin(), nation.end(), nation.begin(), ::toupper);
425432

426433
std::cout << "Running Q11 for nation " << nation.c_str()
427-
<< " (key=" << (int)(dbinfo.n.name_key_map[nation]) << ")\n";
434+
<< " (key=" << (int)(dbinfo.n.name_key_map[nation]) << ")"
435+
<< std::endl;
428436

429437
// the query output
430438
std::vector<DBIdentifier> partkeys(kPartTableSize);
@@ -492,7 +500,7 @@ bool DoQuery12(queue& q, Database& dbinfo, std::string& db_root_dir,
492500

493501
std::cout << "Running Q12 between years " << low_date.year << " and "
494502
<< high_date.year << " for SHIPMODES " << shipmode1 << " and "
495-
<< shipmode2 << "\n";
503+
<< shipmode2 << std::endl;
496504

497505
// the output of the query
498506
std::array<DBDecimal, 2> high_line_count, low_line_count;

DirectProgramming/DPC++FPGA/ReferenceDesigns/db/src/db_utils/Accumulator.hpp

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -89,13 +89,13 @@ class BRAMAccumulator {
8989
// initialize the memory entries
9090
void Init() {
9191
// initialize the memory entries
92-
for (IndexType i = 0; i < size; i++) {
92+
for (int i = 0; i < size; i++) {
9393
mem[i] = 0;
9494
}
9595

9696
// initialize the cache
9797
#pragma unroll
98-
for (IndexType i = 0; i < cache_size + 1; i++) {
98+
for (int i = 0; i < cache_size + 1; i++) {
9999
cache_value[i] = 0;
100100
cache_tag[i] = 0;
101101
}
@@ -104,34 +104,33 @@ class BRAMAccumulator {
104104
// accumulate 'value' into register 'index' (i.e. registers[index] += value)
105105
void Accumulate(IndexType index, StorageType value) {
106106
// get value from memory
107-
StorageType currVal = mem[index];
107+
StorageType curr_val = mem[index];
108108

109109
// check if value is in cache
110110
#pragma unroll
111-
for (IndexType i = 0; i < cache_size + 1; i++) {
111+
for (int i = 0; i < cache_size + 1; i++) {
112112
if (cache_tag[i] == index) {
113-
currVal = cache_value[i];
113+
curr_val = cache_value[i];
114114
}
115115
}
116116

117117
// write the new value to both the shift register cache and the local mem
118-
const StorageType newVal = currVal + value;
119-
mem[index] = cache_value[cache_size] = newVal;
118+
StorageType new_val = curr_val + value;
119+
mem[index] = new_val;
120+
cache_value[cache_size] = new_val;
120121
cache_tag[cache_size] = index;
121122

122123
// Cache is just a shift register, so shift it
123124
// pushing into back of the shift register done above
124125
#pragma unroll
125-
for (IndexType i = 0; i < cache_size; i++) {
126+
for (int i = 0; i < cache_size; i++) {
126127
cache_value[i] = cache_value[i + 1];
127128
cache_tag[i] = cache_tag[i + 1];
128129
}
129130
}
130131

131132
// get the value of memory at 'index'
132-
StorageType Get(IndexType index) {
133-
return mem[index];
134-
}
133+
StorageType Get(IndexType index) { return mem[index]; }
135134

136135
// internal storage
137136
StorageType mem[size];
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#ifndef __CACHED_MEMORY_HPP__
2+
#define __CACHED_MEMORY_HPP__
3+
4+
template <typename StorageType, int n, int cache_n,
5+
typename IndexType = int>
6+
class CachedMemory {
7+
// static asserts
8+
static_assert(n > 0);
9+
static_assert(cache_n >= 0);
10+
static_assert(std::is_arithmetic<StorageType>::value,
11+
"StorageType must be arithmetic to support accumulation");
12+
static_assert(std::is_integral<IndexType>::value,
13+
"IndexType must be an integral type");
14+
static_assert(std::numeric_limits<IndexType>::max() >= (n - 1),
15+
"IndexType must be large enough to index the entire array");
16+
17+
public:
18+
CachedMemory() {}
19+
20+
void Init(StorageType init_val = 0) {
21+
for (int i = 0; i < n; i++) {
22+
mem[i] = init_val;
23+
}
24+
#pragma unroll
25+
for (int i = 0; i < cache_n + 1; i++) {
26+
cache_value[i] = init_val;
27+
cache_tag[i] = 0;
28+
}
29+
}
30+
31+
auto Get(IndexType idx) {
32+
// grab the value from memory
33+
StorageType ret = mem[idx];
34+
35+
// check for this value in the cache as well
36+
#pragma unroll
37+
for (int i = 0; i < cache_n + 1; i++) {
38+
if (cache_tag[i] == idx) {
39+
ret = cache_value[i];
40+
}
41+
}
42+
43+
return ret;
44+
}
45+
46+
void Set(IndexType idx, StorageType val) {
47+
// store the new value in the actual memory, and the start of the shift
48+
// register cache
49+
mem[idx] = val;
50+
cache_value[cache_n] = val;
51+
cache_tag[cache_n] = idx;
52+
53+
// shift the shift register cache
54+
#pragma unroll
55+
for (int i = 0; i < cache_n; i++) {
56+
cache_value[i] = cache_value[i + 1];
57+
cache_tag[i] = cache_tag[i + 1];
58+
}
59+
}
60+
61+
private:
62+
// internal storage
63+
StorageType mem[n];
64+
65+
// internal cache for hiding write latency
66+
[[intel::fpga_register]]
67+
StorageType cache_value[cache_n + 1];
68+
69+
[[intel::fpga_register]]
70+
int cache_tag[cache_n + 1];
71+
};
72+
73+
#endif /* __CACHED_MEMORY_HPP__ */

0 commit comments

Comments
 (0)