oneapi-src · JoeOster · Aug 25, 2020 · Jul 9, 2020 · Jul 16, 2020 · Jul 20, 2020
diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md
@@ -62,6 +62,7 @@ These intrinsics samples have relatively few modifiable parameters. However, cer
 ### Example of Output
 ```
 Dot Product computed by C:  4324.000000
+Dot Product computed by C + SIMD:  4324.000000
 Dot Product computed by Intel(R) SSE3 intrinsics:  4324.000000
 Dot Product computed by Intel(R) AVX2 intrinsics:  4324.000000
 Dot Product computed by Intel(R) AVX intrinsics:  4324.000000

diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp
@@ -30,13 +30,17 @@
  *
  */
 #include <immintrin.h>
+#include <omp.h>
 #include <pmmintrin.h>
 #include <stdio.h>
+
 #define SIZE 24  // assumes size is a multiple of 8 because
 // Intel(R) AVX registers will store 8, 32bit elements.
 
 // Computes dot product using C
 float dot_product(float *a, float *b);
+// Computes dot product using plain C with an OpenMP SIMD directive
+float dot_product_SIMD(float *a, float *b);
 // Computes dot product using Intel(R) SSE intrinsics
 float dot_product_intrin(float *a, float *b);
 // Computes dot product using Intel(R) AVX intrinsics
@@ -59,9 +63,13 @@ int main() {
     a[i] = i;
     b[i] = i;
   }
+
   product = dot_product(x, y);
   printf("Dot Product computed by C:  %f\n", product);
 
+  product = dot_product_SIMD(x, y);
+  printf("Dot Product computed by C + SIMD:  %f\n", product);
+
   product = dot_product_intrin(x, y);
   printf("Dot Product computed by Intel(R) SSE3 intrinsics:  %f\n", product);
 
@@ -106,6 +114,16 @@ float dot_product(float *a, float *b) {
   return sum;
 }
 
+// Dot product over the first SIZE elements of a and b, using #pragma omp simd.
+float dot_product_SIMD(float *a, float *b) {
+  float sum = 0.0f;  // float, not int: an int accumulator truncates the sum
+#pragma omp simd reduction(+ : sum)
+  for (int i = 0; i < SIZE; i++) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
 // The Visual Studio* editor will show the following section as disabled as it
 // does not know that __INTEL_COMPILER is defined by the Intel(R) Compiler
 #if __INTEL_COMPILER
@@ -193,7 +211,7 @@ float dot_product_intrin(float *a, float *b) {
         b +
         i);  // loads unaligned array b into num2  num2= b[3]   b[2]   b[1] b[0]
     num3 = _mm_mul_ps(num1, num2);  // performs multiplication   num3 =
-                                    // a[3]*b[3]  a[2]*b[2]  a[1]*b[1]  a[0]*b[0]
+                                    // a[3]*b[3]  a[2]*b[2]  a[1]*b[1] a[0]*b[0]
     num3 = _mm_hadd_ps(num3, num3);  // performs horizontal addition
     // num3=  a[3]*b[3]+ a[2]*b[2]  a[1]*b[1]+a[0]*b[0]  a[3]*b[3]+ a[2]*b[2]
     // a[1]*b[1]+a[0]*b[0]