From 3aec3eb8851e3520a87f76b8668f6d36eb7a0926 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 14:14:05 -0700 Subject: [PATCH 1/9] added simd dot_prod implementation Signed-off-by: Ethan Hirsch --- .../Intrinsics/src/intrin_dot_sample.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index 7a353b8853..1ac331caa6 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -32,11 +32,15 @@ #include #include #include +#include + #define SIZE 24 // assumes size is a multiple of 8 because // Intel(R) AVX registers will store 8, 32bit elements. // Computes dot product using C float dot_product(float *a, float *b); +// Computes dot product using SIMD +float dot_product_SIMD(float *a, float *b); // Computes dot product using Intel(R) SSE intrinsics float dot_product_intrin(float *a, float *b); // Computes dot product using Intel(R) AVX intrinsics @@ -62,6 +66,9 @@ int main() { product = dot_product(x, y); printf("Dot Product computed by C: %f\n", product); + product = dot_product_SIMD(x, y); + printf("Dot Product computed by C + SIMD: %f\n", product); + product = dot_product_intrin(x, y); printf("Dot Product computed by Intel(R) SSE3 intrinsics: %f\n", product); @@ -106,6 +113,16 @@ float dot_product(float *a, float *b) { return sum; } +float dot_product_SIMD(float *a, float *b) { + int i; + float sum = 0.0f; // float accumulator: an int would truncate each product + #pragma omp simd + for (i = 0; i < SIZE; i++) { + sum += a[i] * b[i]; + } + return sum; +} + // The Visual Studio* editor will show the following section as disabled as it // does not know that __INTEL_COMPILER is defined by the Intel(R) Compiler #if __INTEL_COMPILER From 2b816f40b5ada52e15a1b16ddf511340444ad1b5 Mon Sep 17 00:00:00 2001 From: 
Ethan Hirsch Date: Mon, 24 Aug 2020 14:22:55 -0700 Subject: [PATCH 2/9] adding timing Signed-off-by: Ethan Hirsch --- .../Intrinsics/src/intrin_dot_sample.cpp | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index 1ac331caa6..ca9df40779 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #define SIZE 24 // assumes size is a multiple of 8 because // Intel(R) AVX registers will store 8, 32bit elements. @@ -51,20 +52,42 @@ short MMX_dot_product(short *a, short *b); #define MMX_DOT_PROD_ENABLED (__INTEL_COMPILER || (_MSC_VER && !_WIN64)) +// Object to allow for measuring computation time +class TimeInterval { + public: + TimeInterval() : start_(std::chrono::steady_clock::now()) {} + + double Elapsed() { + auto now = std::chrono::steady_clock::now(); + return std::chrono::duration_cast(now - start_).count(); + } + + private: + using Duration = std::chrono::duration; + std::chrono::steady_clock::time_point start_; +}; + int main() { float x[SIZE], y[SIZE]; short a[SIZE], b[SIZE]; int i; float product; short mmx_product; + double time; for (i = 0; i < SIZE; i++) { x[i] = i; y[i] = i; a[i] = i; b[i] = i; } - product = dot_product(x, y); + + { + TimeInterval t; + product = dot_product(x, y); + time = t.Elapsed(); + } printf("Dot Product computed by C: %f\n", product); + printf("---Computation time: %f\n", time); product = dot_product_SIMD(x, y); printf("Dot Product computed by C + SIMD: %f\n", product); From 3aef29e590be6b4db2df7643699eb8ac0f2d4c51 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 14:24:16 -0700 Subject: [PATCH 3/9] increase size Signed-off-by: Ethan Hirsch 
--- .../CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index ca9df40779..6a7901eedb 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -35,7 +35,7 @@ #include #include -#define SIZE 24 // assumes size is a multiple of 8 because +#define SIZE 128 // assumes size is a multiple of 8 because // Intel(R) AVX registers will store 8, 32bit elements. // Computes dot product using C From 04a2bd93b151409c968f5c0266ababf5d61f2a41 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 14:28:41 -0700 Subject: [PATCH 4/9] removed timing bc it's useless lol Signed-off-by: Ethan Hirsch --- .../Intrinsics/src/intrin_dot_sample.cpp | 23 +------------------ 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index 6a7901eedb..a0f9c9d2f8 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -52,28 +52,12 @@ short MMX_dot_product(short *a, short *b); #define MMX_DOT_PROD_ENABLED (__INTEL_COMPILER || (_MSC_VER && !_WIN64)) -// Object to allow for measuring computation time -class TimeInterval { - public: - TimeInterval() : start_(std::chrono::steady_clock::now()) {} - - double Elapsed() { - auto now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(now - start_).count(); - } - - private: - using Duration = std::chrono::duration; - std::chrono::steady_clock::time_point start_; -}; 
- int main() { float x[SIZE], y[SIZE]; short a[SIZE], b[SIZE]; int i; float product; short mmx_product; - double time; for (i = 0; i < SIZE; i++) { x[i] = i; y[i] = i; @@ -81,13 +65,8 @@ int main() { b[i] = i; } - { - TimeInterval t; - product = dot_product(x, y); - time = t.Elapsed(); - } + product = dot_product(x, y); printf("Dot Product computed by C: %f\n", product); - printf("---Computation time: %f\n", time); product = dot_product_SIMD(x, y); printf("Dot Product computed by C + SIMD: %f\n", product); From 745e53bd3656bb0be2d25d756088c1c413d51204 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 14:52:13 -0700 Subject: [PATCH 5/9] added reduction to simd Signed-off-by: Ethan Hirsch --- .../CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index a0f9c9d2f8..f8208d65e6 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -118,7 +118,7 @@ float dot_product(float *a, float *b) { float dot_product_SIMD(float *a, float *b) { int i; float sum = 0.0f; // float accumulator: an int would truncate each product - #pragma omp simd + #pragma omp simd reduction(+:sum) for (i = 0; i < SIZE; i++) { sum += a[i] * b[i]; } From b6206bc23f4e12ddfd46198983f7e85d0856a195 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 14:55:02 -0700 Subject: [PATCH 6/9] sample size Signed-off-by: Ethan Hirsch --- .../CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index f8208d65e6..c5e707c284 100644 --- 
a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -35,7 +35,7 @@ #include #include -#define SIZE 128 // assumes size is a multiple of 8 because +#define SIZE 24 // assumes size is a multiple of 8 because // Intel(R) AVX registers will store 8, 32bit elements. // Computes dot product using C From 0022320dd9d5ab36ebd139e5e26527cb10bd63b3 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 14:56:33 -0700 Subject: [PATCH 7/9] updated sample output Signed-off-by: Ethan Hirsch --- .../C++/CompilerInfrastructure/Intrinsics/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md index a99d5b006c..1124c82f7d 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/README.md @@ -62,6 +62,7 @@ These intrinsics samples have relatively few modifiable parameters. 
However, cer ### Example of Output ``` Dot Product computed by C: 4324.000000 +Dot Product computed by C + SIMD: 4324.000000 Dot Product computed by Intel(R) SSE3 intrinsics: 4324.000000 Dot Product computed by Intel(R) AVX2 intrinsics: 4324.000000 Dot Product computed by Intel(R) AVX intrinsics: 4324.000000 From 4ba1e5f45b596ca74df88d80a5dc221e9f8ed909 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Mon, 24 Aug 2020 15:39:27 -0700 Subject: [PATCH 8/9] removed unused lib Signed-off-by: Ethan Hirsch --- .../CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index c5e707c284..441a5a530e 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -33,7 +33,6 @@ #include #include #include -#include #define SIZE 24 // assumes size is a multiple of 8 because // Intel(R) AVX registers will store 8, 32bit elements. 
From 9a75101e5bd8a8537e289f02b37a70cebe3a7bf0 Mon Sep 17 00:00:00 2001 From: Ethan Hirsch Date: Tue, 25 Aug 2020 09:37:35 -0700 Subject: [PATCH 9/9] fixed formatting with clang-format Signed-off-by: Ethan Hirsch --- .../Intrinsics/src/intrin_dot_sample.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp index 441a5a530e..f4774140db 100644 --- a/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp +++ b/DirectProgramming/C++/CompilerInfrastructure/Intrinsics/src/intrin_dot_sample.cpp @@ -30,9 +30,9 @@ * */ #include +#include #include #include -#include #define SIZE 24 // assumes size is a multiple of 8 because // Intel(R) AVX registers will store 8, 32bit elements. @@ -117,7 +117,7 @@ float dot_product(float *a, float *b) { float dot_product_SIMD(float *a, float *b) { int i; float sum = 0.0f; // float accumulator: an int would truncate each product - #pragma omp simd reduction(+:sum) +#pragma omp simd reduction(+ : sum) for (i = 0; i < SIZE; i++) { sum += a[i] * b[i]; } @@ -211,7 +211,7 @@ float dot_product_intrin(float *a, float *b) { b + i); // loads unaligned array b into num2 num2= b[3] b[2] b[1] b[0] num3 = _mm_mul_ps(num1, num2); // performs multiplication num3 = - // a[3]*b[3] a[2]*b[2] a[1]*b[1] a[0]*b[0] + // a[3]*b[3] a[2]*b[2] a[1]*b[1] a[0]*b[0] num3 = _mm_hadd_ps(num3, num3); // performs horizontal addition // num3= a[3]*b[3]+ a[2]*b[2] a[1]*b[1]+a[0]*b[0] a[3]*b[3]+ a[2]*b[2] // a[1]*b[1]+a[0]*b[0]