diff --git a/CODEOWNERS b/CODEOWNERS new file mode 100644 index 0000000000..ef60028867 --- /dev/null +++ b/CODEOWNERS @@ -0,0 +1,23 @@ +# Each line is a file pattern followed by one or more owners. + +# These owners will be the default owners for everything in +# the repo. Unless a later match takes precedence, +# they will be requested for +# review when someone opens a pull request. +* @JoeOster @pmpeter1 @tomlenth + +# This owner owns any files in the given +# directory at the root of the repository and any of its +# subdirectories. +/DirectProgramming/DPC++/ @JoeOster @moushumi-maria, @pmpeter1 +/DirectProgramming/Jupyter/ @JoeOster @praveenkk123 +/DirectProgramming/C++/ @JoeOster @pmpeter1 @Propanu +/DirectProgramming/Fortran/ @JoeOster @pmpeter1 @Propanu +/Tools/ @JoeOster @Propanu +/Libraries/ @JoeOster @Propanu @mav-intel @JoeOster + +# Reviwers for all sample.json modification +*sample.json @pfische1 @mkitez + +# Reviewer for all readme files +*/README.md @tomlenth diff --git a/DirectProgramming/C++/MandelbrotOMP/Makefile b/DirectProgramming/C++/MandelbrotOMP/Makefile new file mode 100644 index 0000000000..306f762aa7 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/Makefile @@ -0,0 +1,51 @@ +#============================================================== +# +# SAMPLE SOURCE CODE - SUBJECT TO THE TERMS OF SAMPLE CODE LICENSE AGREEMENT, +# http://software.intel.com/en-us/articles/intel-sample-source-code-license-agreement/ +# +# Copyright Intel Corporation +# +# THIS FILE IS PROVIDED "AS IS" WITH NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING BUT +# NOT LIMITED TO ANY IMPLIED WARRANTY OF MERCHANTABILITY, FITNESS FOR A PARTICULAR +# PURPOSE, NON-INFRINGEMENT OF INTELLECTUAL PROPERTY RIGHTS. +# +# ============================================================= +CXX := icpc +SRCDIR := src +BUILDDIR := release +CFLAGS := -O3 -ipo -qopenmp -std=c++11 +EXTRA_CFLAGS := +LIBFLAGS := -qopenmp + +ifdef perf_num + EXTRA_CFLAGS += -D PERF_NUM +endif + +TARGET := $(BUILDDIR)/MergeSort + +icpc: $(TARGET) + +SOURCES := $(wildcard $(SRCDIR)/*.cpp) +OBJECTS := $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(SOURCES:.cpp=.o)) + +$(TARGET): $(OBJECTS) + @echo " Linking..." + $(CXX) $^ $(LIBFLAGS) -o $(TARGET) + +$(BUILDDIR)/%.o: $(SRCDIR)/%.cpp + @mkdir -p $(BUILDDIR) + $(CXX) -c $(CFLAGS) $(EXTRA_CFLAGS) -o $@ $< + +run: $(TARGET) +ifeq ($(shell uname -s),Darwin) + @export DYLD_LIBRARY_PATH="$(LIBRARY_PATH)"; ./$(TARGET) $(option) +else + ./$(TARGET) $(option) +endif + +clean: + @echo " Cleaning..." + @rm -fr $(BUILDDIR) $(TARGET) 2>/dev/null || true + @rm -f *.png + +.PHONY: clean diff --git a/DirectProgramming/C++/MandelbrotOMP/README.md b/DirectProgramming/C++/MandelbrotOMP/README.md new file mode 100644 index 0000000000..63dbe2063b --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/README.md @@ -0,0 +1,98 @@ +# `Mandelbrot` Sample + +Mandelbrot is an infinitely complex fractal patterning that is derived from a simple formula. This sample demonstrates how to accelerate program performance with SIMD and parallelization using OpenMP*, in the context of calculating the mandelbrot set. + + +| Optimized for | Description +|:--- |:--- +| OS | MacOS Catalina or newer; Linux* Ubuntu* 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® C++ Compiler 19.1 or newer +| What you will learn | How to optimize a scalar implementation using OpenMP pragmas +| Time to complete | 15 minutes + +Performance number tabulation + +| Mandelbrot Version | Performance data +|:--- |:--- +| Scalar baseline | 1.0 +| OpenMP SIMD | 2x speedup +| OpenMP parallel | 6x speedup +| OpenMP SIMD + parallel | 10x speedup + + +## Purpose + +Mandelbrot is a C++ application that generates a fractal image by tracking how many iterations of the function z_n+1 = z_n^2 + c remain bounded, where c is a coordinate on the complex plane and z_0 = 0. In performing this calculation, complex numbers belonging to the mandelbrot set will take infinite iterations as they will always remain bounded. So a maximum depth of iterations is set so that the program may execute in finite time. + +Each point on the complex plane can be calculated independently, which lends the calculation of the mandelbrot image to parallelism. Furthermore, since the calculation on each point is identical, the program can take advantage of SIMD directives to get even greater performance. This code sample demonstrates how to optimize a serial implementation of the mandelbrot image calculation using OpenMP pragmas for SIMD and parallelization. + + +## Key Implementation Details + +The 4 mandelbrot function implementations are identically written, with the only difference being the progressive use of OpenMP pragmas for enabling parallelization and SIMD. + + +## License + +This code sample is licensed under MIT license. + + +## Building the `Mandelbrot` Program + +Perform the following steps: +1. Build the program using the following `make` commands. +``` +$ export perf_num=1 *optional, will enable performance tabulation mode +$ make +``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + + +## Running the Sample + +### Application Parameters +You can modify the Mandelbrot parameters from within the main() definition near the head. The configurable parameters allow one to modify the dimensions(resolution) of the output image, which will also affect the execution time of the program. max_depth defines the upper limit of iterations the mandelbrot function will take to calculate a single point: + int height = 1024; + int width = 2048; + int max_depth = 100; + +In mandelbrot.cpp, the schedule(, ) pragmas in the OpenMP parallel for sections can me modified to change the parallelization parameters. changing between static and dynamic affects how work items are distributed between threads, and the chunk_size affects the size of each work item. On line 69, there is a preprocessor definition NUM_THREADS: Changing this value affects the number of threads dedicated to each parallel section. The ideal number of threads will vary based on the device hardware. + +### Example of Output +``` +This example will check how many iterations of z_n+1 = z_n^2 + c a complex set will remain bounded. Pick which parallel method you would like to use. +[0] all tests +[1] serial/scalar +[2] OpenMP SIMD +[3] OpenMP Parallel +[4] OpenMP Both + > 0 + +Running all tests + +Starting serial, scalar Mandelbrot... +Calculation finished. Processing time was 198ms +Saving image as mandelbrot_serial.png + +Starting OMP SIMD Mandelbrot... +Calculation finished. Processing time was 186ms +Saving image as mandelbrot_simd.png + +Starting OMP Parallel Mandelbrot... +Calculation finished. Processing time was 33ms +Saving image as mandelbrot_parallel.png + +Starting OMP SIMD + Parallel Mandelbrot... +Calculation finished. Processing time was 31ms +Saving image as mandelbrot_simd_parallel.png +``` \ No newline at end of file diff --git a/DirectProgramming/C++/MandelbrotOMP/license.txt b/DirectProgramming/C++/MandelbrotOMP/license.txt new file mode 100644 index 0000000000..65315c87a4 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/license.txt @@ -0,0 +1,8 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/DirectProgramming/C++/MandelbrotOMP/sample.json b/DirectProgramming/C++/MandelbrotOMP/sample.json new file mode 100644 index 0000000000..ece8ab4756 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/sample.json @@ -0,0 +1,20 @@ +{ + "name": "Mandelbrot OpenMP*", + "description": "Calculates the mandelbrot set and outputs a bmp image representation using OpenMP*", + "categories": ["Toolkit/Intel® oneAPI HPC Toolkit"], + "os": ["linux", "darwin"], + "builder": ["cmake"], + "languages": [{"cpp":{}}], + "toolchain": ["icc"], + "guid": "DD113F58-4D91-41BB-B46E-6CF2C0D9F6F9", + "ciTests": { + "linux": [ + { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, + { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] } + ], + "darwin": [ + { "id": "standard", "steps": [ "make", "make run", "make clean" ] }, + { "id": "perf_num", "env": [ "export perf_num=1" ], "steps": [ "make", "make run", "make clean" ] } + ] + } +} \ No newline at end of file diff --git a/DirectProgramming/C++/MandelbrotOMP/src/main.cpp b/DirectProgramming/C++/MandelbrotOMP/src/main.cpp new file mode 100644 index 0000000000..c0dea81098 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/src/main.cpp @@ -0,0 +1,259 @@ +//============================================================== +// +// Copyright 2020 Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// =============================================================== + +// Initial conditions: rectangle (for image) = { (-2.5, -0.875), (1, 0.875) } +// height = 1024 +// width = 2048 +// max_depth = 100 +// +// Finds the mandelbrot set given initial conditions, and saves results to a png +// image. The real portion of the complex number is the x-axis, and the +// imaginary portion is the y-axis +// +// You can optionally compile with GCC and MSC, but just the linear, scalar +// version will compile and it will not have all optimizations + +#include +#include +#include + +#include +#include + +#include "mandelbrot.hpp" +#include "timer.hpp" +#define STB_IMAGE_IMPLEMENTATION +#include "../stb/stb_image.h" +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include "../stb/stb_image_write.h" + +void write_image(const char* filename, int width, int height, + unsigned char* output) { + stbi_write_png(filename, width, height, 1, output, width); +} + +int main(int argc, char* argv[]) { + double x0 = -2.5; + double y0 = -0.875; + double x1 = 1; + double y1 = 0.875; + + // Modifiable parameters: + int height = 1024; + int width = 2048; // Width should be a multiple of 8 + int max_depth = 100; + + assert(width % 8 == 0); + +#ifndef __INTEL_COMPILER + CUtilTimer timer; + printf( + "This example will check how many iterations of z_n+1 = z_n^2 + c a " + "complex set will remain bounded.\n"); +#ifdef PERF_NUM + double avg_time = 0; + for (int i = 0; i < 5; ++i) { +#endif + printf("Starting serial, scalar Mandelbrot...\n"); + timer.start(); + unsigned char* output = + serial_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + printf("Calculation finished. Processing time was %.0fms\n", + timer.get_time() * 1000.0); + printf("Saving image...\n\n"); + write_image("mandelbrot_serial.png", width, height, output); + _mm_free(output); +#ifdef PERF_NUM + avg_time += timer.get_time(); + } + printf("avg time: %.0fms\n", avg_time * 1000.0 / 5); +#endif +#else + int option = 0; +#ifndef PERF_NUM + // Checks to see if option was given at command line + if (argc > 1) { + // Prints out instructions and quits + if (argv[1][0] == 'h') { + printf( + "This example will check how many iterations of z_n+1 = z_n^2 + c a " + "complex set will remain bounded. Pick which parallel method you " + "would like to use.\n"); + printf( + "[0] all tests\n[1] serial/scalar\n[2] OpenMP SIMD\n[3] OpenMP " + "Parallel\n[4] OpenMP Both\n > "); + return 0; + } else { + option = atoi(argv[1]); + } + } + // If no options are given, prompt user to choose an option + else { + printf( + "This example will check how many iterations of z_n+1 = z_n^2 + c a " + "complex set will remain bounded. Pick which parallel method you would " + "like to use.\n"); + printf( + "[0] all tests\n[1] serial/scalar\n[2] OpenMP SIMD\n[3] OpenMP " + "Parallel\n[4] OpenMP Both\n > "); + scanf("%i", &option); + } +#endif // !PERF_NUM + + CUtilTimer timer; + double serial_time, omp_simd_time, omp_parallel_time, omp_both_time; + unsigned char* output; + switch (option) { + case 0: { +#ifdef PERF_NUM + double avg_time[4] = {0.0}; + for (int i = 0; i < 5; ++i) { +#endif + printf("\nRunning all tests\n"); + + printf("\nStarting serial, scalar Mandelbrot...\n"); + timer.start(); + output = serial_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + serial_time = timer.get_time(); + printf("Calculation finished. Processing time was %.0fms\n", + serial_time * 1000.0); + printf("Saving image as mandelbrot_serial.png\n"); + write_image("mandelbrot_serial.png", width, height, output); + _mm_free(output); + + printf("\nStarting OMP SIMD Mandelbrot...\n"); + timer.start(); + output = simd_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + omp_simd_time = timer.get_time(); + printf("Calculation finished. Processing time was %.0fms\n", + omp_simd_time * 1000.0); + printf("Saving image as mandelbrot_simd.png\n"); + write_image("mandelbrot_simd.png", width, height, output); + _mm_free(output); + + printf("\nStarting OMP Parallel Mandelbrot...\n"); + timer.start(); + output = parallel_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + omp_parallel_time = timer.get_time(); + printf("Calculation finished. Processing time was %.0fms\n", + omp_parallel_time * 1000.0); + printf("Saving image as mandelbrot_parallel.png\n"); + write_image("mandelbrot_parallel.png", width, height, output); + _mm_free(output); + + printf("\nStarting OMP SIMD + Parallel Mandelbrot...\n"); + timer.start(); + output = omp_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + omp_both_time = timer.get_time(); + printf("Calculation finished. Processing time was %.0fms\n", + omp_both_time * 1000.0); + printf("Saving image as mandelbrot_simd_parallel.png\n"); + write_image("mandelbrot_simd_parallel.png", width, height, output); + _mm_free(output); +#ifndef PERF_NUM + } +#endif +#ifdef PERF_NUM + avg_time[0] += serial_time; + avg_time[1] += omp_simd_time; + avg_time[2] += omp_parallel_time; + avg_time[3] += omp_both_time; + } + printf("\navg time (serial) : %.0fms\n", + avg_time[0] * 1000.0 / 5); + printf("avg time (simd) : %.0fms\n", + avg_time[1] * 1000.0 / 5); + printf("avg time (parallel) : %.0fms\n", + avg_time[2] * 1000.0 / 5); + printf("avg time (simd+parallel) : %.0fms\n\n", + avg_time[3] * 1000.0 / 5); + } +#endif + break; + + case 1: { + printf("\nStarting serial, scalar Mandelbrot...\n"); + timer.start(); + output = serial_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + printf("Calculation finished. Processing time was %.0fms\n", + timer.get_time() * 1000.0); + printf("Saving image as mandelbrot_serial.png\n"); + write_image("mandelbrot_serial.png", width, height, output); + _mm_free(output); + break; + } + + case 2: { + printf("\nStarting OMP SIMD Mandelbrot...\n"); + timer.start(); + output = simd_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + printf("Calculation finished. Processing time was %.0fms\n", + timer.get_time() * 1000.0); + printf("Saving image as mandelbrot_simd.png\n"); + write_image("mandelbrot_simd.png", width, height, output); + _mm_free(output); + break; + } + + case 3: { + printf("\nStarting OMP Parallel Mandelbrot...\n"); + timer.start(); + output = parallel_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + printf("Calculation finished. Processing time was %.0fms\n", + timer.get_time() * 1000.0); + printf("Saving image as mandelbrot_parallel.png\n"); + write_image("mandelbrot_parallel.png", width, height, output); + _mm_free(output); + break; + } + + case 4: { + printf("\nStarting OMP Mandelbrot...\n"); + timer.start(); + output = omp_mandelbrot(x0, y0, x1, y1, width, height, max_depth); + timer.stop(); + printf("Calculation finished. Processing time was %.0fms\n", + timer.get_time() * 1000.0); + printf("Saving image as mandelbrot_simd_parallel.png\n"); + write_image("mandelbrot_simd_parallel.png", width, height, output); + _mm_free(output); + break; + } + + default: { + printf("Please pick a valid option\n"); + break; + } +} +#endif + return 0; +} diff --git a/DirectProgramming/C++/MandelbrotOMP/src/mandelbrot.cpp b/DirectProgramming/C++/MandelbrotOMP/src/mandelbrot.cpp new file mode 100644 index 0000000000..594d72adeb --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/src/mandelbrot.cpp @@ -0,0 +1,254 @@ +//============================================================== +// +// Copyright 2020 Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// =============================================================== + +// Each of these methods calculate how deeply numbers on a complex plane remains +// in the Mandelbrot set. On top of the serial/scalar version, there is a +// cilk_for version, a pragma simd version, and a combined cilk_for/pragma simd +// version + +#include "mandelbrot.hpp" + +#include +#ifdef __INTEL_COMPILER +#include +#endif +#include +// Description: +// Determines how deeply points in the complex plane, spaced on a uniform grid, +// remain in the Mandelbrot set. The uniform grid is specified by the rectangle +// (x1, y1) - (x0, y0). Mandelbrot set is determined by remaining bounded after +// iteration of z_n+1 = z_n^2 + c, up to max_depth. +// +// Everything is done in a linear, scalar fashion +// +// [in]: x0, y0, x1, y1, width, height, max_depth +// [out]: output (caller must deallocate) +unsigned char* serial_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth) { + double xstep = (x1 - x0) / width; + double ystep = (y1 - y0) / height; + unsigned char* output = static_cast( + _mm_malloc(width * height * sizeof(unsigned char), 64)); + + // Traverse the sample space in equally spaced steps with width * height + // samples + for (int j = 0; j < height; ++j) { + for (int i = 0; i < width; ++i) { + double z_real = x0 + i * xstep; + double z_imaginary = y0 + j * ystep; + double c_real = z_real; + double c_imaginary = z_imaginary; + + // depth should be an int, but the vectorizer will not vectorize, + // complaining about mixed data types switching it to double is worth the + // small cost in performance to let the vectorizer work + double depth = 0; + // Figures out how many recurrences are required before divergence, up to + // max_depth + while (depth < max_depth) { + if (z_real * z_real + z_imaginary * z_imaginary > 4.0) { + break; // Escape from a circle of radius 2 + } + double temp_real = z_real * z_real - z_imaginary * z_imaginary; + double temp_imaginary = 2.0 * z_real * z_imaginary; + z_real = c_real + temp_real; + z_imaginary = c_imaginary + temp_imaginary; + + ++depth; + } + output[j * width + i] = static_cast( + static_cast(depth) / max_depth * 255); + } + } + return output; +} + +#ifdef __INTEL_COMPILER + +#define NUM_THREADS \ + 8 // USER: Experiment with various threadcounts for parallelization + +// Description: +// Determines how deeply points in the complex plane, spaced on a uniform grid, +// remain in the Mandelbrot set. The uniform grid is specified by the rectangle +// (x1, y1) - (x0, y0). Mandelbrot set is determined by remaining bounded after +// iteration of z_n+1 = z_n^2 + c, up to max_depth. +// +// Optimized with OpenMP's SIMD constructs. +// +// [in]: x0, y0, x1, y1, width, height, max_depth +// [out]: output (caller must deallocate) +unsigned char* simd_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth) { + double xstep = (x1 - x0) / width; + double ystep = (y1 - y0) / height; + unsigned char* output = static_cast( + _mm_malloc(width * height * sizeof(unsigned char), 64)); + + // Traverse the sample space in equally spaced steps with width * height + // samples + for (int j = 0; j < height; ++j) { +#pragma omp simd // vectorize code + for (int i = 0; i < width; ++i) { + double z_real = x0 + i * xstep; + double z_imaginary = y0 + j * ystep; + double c_real = z_real; + double c_imaginary = z_imaginary; + + // depth should be an int, but the vectorizer will not vectorize, + // complaining about mixed data types switching it to double is worth the + // small cost in performance to let the vectorizer work + double depth = 0; + // Figures out how many recurrences are required before divergence, up to + // max_depth + while (depth < max_depth) { + if (z_real * z_real + z_imaginary * z_imaginary > 4.0) { + break; // Escape from a circle of radius 2 + } + double temp_real = z_real * z_real - z_imaginary * z_imaginary; + double temp_imaginary = 2.0 * z_real * z_imaginary; + z_real = c_real + temp_real; + z_imaginary = c_imaginary + temp_imaginary; + + ++depth; + } + output[j * width + i] = static_cast( + static_cast(depth) / max_depth * 255); + } + } + return output; +} + +// Description: +// Determines how deeply points in the complex plane, spaced on a uniform grid, +// remain in the Mandelbrot set. The uniform grid is specified by the rectangle +// (x1, y1) - (x0, y0). Mandelbrot set is determined by remaining bounded after +// iteration of z_n+1 = z_n^2 + c, up to max_depth. +// +// Optimized with OpenMP's parallelization constructs. +// +// [in]: x0, y0, x1, y1, width, height, max_depth +// [out]: output (caller must deallocate) +unsigned char* parallel_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth) { + double xstep = (x1 - x0) / width; + double ystep = (y1 - y0) / height; + unsigned char* output = static_cast( + _mm_malloc(width * height * sizeof(unsigned char), 64)); + + omp_set_num_threads(NUM_THREADS); + // Traverse the sample space in equally spaced steps with width * height + // samples +#pragma omp parallel for schedule( \ + dynamic, 1) // USER: Experiment with static/dynamic partitioning + // dynamic partitioning is advantageous as the while loop for calculating + // depth makes iterations vary in terms of time. + for (int j = 0; j < height; ++j) { + for (int i = 0; i < width; ++i) { + double z_real = x0 + i * xstep; + double z_imaginary = y0 + j * ystep; + double c_real = z_real; + double c_imaginary = z_imaginary; + + // depth should be an int, but the vectorizer will not vectorize, + // complaining about mixed data types switching it to double is worth the + // small cost in performance to let the vectorizer work + double depth = 0; + // Figures out how many recurrences are required before divergence, up to + // max_depth + while (depth < max_depth) { + if (z_real * z_real + z_imaginary * z_imaginary > 4.0) { + break; // Escape from a circle of radius 2 + } + double temp_real = z_real * z_real - z_imaginary * z_imaginary; + double temp_imaginary = 2.0 * z_real * z_imaginary; + z_real = c_real + temp_real; + z_imaginary = c_imaginary + temp_imaginary; + + ++depth; + } + output[j * width + i] = static_cast( + static_cast(depth) / max_depth * 255); + } + } + return output; +} + +// Description: +// Determines how deeply points in the complex plane, spaced on a uniform grid, +// remain in the Mandelbrot set. The uniform grid is specified by the rectangle +// (x1, y1) - (x0, y0). Mandelbrot set is determined by remaining bounded after +// iteration of z_n+1 = z_n^2 + c, up to max_depth. +// +// Optimized with OpenMP's parallelization and SIMD constructs. +// +// [in]: x0, y0, x1, y1, width, height, max_depth +// [out]: output (caller must deallocate) +unsigned char* omp_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth) { + double xstep = (x1 - x0) / width; + double ystep = (y1 - y0) / height; + unsigned char* output = static_cast( + _mm_malloc(width * height * sizeof(unsigned char), 64)); + + omp_set_num_threads(NUM_THREADS); + // Traverse the sample space in equally spaced steps with width * height + // samples +#pragma omp parallel for schedule( \ + dynamic, 1) // USER: Experiment with static/dynamic partitioning + // dynamic partitioning is advantageous as the while loop for calculating + // depth makes iterations vary in terms of time. + for (int j = 0; j < height; ++j) { +#pragma omp simd // vectorize code + for (int i = 0; i < width; ++i) { + double z_real = x0 + i * xstep; + double z_imaginary = y0 + j * ystep; + double c_real = z_real; + double c_imaginary = z_imaginary; + + // depth should be an int, but the vectorizer will not vectorize, + // complaining about mixed data types switching it to double is worth the + // small cost in performance to let the vectorizer work + double depth = 0; + // Figures out how many recurrences are required before divergence, up to + // max_depth + while (depth < max_depth) { + if (z_real * z_real + z_imaginary * z_imaginary > 4.0) { + break; // Escape from a circle of radius 2 + } + double temp_real = z_real * z_real - z_imaginary * z_imaginary; + double temp_imaginary = 2.0 * z_real * z_imaginary; + z_real = c_real + temp_real; + z_imaginary = c_imaginary + temp_imaginary; + + ++depth; + } + output[j * width + i] = static_cast( + static_cast(depth) / max_depth * 255); + } + } + return output; +} + +#endif // __INTEL_COMPILER diff --git a/DirectProgramming/C++/MandelbrotOMP/src/mandelbrot.hpp b/DirectProgramming/C++/MandelbrotOMP/src/mandelbrot.hpp new file mode 100644 index 0000000000..3d67e43c60 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/src/mandelbrot.hpp @@ -0,0 +1,56 @@ +//============================================================== +// +// Copyright 2020 Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// =============================================================== + +#ifndef MANDELBROT_H +#define MANDELBROT_H + +// Checks how many iterations of the complex quadratic polynomial z_n+1 = z_n^2 +// + c keeps a set of complex numbers bounded, to a certain max depth. Mapping +// of these depths to a complex plane will result in the telltale mandelbrot set +// image Uses strictly scalar methods to calculate number of iterations (depth) +unsigned char* serial_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth); + +// Checks how many iterations of the complex quadratic polynomial z_n+1 = z_n^2 +// + c keeps a set of complex numbers bounded, to a certain max depth. Mapping +// of these depths to a complex plane will result in the telltale mandelbrot set +// image Uses OpenMP SIMD for optimization +unsigned char* simd_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth); + +// Checks how many iterations of the complex quadratic polynomial z_n+1 = z_n^2 +// + c keeps a set of complex numbers bounded, to a certain max depth. Mapping +// of these depths to a complex plane will result in the telltale mandelbrot set +// image Uses OpenMP Parallelization for optimization +unsigned char* parallel_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth); + +// Checks how many iterations of the complex quadratic polynomial z_n+1 = z_n^2 +// + c keeps a set of complex numbers bounded, to a certain max depth Mapping of +// these depths to a complex plane will result in the telltale mandelbrot set +// image Uses OpenMP SIMD + Parallelization for optimization +unsigned char* omp_mandelbrot(double x0, double y0, double x1, double y1, + int width, int height, int max_depth); + +#endif // MANDELBROT_H diff --git a/DirectProgramming/C++/MandelbrotOMP/src/timer.cpp b/DirectProgramming/C++/MandelbrotOMP/src/timer.cpp new file mode 100644 index 0000000000..d05456e143 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/src/timer.cpp @@ -0,0 +1,51 @@ +//============================================================== +// +// Copyright 2020 Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// =============================================================== + +#include "timer.hpp" + +#include + +using namespace std::chrono; + +// Description: +// Registers the current clock tick value in m_start_clock_tick, current time +// value in m_start_time Microsoft Windows* uses __rdtsc for clock ticks and +// QueryPerformanceFrequency/QueryPerformanceCounter for time Linux*/OS X* uses +// the rdtsc instruction for clock ticks and get_timeofday for time +void CUtilTimer::start() { m_start_time = high_resolution_clock::now(); } + +// Description: +// Registers the current clock tick value in m_end_clock_tick, current time +// value in m_end_time Windows uses __rdtsc for clock ticks and +// QueryPerformanceFrequency/QueryPerformanceCounter for time Linux*/OS X* uses +// the rdtsc instruction for clock ticks and get_timeofday for time +void CUtilTimer::stop() { m_end_time = high_resolution_clock::now(); } + +// Description: +// Returns the number of seconds taken between start and stop +double CUtilTimer::get_time() { + duration time_span = + duration_cast >(m_end_time - m_start_time); + return time_span.count(); +} diff --git a/DirectProgramming/C++/MandelbrotOMP/src/timer.hpp b/DirectProgramming/C++/MandelbrotOMP/src/timer.hpp new file mode 100644 index 0000000000..9387bda422 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/src/timer.hpp @@ -0,0 +1,48 @@ +//============================================================== +// +// Copyright 2020 Intel Corporation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the "Software"), +// to deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// =============================================================== + +#ifndef TIMER_H +#define TIMER_H + +#include + +using namespace std::chrono; + +class CUtilTimer { + public: + // Registers the current clock tick and time value in m_start_clock_tick and + // m_start_time + void start(); + // Registers the current clock tick and time value in m_end_clock_tick and + // m_end_time + void stop(); + // Returns the number of seconds taken between start and stop + double get_time(); + + private: + // start and end time + high_resolution_clock::time_point m_start_time, m_end_time; +}; + +#endif // TIMER_H diff --git a/DirectProgramming/C++/MandelbrotOMP/stb/LICENSE.txt b/DirectProgramming/C++/MandelbrotOMP/stb/LICENSE.txt new file mode 100644 index 0000000000..6f8ce313e1 --- /dev/null +++ b/DirectProgramming/C++/MandelbrotOMP/stb/LICENSE.txt @@ -0,0 +1,37 @@ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. +Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/stb/stb.h b/DirectProgramming/C++/MandelbrotOMP/stb/stb.h similarity index 100% rename from DirectProgramming/DPC++/CombinationalLogic/sepia-filter/stb/stb.h rename to DirectProgramming/C++/MandelbrotOMP/stb/stb.h diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/stb/stb_image.h b/DirectProgramming/C++/MandelbrotOMP/stb/stb_image.h similarity index 100% rename from DirectProgramming/DPC++/CombinationalLogic/sepia-filter/stb/stb_image.h rename to DirectProgramming/C++/MandelbrotOMP/stb/stb_image.h diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/stb/stb_image_write.h b/DirectProgramming/C++/MandelbrotOMP/stb/stb_image_write.h similarity index 100% rename from DirectProgramming/DPC++/CombinationalLogic/sepia-filter/stb/stb_image_write.h rename to DirectProgramming/C++/MandelbrotOMP/stb/stb_image_write.h diff --git a/DirectProgramming/C++/MergesortOMP/README.md b/DirectProgramming/C++/MergesortOMP/README.md index c477593092..ce51161a1a 100644 --- a/DirectProgramming/C++/MergesortOMP/README.md +++ b/DirectProgramming/C++/MergesortOMP/README.md @@ -29,7 +29,7 @@ This code sample demonstrates how to convert a scalar implementation of merge so ## Key Implementation Details -Write some stuff here +The OpenMP* version of the merge sort implementation uses #pragma omp task in its recursive calls, which allows the recursive calls to be handled by different threads. The #pragma omp taskawait preceeding the function call to merge() ensures the two recursive calls are completed before merge() is executed. Through this use of OpenMP* pragmas, the recursive sorting algorithm can effectively run in parallel, where each recursion is a unique task able to be performed by any available thread. ## License diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/CMakeLists.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/CMakeLists.txt new file mode 100644 index 0000000000..d92953a3a1 --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/CMakeLists.txt @@ -0,0 +1,13 @@ +cmake_minimum_required (VERSION 3.0) + +set(CMAKE_CXX_COMPILER dpcpp) + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() +project (mandelbrot) +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt new file mode 100644 index 0000000000..8f608e972a --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/License.txt @@ -0,0 +1,8 @@ +Copyright 2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md new file mode 100644 index 0000000000..1026556c03 --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/README.md @@ -0,0 +1,96 @@ +# `Mandelbrot` Sample + +Mandelbrot is an infinitely complex fractal patterning that is derived from a simple formula. It demonstrates using DPC++ for offloading computations to a GPU (or other devices) and shows how processing time can be optimized and improved with parallelism. + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux* Ubuntu* 18.04; Windows 10 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler beta; +| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Time to complete | 15 minutes + +## Purpose +Mandelbrot is a DPC++ application that generates a fractal image by initializing a matrix of 512 x 512, where the computation at each point (pixel) is entirely independent of the computation at other points. The sample includes both parallel and serial calculation of the set, allowing for a direct comparison of results. The parallel implementation can demonstrate the use of Unified Shared Memory (USM) or buffers. You can modify parameters such as the number of rows, columns, and iterations to evaluate the difference in performance and load between USM and buffers. This is further described at the end of this document in the "Running the Sample" section. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a compatible GPU is not detected. The device used for compilation is displayed in the output along with elapsed time to render the mandelbrot image. This is helpful for comparing different offload implementations based on complexity of the computation. + +## Key Implementation Details +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command groups. + +## License +This code sample is licensed under MIT license. + +## Building the `Mandelbrot` Program for CPU and GPU + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: +1. Build the program using the following `cmake` commands. +``` +$ mkdir build +$ cd build +$ cmake .. +$ make +``` + +> Note: by default, exectables are created for both USM and buffers. You can build individually with the following: +> Create buffers executable: make mandelbrot +> Create USM executable: make mandelbrot_usm + +2. Run the program (default uses buffers): + ``` + make run + ``` +> Note: for USM use `make run_usm` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System Using Visual Studio* Version 2017 or Newer +* Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + +>If you see the following error message when compiling this sample: +> +``` +Error 'dpc_common.hpp' file not found +``` +>You need to add the following directory to the list of include folders, that are required by your project, in your project's Visual Studio project property panel. The missing include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +* Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" + Run - MSBuild mandelbrot.sln /t:Rebuild /p:Configuration="Release" + + +## Running the Sample +### Application Parameters +You can modify the Mandelbrot parameters from within mandel.hpp. The configurable parameters include: + row_size = + col_size = + max_iterations = + repetitions = +The default row and column size is 512. Max interatins and repetions are both 100. By adjusting the parameters, you can observe how the performance varies using the different offload techniques. Note that if the values drop below 128 for row and column, the output is limted to just text in the ouput window. + +### Example of Output +``` +Platform Name: Intel(R) OpenCL HD Graphics + Platform Version: OpenCL 2.1 + Device Name: Intel(R) Gen9 HD Graphics NEO + Max Work Group: 256 + Max Compute Units: 24 + +Parallel Mandelbrot set using buffers. +Rendered image output to file: mandelbrot.png (output too large to display in text) + Serial time: 0.0430331s + Parallel time: 0.00224131s +Successfully computed Mandelbrot set. +``` \ No newline at end of file diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.sln b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.sln new file mode 100644 index 0000000000..2b8298d2f0 --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.852 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "mandelbrot", "mandelbrot.vcxproj", "{C2DF6F42-60A8-4FFD-AE40-F37E9EF9875E}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {C2DF6F42-60A8-4FFD-AE40-F37E9EF9875E}.Debug|x64.ActiveCfg = Debug|x64 + {C2DF6F42-60A8-4FFD-AE40-F37E9EF9875E}.Debug|x64.Build.0 = Debug|x64 + {C2DF6F42-60A8-4FFD-AE40-F37E9EF9875E}.Release|x64.ActiveCfg = Release|x64 + {C2DF6F42-60A8-4FFD-AE40-F37E9EF9875E}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {075ECF2A-BDB7-4151-826F-CAC1B35A18F2} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj new file mode 100644 index 0000000000..50c7200cd7 --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj @@ -0,0 +1,161 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + + + 15.0 + {c2df6f42-60a8-4ffd-ae40-f37e9ef9875e} + Win32Proj + mandelbrot + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + oneAPI Data Parallel C++ Compiler + Unicode + + + Application + false + oneAPI Data Parallel C++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + Level3 + + + Console + true + $(ONEAPI_ROOT)\compiler\latest\windows\bin\libsycl-complex.o + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + Level3 + + + Console + true + true + true + $(ONEAPI_ROOT)\compiler\latest\windows\bin\libsycl-complex.o + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj.filters b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj.filters new file mode 100644 index 0000000000..dcc683dbce --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj.filters @@ -0,0 +1,36 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Header Files + + + Header Files + + + + + + + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj.user b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj.user new file mode 100644 index 0000000000..2354252aa4 --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/mandelbrot.vcxproj.user @@ -0,0 +1,10 @@ + + + + $(LocalDebuggerEnvironment) + WindowsLocalDebugger + + + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/sample.json b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/sample.json new file mode 100644 index 0000000000..ec3467d005 --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/sample.json @@ -0,0 +1,30 @@ +{ + "guid": "8572B85D-0B32-40B1-8112-538F480C8660", + "name": "Mandelbrot", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "The Mandelbrot set - a fractal example in mathematics", + "toolchain": [ "dpcpp" ], + "languages": [ { "cpp": {} } ], + "targetDevice": [ "CPU", "GPU" ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild mandelbrot.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "mandelbrot.exe" + ] + }] + + } +} diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt new file mode 100644 index 0000000000..9cd8f8f64d --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/CMakeLists.txt @@ -0,0 +1,11 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -std=c++17") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + +add_executable(mandelbrot main.cpp) +target_link_libraries(mandelbrot OpenCL sycl $ENV{ONEAPI_ROOT}/compiler/latest/linux/lib/libsycl-complex.o) +add_custom_target(run ${CMAKE_COMMAND} -E env SYCL_BE=PI_OPENCL ./mandelbrot) + +add_executable(mandelbrot_usm main.cpp) +target_compile_definitions(mandelbrot_usm PRIVATE MANDELBROT_USM) +target_link_libraries(mandelbrot_usm OpenCL sycl $ENV{ONEAPI_ROOT}/compiler/latest/linux/lib/libsycl-complex.o) +add_custom_target(run_usm ${CMAKE_COMMAND} -E env SYCL_BE=PI_OPENCL ./mandelbrot_usm) diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/main.cpp b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/main.cpp new file mode 100644 index 0000000000..cc8a9514ef --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/main.cpp @@ -0,0 +1,91 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include +#include + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp +#include "dpc_common.hpp" +#include "mandel.hpp" + +using namespace std; +using namespace sycl; + +void ShowDevice(queue &q) { + // Output platform and device information. + auto device = q.get_device(); + auto p_name = device.get_platform().get_info(); + cout << std::setw(20) << "Platform Name: " << p_name << "\n"; + auto p_version = device.get_platform().get_info(); + cout << std::setw(20) << "Platform Version: " << p_version << "\n"; + auto d_name = device.get_info(); + cout << std::setw(20) << "Device Name: " << d_name << "\n"; + auto max_work_group = device.get_info(); + cout << std::setw(20) << "Max Work Group: " << max_work_group << "\n"; + auto max_compute_units = device.get_info(); + cout << std::setw(20) << "Max Compute Units: " << max_compute_units << "\n\n"; +} + +void Execute(queue &q) { + // Demonstrate the Mandelbrot calculation serial and parallel. +#ifdef MANDELBROT_USM + cout << "Parallel Mandelbrot set using USM.\n"; + MandelParallelUsm m_par(row_size, col_size, max_iterations, &q); +#else + cout << "Parallel Mandelbrot set using buffers.\n"; + MandelParallel m_par(row_size, col_size, max_iterations); +#endif + + MandelSerial m_ser(row_size, col_size, max_iterations); + + // Run the code once to trigger JIT. + m_par.Evaluate(q); + + // Run the parallel version and time it. + dpc_common::TimeInterval t_par; + for (int i = 0; i < repetitions; ++i) m_par.Evaluate(q); + double parallel_time = t_par.Elapsed(); + + // Print the results. + m_par.Print(); + m_par.WriteImage(); + + // Run the serial version. + dpc_common::TimeInterval t_ser; + m_ser.Evaluate(); + double serial_time = t_ser.Elapsed(); + + // Report the results. + cout << std::setw(20) << "Serial time: " << serial_time << "s\n"; + cout << std::setw(20) << "Parallel time: " << (parallel_time / repetitions) + << "s\n"; + + // Validate. + m_par.Verify(m_ser); +} + +int main(int argc, char *argv[]) { + try { + // Create a queue on the default device. Set SYCL_DEVICE_TYPE environment + // variable to (CPU|GPU|FPGA|HOST) to change the device. + queue q(default_selector{}, dpc_common::exception_handler); + + // Display the device info. + ShowDevice(q); + + // Compute Mandelbrot set. + Execute(q); + } catch (...) { + // Some other exception detected. + cout << "Failed to compute Mandelbrot set.\n"; + std::terminate(); + } + + cout << "Successfully computed Mandelbrot set.\n"; + return 0; +} diff --git a/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp new file mode 100644 index 0000000000..991478032c --- /dev/null +++ b/DirectProgramming/DPC++/CombinationalLogic/mandelbrot/src/mandel.hpp @@ -0,0 +1,286 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#pragma once + +#include +#include +#include +#include + +// stb/*.h files can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/stb/*.h +#define STB_IMAGE_IMPLEMENTATION +#include "stb/stb_image.h" +#define STB_IMAGE_WRITE_IMPLEMENTATION +#include "stb/stb_image_write.h" + +using namespace std; +using namespace sycl; + +constexpr int row_size = 512; +constexpr int col_size = 512; +constexpr int max_iterations = 100; +constexpr int repetitions = 100; + +// Parameters used in Mandelbrot including number of row, column, and iteration. +struct MandelParameters { + int row_count_; + int col_count_; + int max_iterations_; + + typedef std::complex ComplexF; + + MandelParameters(int row_count, int col_count, int max_iterations) + : row_count_(row_count), + col_count_(col_count), + max_iterations_(max_iterations) {} + + int row_count() const { return row_count_; } + int col_count() const { return col_count_; } + int max_iterations() const { return max_iterations_; } + + // Scale from 0..row_count to -1.5..0.5 + float ScaleRow(int i) const { return -1.5f + (i * (2.0f / row_count_)); } + + // Scale from 0..col_count to -1..1 + float ScaleCol(int i) const { return -1.0f + (i * (2.0f / col_count_)); } + + // Mandelbrot set are points that do not diverge within max_iterations. + int Point(const ComplexF &c) const { + int count = 0; + ComplexF z = 0; + + for (int i = 0; i < max_iterations_; ++i) { + auto r = z.real(); + auto im = z.imag(); + + // Leave loop if diverging. + if (((r * r) + (im * im)) >= 4.0f) { + break; + } + + z = z * z + c; + count++; + } + + return count; + } +}; + +// Shared functions for computing Mandelbrot set. +class Mandel { + private: + MandelParameters p_; + + protected: + int *data_; + + public: + Mandel(int row_count, int col_count, int max_iterations) + : p_(row_count, col_count, max_iterations) { + data_ = nullptr; + } + + virtual ~Mandel() {} + virtual void Alloc() { data_ = new int[p_.row_count() * p_.col_count()]; } + virtual void Free() { delete[] data_; } + + MandelParameters GetParameters() const { return p_; } + + void WriteImage() { + constexpr int channel_num{3}; + int row_count = p_.row_count(); + int col_count = p_.col_count(); + + uint8_t *pixels = new uint8_t[col_count * row_count * channel_num]; + + int index = 0; + + for (int j = 0; j < row_count; ++j) { + for (int i = 0; i < col_count; ++i) { + float normalized = (1.0 * data_[i * col_count + j]) / max_iterations; + int color = int(normalized * 0xFFFFFF); // 16M color. + + int r = (color >> 16) & 0xFF; + int g = (color >> 8) & 0xFF; + int b = color & 0xFF; + + pixels[index++] = r; + pixels[index++] = g; + pixels[index++] = b; + } + } + + stbi_write_png("mandelbrot.png", row_count, col_count, channel_num, pixels, + col_count * channel_num); + + delete[] pixels; + } + + // Use only for debugging with small dimensions. + void Print() { + if (p_.row_count() > 128 || p_.col_count() > 128) { + cout << " Rendered image output to file: mandelbrot.png " + "(output too large to display in text)\n"; + + return; + } + + for (int i = 0; i < p_.row_count(); ++i) { + for (int j = 0; j < p_.col_count_; ++j) { + cout << std::setw(1) + << ((GetValue(i, j) >= p_.max_iterations()) ? "x" : " "); + } + + cout << "\n"; + } + } + + // Accessor for data and count values. + int *data() const { return data_; } + + // Accessor to read a value from the mandelbrot data matrix. + int GetValue(int i, int j) const { return data_[i * p_.col_count_ + j]; } + + // Mutator to store a value into the mandelbrot data matrix. + void SetValue(int i, int j, float v) { data_[i * p_.col_count_ + j] = v; } + + // Validate the results match. + void Verify(Mandel &m) { + if ((m.p_.row_count() != p_.row_count_) || + (m.p_.col_count() != p_.col_count_)) { + cout << "Fail verification - matrix size is different\n"; + throw std::runtime_error("Verification failure"); + } + + int diff = 0; + + for (int i = 0; i < p_.row_count(); ++i) { + for (int j = 0; j < p_.col_count(); ++j) { + if (m.GetValue(i, j) != GetValue(i, j)) diff++; + } + } + + double tolerance = 0.05; + double ratio = (double)diff / (double)(p_.row_count() * p_.col_count()); + +#if _DEBUG + cout << "diff: " << diff << "\n"; + cout << "total count: " << p_.row_count() * p_.col_count() << "\n"; +#endif + + if (ratio > tolerance) { + cout << "Fail verification - diff larger than tolerance\n"; + throw std::runtime_error("Verification failure"); + } + +#if _DEBUG + cout << "Pass verification\n"; +#endif + } +}; + +// Serial implementation for computing Mandelbrot set. +class MandelSerial : public Mandel { + public: + MandelSerial(int row_count, int col_count, int max_iterations) + : Mandel(row_count, col_count, max_iterations) { + Alloc(); + } + + ~MandelSerial() { Free(); } + + void Evaluate() { + // Iterate over image and compute mandel for each point. + MandelParameters p = GetParameters(); + + for (int i = 0; i < p.row_count(); ++i) { + for (int j = 0; j < p.col_count(); ++j) { + auto c = MandelParameters::ComplexF(p.ScaleRow(i), p.ScaleCol(j)); + SetValue(i, j, p.Point(c)); + } + } + } +}; + +// Parallel implementation for computing Mandelbrot set using buffers. +class MandelParallel : public Mandel { + public: + MandelParallel(int row_count, int col_count, int max_iterations) + : Mandel(row_count, col_count, max_iterations) { + Alloc(); + } + + ~MandelParallel() { Free(); } + + void Evaluate(queue &q) { + // Iterate over image and check if each point is in Mandelbrot set. + MandelParameters p = GetParameters(); + + const int rows = p.row_count(); + const int cols = p.col_count(); + + buffer data_buf(data(), range(rows, cols)); + + // We submit a command group to the queue. + q.submit([&](handler &h) { + // Get access to the buffer. + auto b = data_buf.get_access(h); + + // Iterate over image and compute mandel for each point. + h.parallel_for(range(rows, cols), [=](id<2> index) { + int i = int(index[0]); + int j = int(index[1]); + auto c = MandelParameters::ComplexF(p.ScaleRow(i), p.ScaleCol(j)); + b[index] = p.Point(c); + }); + }); + } +}; + +// Parallel implementation for computing Mandelbrot set using Unified Shared +// Memory (USM). +class MandelParallelUsm : public Mandel { + private: + queue *q; + + public: + MandelParallelUsm(int row_count, int col_count, int max_iterations, queue *q) + : Mandel(row_count, col_count, max_iterations) { + this->q = q; + Alloc(); + } + + ~MandelParallelUsm() { Free(); } + + virtual void Alloc() { + MandelParameters p = GetParameters(); + data_ = malloc_shared(p.row_count() * p.col_count(), *q); + } + + virtual void Free() { free(data_, *q); } + + void Evaluate(queue &q) { + // Iterate over image and check if each point is in Mandelbrot set. + MandelParameters p = GetParameters(); + + const int rows = p.row_count(); + const int cols = p.col_count(); + auto ldata = data_; + + // Iterate over image and compute mandel for each point. + auto e = q.parallel_for(range(rows * cols), [=](id<1> index) { + int i = index / cols; + int j = index % cols; + auto c = MandelParameters::ComplexF(p.ScaleRow(i), p.ScaleCol(j)); + ldata[index] = p.Point(c); + }); + + // Wait for the asynchronous computation on device to complete. + e.wait(); + } +}; diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md index 9e000a2900..04af6b6292 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md +++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/README.md @@ -52,7 +52,7 @@ Perform the following steps: ### On a Windows* System Using Visual Studio* version 2017 or Newer * Build the program using VS2017 or VS2019: Right click on the solution file and open using either VS2017 or VS2019 IDE. Right click on the project in Solution explorer and select Rebuild. From top menu select Debug -> Start without Debugging. -* Build the program using MSBuild: Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019". Run - MSBuild mandelbrot.sln /t:Rebuild /p:Configuration="Release" +* Build the program using MSBuild: Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019". Run - MSBuild sepia-filter.sln /t:Rebuild /p:Configuration="Release" ## Running the sample diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sepia-filter.vcxproj b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sepia-filter.vcxproj index 41bac3812e..37b575d4d5 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sepia-filter.vcxproj +++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/sepia-filter.vcxproj @@ -15,7 +15,7 @@ {d2dbaafa-24e1-4ec9-ae71-1ef6a3bd67fc} Win32Proj sepia_filter - 10.0.17763.0 + $(WindowsSDKVersion.Replace("\","")) diff --git a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/src/sepia_dpcpp.cpp b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/src/sepia_dpcpp.cpp index 50c20aab45..5bda90c457 100644 --- a/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/src/sepia_dpcpp.cpp +++ b/DirectProgramming/DPC++/CombinationalLogic/sepia-filter/src/sepia_dpcpp.cpp @@ -8,12 +8,17 @@ #include #include "CL/sycl.hpp" #include "device_selector.hpp" + +// dpc_common.hpp can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/dpc_common.hpp #include "dpc_common.hpp" +// stb/*.h files can be found in the dev-utilities include folder. +// e.g., $ONEAPI_ROOT/dev-utilities//include/stb/*.h #define STB_IMAGE_IMPLEMENTATION -#include "../stb/stb_image.h" +#include "stb/stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION -#include "../stb/stb_image_write.h" +#include "stb/stb_image_write.h" using namespace std; using namespace sycl; diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile index 3f63267f90..b758899000 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile @@ -19,7 +19,7 @@ run: ./$(USM_EXE_NAME) run_buffers: - ./$(BBUFFER_EXE_NAME) + ./$(BUFFER_EXE_NAME) clean: rm -rf $(USM_EXE_NAME) $(BUFFER_EXE_NAME) \ No newline at end of file diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.fpga b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.fpga index 6c4005dd18..8a37c9710e 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.fpga +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.fpga @@ -1,60 +1,67 @@ CXX := dpcpp CXXFLAGS = -O2 -g -std=c++17 -SRC := src/simple-add-usm.cpp -BUFFERS_SRC := src/simple-add-buffers.cpp +USM_SRC := src/simple-add-usm.cpp +SRC := src/simple-add-buffers.cpp -.PHONY: fpga_emu run_emu clean +.PHONY: hw hw_usm fpga_emu fpga_emu_usm run_emu run_emu_usm clean -fpga_emu_buffers: simple-add-buffers.fpga_emu_buffers -fpga_emu: simple-add-usm.fpga_emu +fpga_emu: simple-add-buffers.fpga_emu +fpga_emu_usm: simple-add-usm.fpga_emu_usm -hw_buffers: simple-add-buffers.fpga -hw: simple-add-usm.fpga +hw: simple-add-buffers.fpga +hw_usm: simple-add-usm.fpga -report_buffers: simple-add-buffers_report.a_buffers -report: simple-add-usm_report.a +report: simple-add-buffers_report.a_buffers +report_usm: simple-add-usm_report.a_usm -simple-add-buffers.fpga_emu_buffers: $(BUFFERS_SRC) +simple-add-buffers.fpga_emu: $(SRC) $(CXX) $(CXXFLAGS) -fintelfpga $^ -o $@ -DFPGA_EMULATOR=1 -simple-add-usm.fpga_emu: $(SRC) - $(CXX) $(CXXFLAGS) -fintelfpga $^ -o $@ -DFPGA_EMULATOR=1 +simple-add-usm.fpga_emu_usm: $(USM_SRC) + @#$(CXX) $(CXXFLAGS) -fintelfpga $^ -o $@ -DFPGA_EMULATOR=1 + @echo USM is not supported for FPGAs, yet -a_buffers.o: $(BUFFERS_SRC) - $(CXX) $(CXXFLAGS) -fintelfpga -c $^ -o $@ -DFPGA=1 -a.o: $(SRC) +a_buffers.o: $(SRC) $(CXX) $(CXXFLAGS) -fintelfpga -c $^ -o $@ -DFPGA=1 +a_usm.o: $(USM_SRC) + @#$(CXX) $(CXXFLAGS) -fintelfpga -c $^ -o $@ -DFPGA=1 + @echo USM is not supported for FPGAs, yet simple-add-buffers.fpga: a_buffers.o $(CXX) $(CXXFLAGS) -fintelfpga $^ -o $@ -Xshardware -simple-add-usm.fpga: a.o - $(CXX) $(CXXFLAGS) -fintelfpga $^ -o $@ -Xshardware +simple-add-usm.fpga: a_usm.o + @#$(CXX) $(CXXFLAGS) -fintelfpga $^ -o $@ -Xshardware + @echo USM is not supported for FPGAs, yet -run_emu_buffers: simple-add-buffers.fpga_emu_buffers - ./simple-add-buffers.fpga_emu_buffers -run_emu: simple-add-usm.fpga_emu - ./simple-add-usm.fpga_emu +run_emu: simple-add-buffers.fpga_emu + ./simple-add-buffers.fpga_emu +run_emu_usm: simple-add-usm.fpga_emu_usm + @#./simple-add-usm.fpga_emu_usm + @echo USM is not supported for FPGAs, yet -run_hw_buffers: simple-add-buffers.fpga +run_hw: simple-add-buffers.fpga ./simple-add-buffers.fpga -run_hw: simple-add-usm.fpga - ./simple-add-usm.fpga +run_hw_usm: simple-add-usm.fpga + @#./simple-add-usm.fpga + @echo USM is not supported for FPGAs, yet -dev_buffers.o: $(BUFFERS_SRC) - $(CXX) $(CXXFLAGS) -fintelfpga -c $^ -o $@ -DFPGA=1 -dev.o: $(SRC) +dev_buffers.o: $(SRC) $(CXX) $(CXXFLAGS) -fintelfpga -c $^ -o $@ -DFPGA=1 +dev_usm.o: $(USM_SRC) + @#$(CXX) $(CXXFLAGS) -fintelfpga -c $^ -o $@ -DFPGA=1 + @echo USM is not supported for FPGAs, yet simple-add-buffers_report.a_buffers: dev_buffers.o $(CXX) $(CXXFLAGS) -fintelfpga -fsycl-link $^ -o $@ -Xshardware -simple-add-usm_report.a: dev.o - $(CXX) $(CXXFLAGS) -fintelfpga -fsycl-link $^ -o $@ -Xshardware +simple-add-usm_report.a_usm: dev_usm.o + @#$(CXX) $(CXXFLAGS) -fintelfpga -fsycl-link $^ -o $@ -Xshardware + @echo USM is not supported for FPGAs, yet clean: - rm -rf *.o *.d *.out *.mon *.emu *.aocr *.aoco *.prj *.fpga_emu *.fpga_emu_buffers simple-add-buffers.fpga simple-add-usm.fpga *.a \ No newline at end of file + rm -rf *.o *.d *.out *.mon *.emu *.aocr *.aoco *.prj *.fpga_emu *.fpga_emu_usm simple-add-buffers.fpga simple-add-usm.fpga *.a diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win index 06c741b905..d59ec56431 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win @@ -1,15 +1,25 @@ -CXX = dpcpp-cl -CXXFLAGS = -O2 -EHsc -Zi -EXE_NAME = simple-add-usm.exe -SOURCES = src/simple-add-usm.cpp - -all: main - -main: - $(CXX) $(CXXFLAGS) -o $(EXE_NAME) $(SOURCES) - -run: - $(EXE_NAME) - -clean: - del -rf $(EXE_NAME) +CXX = dpcpp-cl +CXXFLAGS = -O2 -EHsc -Zi + +EXE_NAME = simple-add-usm.exe +SOURCES = src/simple-add-usm.cpp + +BUFFERS_EXE_NAME = simple-add-buffers.exe +BUFFERS_SOURCES = src/simple-add-buffers.cpp + +all: main + +main: + $(CXX) $(CXXFLAGS) -o $(EXE_NAME) $(SOURCES) + +build_buffers: + $(CXX) $(CXXFLAGS) -o $(BUFFERS_EXE_NAME) $(BUFFERS_SOURCES) + +run: + $(EXE_NAME) + +run_buffers: + $(BUFFERS_EXE_NAME) + +clean: + del -rf $(EXE_NAME) $(BUFFERS_EXE_NAME) diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win.fpga b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win.fpga index 6a6478f83f..c77f1c19a3 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win.fpga +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/Makefile.win.fpga @@ -1,15 +1,26 @@ CXX = dpcpp-cl CXXFLAGS = -O2 -EHsc -Zi -SOURCES = src/simple-add-usm.cpp -EXE_NAME_FPGA_EMU = simple-add-usm.fpga_emu.exe +LDFLAGS = + +SRC = src/simple-add-buffers.cpp +USM_SRC = src/simple-add-usm.cpp + +EXE = simple-add-buffers.fpga_emu.exe +USM_EXE = simple-add-usm.fpga_emu_usm.exe all: fpga_emu -fpga_emu: - $(CXX) $(CXXFLAGS) -fintelfpga $(SOURCES) -o $(EXE_NAME_FPGA_EMU) -DFPGA_EMULATOR=1 +fpga_emu: $(SRC) + $(CXX) $(CXXFLAGS) -fintelfpga $(SRC) -o $(EXE) -DFPGA_EMULATOR=1 + +fpga_emu_usm: $(SRC) + @echo USM is not supported for FPGAs, yet run: - $(EXE_NAME_FPGA_EMU) + $(EXE) + +run_usm: + @echo USM is not supported for FPGAs, yet clean: - del -rf $(EXE_NAME_FPGA_EMU) + del /f *.o *.d *.out *.mon *.emu *.aocr *.aoco *.prj *.fpga_emu *.fpga_emu_usm *.a $(EXE) $(USM_EXE) diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md index 94cc2dd722..662c9df298 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/README.md @@ -27,6 +27,9 @@ This code sample is licensed under MIT license. ## Building the `simple add DPC++` Program for CPU and GPU +## Include Files +The include folder is located at "%ONEAPI_ROOT%\dev-utilities\latest\include" on your development system. + ### On a Linux* System Perform the following steps: 1. Build the `simple-add-dpc++` program using the following make commands (default uses USM): @@ -123,15 +126,7 @@ Perform the following steps: 2. Select the menu sequence **File** > **Open** > **Project/Solution**. 3. Locate the `simple-add` folder. 4. Select the `simple-add.sln` file. -5. Select the configuration 'Debug-fpga' or 'Release-fpga' that have the necessary project settings already below: - - Under the 'Project Property' dialog: - - a. Select the **DPC++** tab. - b. In the **General** subtab, the **Perform ahead of time compilation for the FPGA** setting is set to **Yes**. - c. In the **Preprocessor** subtab, the **Preprocessor Definitions" setting has **FPGA_EMULATOR** added. - d. Close the dialog. - +5. Select the configuration 'Debug-fpga' 6. Select **Project** > **Build** menu option to build the selected configuration. 7. Select **Debug** > **Start Without Debugging** menu option to run the program. diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json index 74cadb2120..cb7d58bb6a 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/sample.json @@ -1,4 +1,5 @@ { + "guid" : "49C65CB6-F9FA-4E3C-B8BE-4A141E4E0F07", "name": "Simple Add", "categories": ["Toolkit/Get Started", "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU, GPU and FPGA"], "description": "Simple program that adds two large vectors in parallel. Provides a ‘Hello World!’ like sample to ensure your environment is setup correctly using Data Parallel C++.", @@ -12,30 +13,31 @@ { "id": "cpu_usm", "steps": [ + "make clean", "make all", - "make run", - "make clean" + "make run" ] }, { "id": "cpu_buffers", "steps": [ + "make clean", "make build_buffers", - "make run_buffers", - "make clean" + "make run_buffers" ] }, { "id": "fpga_emu_buffers", "steps": [ + "make clean -f Makefile.fpga", "make fpga_emu -f Makefile.fpga", - "make run_emu -f Makefile.fpga", - "make clean -f Makefile.fpga" + "make run_emu -f Makefile.fpga" ] }, { "id": "fpga_report_buffers", "steps": [ + "make clean -f Makefile.fpga", "make report -f Makefile.fpga" ] } diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-buffers.vcxproj b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-buffers.vcxproj new file mode 100644 index 0000000000..0b65928e13 --- /dev/null +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-buffers.vcxproj @@ -0,0 +1,196 @@ + + + + + Debug-fpga + x64 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {5d244b6f-a460-4179-aa5a-4f3bdff79f6b} + Win32Proj + simple_add_buffers + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + true + + + true + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + + + Console + true + + + + + + + + + false + %(PreprocessorDefinitions) + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + + + + true + FPGA_EMULATOR=1;%(PreprocessorDefinitions) + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + false + %(PreprocessorDefinitions) + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + + + + diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-usm.vcxproj b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-usm.vcxproj index ce173eb881..4514fffe00 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-usm.vcxproj +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add-usm.vcxproj @@ -1,140 +1,196 @@ - - - - - Debug - x64 - - - Release - x64 - - - - 15.0 - {1c022788-7017-4279-8c45-fae9940cc775} - Win32Proj - simple_add_usm - $(WindowsSDKVersion) - - - - Application - true - Intel(R) oneAPI DPC++ Compiler - Unicode - - - Application - false - Intel(R) oneAPI DPC++ Compiler - true - Unicode - - - Application - true - Intel(R) oneAPI DPC++ Compiler - Unicode - - - Application - false - Intel(R) oneAPI DPC++ Compiler - true - Unicode - - - - - - - - - - - - - - - - - - - - - true - - - true - - - false - - - false - - - - - - - - - - Console - true - - - - - - - - - /std:c++17 - - - Console - true - - - - - - - - - - - Console - true - true - true - - - - - - - - - /std:c++17 - - - Console - true - true - true - - - - - - - - - - - - + + + + + Debug-fpga + x64 + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {1c022788-7017-4279-8c45-fae9940cc775} + Win32Proj + simple_add_usm + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + true + + + true + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + + + Console + true + + + + + + + + + + + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + + + + /std:c++17 + true + FPGA_EMULATOR=1;%(PreprocessorDefinitions) + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + /std:c++17 + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add.sln b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add.sln index 0a2063cb4e..4dd5579876 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add.sln +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/simple-add.sln @@ -5,28 +5,27 @@ VisualStudioVersion = 15.0.28307.1145 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple-add-usm", "simple-add-usm.vcxproj", "{1C022788-7017-4279-8C45-FAE9940CC775}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "simple-add-buffers", "simple-add-buffers.vcxproj", "{5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 Debug-fpga|x64 = Debug-fpga|x64 - Debug-fpga|x86 = Debug-fpga|x86 Release|x64 = Release|x64 - Release|x86 = Release|x86 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {1C022788-7017-4279-8C45-FAE9940CC775}.Debug|x64.ActiveCfg = Debug|x64 {1C022788-7017-4279-8C45-FAE9940CC775}.Debug|x64.Build.0 = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Debug|x86.ActiveCfg = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Debug|x86.Build.0 = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Debug-fpga|x64.ActiveCfg = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Debug-fpga|x64.Build.0 = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Debug-fpga|x86.ActiveCfg = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Debug-fpga|x86.Build.0 = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Release|x64.ActiveCfg = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Release|x64.Build.0 = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Release|x86.ActiveCfg = Debug|x64 - {1C022788-7017-4279-8C45-FAE9940CC775}.Release|x86.Build.0 = Debug|x64 + {1C022788-7017-4279-8C45-FAE9940CC775}.Debug-fpga|x64.ActiveCfg = Debug-fpga|x64 + {1C022788-7017-4279-8C45-FAE9940CC775}.Debug-fpga|x64.Build.0 = Debug-fpga|x64 + {1C022788-7017-4279-8C45-FAE9940CC775}.Release|x64.ActiveCfg = Release|x64 + {1C022788-7017-4279-8C45-FAE9940CC775}.Release|x64.Build.0 = Release|x64 + {5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}.Debug|x64.ActiveCfg = Debug|x64 + {5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}.Debug|x64.Build.0 = Debug|x64 + {5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}.Debug-fpga|x64.ActiveCfg = Debug-fpga|x64 + {5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}.Debug-fpga|x64.Build.0 = Debug-fpga|x64 + {5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}.Release|x64.ActiveCfg = Release|x64 + {5D244B6F-A460-4179-AA5A-4F3BDFF79F6B}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/dpc_common.hpp b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/dpc_common.hpp deleted file mode 100644 index d9f9194e6a..0000000000 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/dpc_common.hpp +++ /dev/null @@ -1,63 +0,0 @@ -//============================================================== -// Copyright © 2020 Intel Corporation -// -// SPDX-License-Identifier: MIT -// ============================================================= - -#ifndef _DP_HPP -#define _DP_HPP - -#pragma once - -#include -#include - -#include - -namespace dpc { -// this exception handler with catch async exceptions -static auto exception_handler = [](cl::sycl::exception_list eList) { - for (std::exception_ptr const &e : eList) { - try { - std::rethrow_exception(e); - } catch (std::exception const &e) { -#if _DEBUG - std::cout << "Failure" << std::endl; -#endif - std::terminate(); - } - } -}; - -class queue : public cl::sycl::queue { - // Enable profiling by default - cl::sycl::property_list prop_list = - cl::sycl::property_list{cl::sycl::property::queue::enable_profiling()}; - - public: - queue() - : cl::sycl::queue(cl::sycl::default_selector{}, exception_handler, prop_list) {} - queue(cl::sycl::device_selector &d) - : cl::sycl::queue(d, exception_handler, prop_list) {} - queue(cl::sycl::device_selector &d, cl::sycl::property_list &p) - : cl::sycl::queue(d, exception_handler, p) {} -}; - -using Duration = std::chrono::duration; - -class Timer { - public: - Timer() : start(std::chrono::steady_clock::now()) {} - - Duration elapsed() { - auto now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(now - start); - } - - private: - std::chrono::steady_clock::time_point start; -}; - -}; // namespace dpc - -#endif diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp index 9fe2e8c1e6..349ffe2026 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-buffers.cpp @@ -28,10 +28,11 @@ #endif using namespace sycl; +using namespace std; // Array type and data size for this example. constexpr size_t array_size = 10000; -typedef std::array IntArray; +typedef array IntArray; //************************************ // Iota in DPC++ on device. @@ -85,24 +86,24 @@ int main() { for (size_t i = 0; i < sequential.size(); i++) sequential[i] = value + i; try { - queue q(d_selector, dpc::exception_handler); + queue q(d_selector, dpc_common::exception_handler); // Print out the device information used for the kernel code. - std::cout << "Running on device: " - << q.get_device().get_info() << "\n"; - std::cout << "Array size: " << parallel.size() << "\n"; + cout << "Running on device: " + << q.get_device().get_info() << "\n"; + cout << "Array size: " << parallel.size() << "\n"; // Parallel iota in DPC++. IotaParallel(q, parallel, value); - } catch (exception const &e) { - std::cout << "An exception is caught while computing on device.\n"; - std::terminate(); + } catch (std::exception const &e) { + cout << "An exception is caught while computing on device.\n"; + terminate(); } // Verify two results are equal. for (size_t i = 0; i < sequential.size(); i++) { if (parallel[i] != sequential[i]) { - std::cout << "Failed on device.\n"; + cout << "Failed on device.\n"; return -1; } } @@ -114,10 +115,10 @@ int main() { for (int i = 0; i < indices_size; i++) { int j = indices[i]; if (i == indices_size - 1) std::cout << "...\n"; - std::cout << "[" << j << "]: " << j << " + " << value << " = " - << parallel[j] << "\n"; + cout << "[" << j << "]: " << j << " + " << value << " = " + << parallel[j] << "\n"; } - std::cout << "Successfully completed on device.\n"; + cout << "Successfully completed on device.\n"; return 0; } diff --git a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp index e7d19a0ff0..1f37eaa361 100644 --- a/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp +++ b/DirectProgramming/DPC++/DenseLinearAlgebra/simple-add/src/simple-add-usm.cpp @@ -28,6 +28,7 @@ #endif using namespace sycl; +using namespace std; // Array size for this example. constexpr size_t array_size = 10000; @@ -71,12 +72,12 @@ int main() { constexpr int value = 100000; try { - queue q(d_selector, dpc::exception_handler); + queue q(d_selector, dpc_common::exception_handler); // Print out the device information used for the kernel code. - std::cout << "Running on device: " - << q.get_device().get_info() << "\n"; - std::cout << "Array size: " << array_size << "\n"; + cout << "Running on device: " + << q.get_device().get_info() << "\n"; + cout << "Array size: " << array_size << "\n"; int *sequential = malloc_shared(array_size, q); int *parallel = malloc_shared(array_size, q); @@ -85,7 +86,7 @@ int main() { if (sequential != nullptr) free(sequential, q); if (parallel != nullptr) free(parallel, q); - std::cout << "Shared memory allocation failure.\n"; + cout << "Shared memory allocation failure.\n"; return -1; } @@ -98,7 +99,7 @@ int main() { // Verify two results are equal. for (size_t i = 0; i < array_size; i++) { if (parallel[i] != sequential[i]) { - std::cout << "Failed on device.\n"; + cout << "Failed on device.\n"; return -1; } } @@ -110,17 +111,17 @@ int main() { for (int i = 0; i < indices_size; i++) { int j = indices[i]; if (i == indices_size - 1) std::cout << "...\n"; - std::cout << "[" << j << "]: " << j << " + " << value << " = " - << sequential[j] << "\n"; + cout << "[" << j << "]: " << j << " + " << value << " = " + << sequential[j] << "\n"; } free(sequential, q); free(parallel, q); - } catch (exception const &e) { - std::cout << "An exception is caught while computing on device.\n"; - std::terminate(); + } catch (std::exception const &e) { + cout << "An exception is caught while computing on device.\n"; + terminate(); } - std::cout << "Successfully completed on device.\n"; + cout << "Successfully completed on device.\n"; return 0; } diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt new file mode 100644 index 0000000000..b9c9718926 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/CMakeLists.txt @@ -0,0 +1,30 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +project (bitonic-sort) + +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (bitonic-sort src/bitonic-sort.cpp) + +add_custom_target (run + COMMAND bitonic-sort 21 47 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt new file mode 100644 index 0000000000..6e9524bd74 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/License.txt @@ -0,0 +1,7 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md new file mode 100644 index 0000000000..8066fd1f31 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/README.md @@ -0,0 +1,120 @@ +# `Bitonic Sort` sample + +This code sample demonstrates the implementation of bitonic sort using Intel Data Parallel C++ to +offload the computation to a GPU. In this implementation, a random sequence of 2**n elements is given +(n is a positive number) as input, the algorithm sorts the sequence in parallel. The result sequence is +in ascending order. + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | Implement bitonic sort using Intel DPC++ compiler +| Time to complete | 15 minutes + + +## Purpose + +The algorithm converts a randomized sequence of numbers into +a bitonic sequence (two ordered sequences), and then merge these two ordered +sequences into a ordered sequence. Bitonic sort algorithm is briefly +described as followed: + +- First, it decomposes the randomized sequence of size 2\*\*n into 2\*\*(n-1) +pairs where each pair consists of 2 consecutive elements. Note that each pair +is a bitonic sequence. +- Step 0: for each pair (sequence of size 2), the two elements are swapped so +that the two consecutive pairs form a bitonic sequence in increasing order, +the next two pairs form the second bitonic sequence in decreasing order, the +next two pairs form the third bitonic sequence in increasing order, etc, .... +At the end of this step, we have 2\*\*(n-1) bitonic sequences of size 2, and +they follow an order increasing, decreasing, increasing, .., decreasing. +Thus, they form 2\*\*(n-2) bitonic sequences of size 4. +- Step 1: for each new 2\*\*(n-2) bitonic sequences of size 4, (each new +sequence consists of 2 consecutive previous sequences), it swaps the elements +so that at the end of step 1, we have 2\*\*(n-2) bitonic sequences of size 4, +and they follow an order: increasing, decreasing, increasing, ..., +decreasing. Thus, they form 2\*\*(n-3) bitonic sequences of size 8. +- Same logic applies until we reach the last step. +- Step n: at this last step, we have one bitonic sequence of size 2\*\*n. The +elements in the sequence are swapped until we have a sequence in increasing +oder. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU +if a compatible GPU is not detected. + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes device selector, buffer, accessor, kernel, and command g +roups. Unified Shared Memory (USM) is used for data management. + +## License +This code sample is licensed under MIT license + +## Building the `bitonic-sort` Program for CPU and GPU + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +1. Build the program using the following `cmake` commands. + ``` + $ cd bitonic-sort + $ mkdir build + $ cd build + $ cmake .. + $ make + ``` + +2. Run the program: + ``` + make run + ``` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System + * Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + + * Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for + VS2019" + Run - MSBuild bitonic-sort.sln /t:Rebuild /p:Configuration="Release" + +## Running the sample +### Application Parameters + + Usage: bitonic-sort + +where + +exponent is a positive number. The according length of the sequence is 2**exponent. + +seed is the seed used by the random generator to generate the randomness. + +The sample offloads the computation to GPU and then performs the computation in serial in the CPU. +The results from the parallel and serial computation are compared. If the results are matched and +the ascending order is verified, the application will display a “Success!” message. + +### Example of Output +``` +$ ./bitonic-sort 21 47 +Array size: 2097152, seed: 47 +Device: Intel(R) Gen9 HD Graphics NEO +Kernel time: 0.416827 sec +CPU serial time: 0.60523 sec +Success! +``` \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln new file mode 100644 index 0000000000..e558305981 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.1062 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "bitonic-sort", "bitonic-sort.vcxproj", "{46454D0B-76F3-45EB-A186-F315A2E22DEA}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.ActiveCfg = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Debug|x64.Build.0 = Debug|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.ActiveCfg = Release|x64 + {46454D0B-76F3-45EB-A186-F315A2E22DEA}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {B1D84B81-F5D5-4459-AA6E-38B695FB908B} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj new file mode 100644 index 0000000000..9289102064 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj @@ -0,0 +1,137 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {46454d0b-76f3-45eb-a186-f315a2e22dea} + Win32Proj + bitonic_sort + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + + + + + + + Console + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + + + + + + + Console + true + true + true + + + + + + + + + %ONEAPI_ROOT%\dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters new file mode 100644 index 0000000000..82a4ddcfc9 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user new file mode 100644 index 0000000000..582de7464a --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/bitonic-sort.vcxproj.user @@ -0,0 +1,11 @@ + + + + 21 47 + WindowsLocalDebugger + + + 21 47 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json new file mode 100644 index 0000000000..c382d764e1 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/sample.json @@ -0,0 +1,29 @@ +{ + "guid": "4D5B57B8-6F34-4A11-89F5-3F07E766DB39", + "name": "bitonic-sort", + "categories": [ "Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/CPU and GPU" ], + "description": "Bitonic Sort using Intel® oneAPI DPC++ Language", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild bitonic-sort.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "bitonic-sort.exe 21 47" + ] + }] + } +} diff --git a/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp new file mode 100644 index 0000000000..e0e4312520 --- /dev/null +++ b/DirectProgramming/DPC++/GraphTraversal/bitonic-sort/src/bitonic-sort.cpp @@ -0,0 +1,253 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// +// Bitonic Sort: this algorithm converts a randomized sequence of numbers into +// a bitonic sequence (two ordered sequences), and then merge these two ordered +// sequences into a ordered sequence. Bitonic sort algorithm is briefly +// described as followed: +// +// - First, it decomposes the randomized sequence of size 2**n into 2**(n-1) +// pairs where each pair consists of 2 consecutive elements. Note that each pair +// is a bitonic sequence. +// - Step 0: for each pair (sequence of size 2), the two elements are swapped so +// that the two consecutive pairs form a bitonic sequence in increasing order, +// the next two pairs form the second bitonic sequence in decreasing order, the +// next two pairs form the third bitonic sequence in increasing order, etc, ... +// . At the end of this step, we have 2**(n-1) bitonic sequences of size 2, and +// they follow an order increasing, decreasing, increasing, .., decreasing. +// Thus, they form 2**(n-2) bitonic sequences of size 4. +// - Step 1: for each new 2**(n-2) bitonic sequences of size 4, (each new +// sequence consists of 2 consecutive previous sequences), it swaps the elements +// so that at the end of step 1, we have 2**(n-2) bitonic sequences of size 4, +// and they follow an order: increasing, decreasing, increasing, ..., +// decreasing. Thus, they form 2**(n-3) bitonic sequences of size 8. +// - Same logic applies until we reach the last step. +// - Step n: at this last step, we have one bitonic sequence of size 2**n. The +// elements in the sequence are swapped until we have a sequence in increasing +// oder. +// +// In this implementation, a randomized sequence of size 2**n is given (n is a +// positive number). Unified Shared Memory (USM) is used for data management. At +// each stage, a part of step, the host redefines the ordered sequenes and sends +// data to the kernel. The kernel swaps the elements accordingly in parallel. +// +#include +#include +#include + +using namespace sycl; +using namespace std; + +void ParallelBitonicSort(int a[], int n, queue &q) { + // n: the exponent used to set the array size. Array size = power(2, n) + int size = pow(2, n); + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1, ..., 0 + for (int stage = step; stage >= 0; stage--) { + // In each state, construct a number (num_seq) of bitonic sequences of + // size seq_len (2, 4, ...) num_seq stores the number of bitonic sequences + // at each stage. seq_len stores the length of the bitonic sequence at + // each stage. + int seq_len = pow(2, stage + 1); +#if DEBUG + int num_seq = pow(2, (n - stage - 1)); // Used for debug purpose. + std::cout << "step num:" << step << " stage num:" << stage + << " num_seq:" << num_seq << "(" << seq_len << ") => "; +#endif + // Constant used in the kernel: 2**(step-stage). + int two_power = 1 << (step - stage); + + // Offload the work to kernel. + q.submit([&](handler &h) { + h.parallel_for(range<1>(size), [=](id<1> i) { + // Assign the bitonic sequence number. + int seq_num = i / seq_len; + + // Variable used to identified the swapped element. + int swapped_ele = -1; + + // Because the elements in the first half in the bitonic + // sequence may swap with elements in the second half, + // only the first half of elements in each sequence is + // required (seq_len/2). + int h_len = seq_len / 2; + + if (i < (seq_len * seq_num) + h_len) swapped_ele = i + h_len; + + // Check whether increasing or decreasing order. + int odd = seq_num / two_power; + + // Boolean variable used to determine "increasing" or + // "decreasing" order. + bool increasing = ((odd % 2) == 0); + + // Swap the elements in the bitonic sequence if needed + if (swapped_ele != -1) { + if (((a[i] > a[swapped_ele]) && increasing) || + ((a[i] < a[swapped_ele]) && !increasing)) { + int temp = a[i]; + a[i] = a[swapped_ele]; + a[swapped_ele] = temp; + } + } + }); + }); + q.wait(); + } // end stage + } // end step +} + +// Loop over the bitonic sequences at each stage in serial. +void SwapElements(int step, int stage, int num_sequence, int seq_len, + int *array) { + for (int seq_num = 0; seq_num < num_sequence; seq_num++) { + int odd = seq_num / (pow(2, (step - stage))); + bool increasing = ((odd % 2) == 0); + + int h_len = seq_len / 2; + + // For all elements in a bitonic sequence, swap them if needed + for (int i = seq_num * seq_len; i < seq_num * seq_len + h_len; i++) { + int swapped_ele = i + h_len; + + if (((array[i] > array[swapped_ele]) && increasing) || + ((array[i] < array[swapped_ele]) && !increasing)) { + int temp = array[i]; + array[i] = array[swapped_ele]; + array[swapped_ele] = temp; + } + } // end for all elements in a sequence + } // end all sequences +} + +// Function sorts an array in serial using bitonic sort algorithm. The size of +// the array is indicated by the exponent n: the array size is 2 ** n. +inline void BitonicSort(int a[], int n) { + // n: the exponent indicating the array size = 2 ** n. + + // step from 0, 1, 2, ...., n-1 + for (int step = 0; step < n; step++) { + // for each step s, stage goes s, s-1,..., 0 + for (int stage = step; stage >= 0; stage--) { + // Sequences (same size) are formed at each stage. + int num_sequence = pow(2, (n - stage - 1)); + // The length of the sequences (2, 4, ...). + int sequence_len = pow(2, stage + 1); + + SwapElements(step, stage, num_sequence, sequence_len, a); + } + } +} + +// Function showing the array. +void DisplayArray(int a[], int array_size) { + for (int i = 0; i < array_size; ++i) std::cout << a[i] << " "; + std::cout << "\n"; +} + +void Usage(std::string prog_name, int exponent) { + std::cout << " Incorrect parameters\n"; + std::cout << " Usage: " << prog_name << " n k \n\n"; + std::cout << " n: Integer exponent presenting the size of the input array. " + "The number of element in\n"; + std::cout << " the array must be power of 2 (e.g., 1, 2, 4, ...). Please " + "enter the corresponding\n"; + std::cout << " exponent betwwen 0 and " << exponent - 1 << ".\n"; + std::cout << " k: Seed used to generate a random sequence.\n"; +} + +int main(int argc, char *argv[]) { + int n, seed, size; + int exp_max = log2(std::numeric_limits::max()); + + // Read parameters. + try { + n = std::stoi(argv[1]); + + // Verify the boundary of acceptance. + if (n < 0 || n >= exp_max) { + Usage(argv[0], exp_max); + return -1; + } + + seed = std::stoi(argv[2]); + size = pow(2, n); + } catch (...) { + Usage(argv[0], exp_max); + return -1; + } + + std::cout << "\nArray size: " << size << ", seed: " << seed << "\n"; + + // Create queue on implementation-chosen default device. + queue q; + + std::cout << "Device: " << q.get_device().get_info() + << "\n"; + + // USM allocation using malloc_shared: data stores a sequence of random + // numbers. + int *data = malloc_shared(size, q); + + // Memory allocated for host access only. + int *data2 = (int *)malloc(size * sizeof(int)); + + // Initialize the array randomly using a seed. + srand(seed); + + for (int i = 0; i < size; i++) data[i] = data2[i] = rand() % 1000; + +#if DEBUG + std::cout << "\ndata before:\n"; + DisplayArray(data, size); +#endif + + // Start timer + dpc_common::TimeInterval t_par; + + ParallelBitonicSort(data, n, q); + + std::cout << "Kernel time: " << t_par.Elapsed() << " sec\n"; + +#if DEBUG + std::cout << "\ndata after sorting using parallel bitonic sort:\n"; + DisplayArray(data, size); +#endif + + // Start timer + dpc_common::TimeInterval t_ser; + + // Bitonic sort in CPU (serial) + BitonicSort(data2, n); + + std::cout << "CPU serial time: " << t_ser.Elapsed() << " sec\n"; + + // Verify both bitonic sort algorithms in kernel and in CPU. + bool pass = true; + for (int i = 0; i < size - 1; i++) { + // Validate the sequence order is increasing in both kernel and CPU. + if ((data[i] > data[i + 1]) || (data[i] != data2[i])) { + pass = false; + break; + } + } + + // Clean USM resources. + free(data, q); + + // Clean CPU memory. + free(data2); + + if (!pass) { + std::cout << "\nFailed!\n"; + return -2; + } + + std::cout << "\nSuccess!\n"; + return 0; +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/CMakeLists.txt new file mode 100644 index 0000000000..9dd05e922e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/CMakeLists.txt @@ -0,0 +1,27 @@ +# required cmake version +cmake_minimum_required(VERSION 3.5) + +# CMakeLists.txt for ISO2DFD_DPCPP project +project (iso2dfd_dpcpp) + +set(CMAKE_CXX_COMPILER "dpcpp") + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fsycl -std=c++17") + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lOpenCL -lsycl") + +add_executable (iso2dfd src/iso2dfd.cpp) + +add_custom_target (run + COMMAND iso2dfd 1000 1000 2000 + WORKING_DIRECTORY ${CMAKE_PROJECT_DIR} +) + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/License.txt b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/License.txt new file mode 100644 index 0000000000..6e9524bd74 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/License.txt @@ -0,0 +1,7 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md new file mode 100644 index 0000000000..604dd14b56 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/README.md @@ -0,0 +1,145 @@ +# ISO2DFD sample + +ISO2DFD: Intel® oneAPI DPC++ Language Basics Using +2D-Finite-Difference-Wave Propagation + +The ISO2DFD sample refers to Two-Dimensional Finite-Difference Wave Propagation in Isotropic Media. It is a two-dimensional stencil to simulate a wave propagating in a 2D isotropic medium and illustrates the basics of the DPC++ programming language using direct programming. + +A complete code walk-through for this sample can be found at: +https://software.intel.com/en-us/articles/code-sample-two-dimensional-finite-difference-wave-propagation-in-isotropic-media-iso2dfd + +For comprehensive instructions regarding DPC++ Programming, go to +https://software.intel.com/en-us/oneapi-programming-guide +and search based on relevant terms noted in the comments. + + +| Optimized for | Description +|:--- |:--- +| OS | Linux Ubuntu 18.04 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler (beta); Intel C++ Compiler (beta) +| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Time to complete | 10 minutes + + +## Purpose + +ISO2DFD is a finite difference stencil kernel for solving the 2D acoustic isotropic wave equation. In +this sample, we chose the problem of solving a Partial Differential Equation (PDE), using a +finite-difference method, to illustrate the essential elements of the DPC++ programming language: +queues, buffers/accessors, and kernels. Use it as an entry point to start programming in DPC++ or as a +proxy to develop or better understand complicated code for similar problems. + +Using Data Parallel C++, the sample will explicitly run on the GPU as well as CPU to calculate a +result. The output will include GPU device name. The results from the two devices are compared and, if +the sample ran correctly, report a success message. The output of the wavefield can be plotted using +the SU Seismic processing library, which has utilities to display seismic wavefields and can be +downloaded from John Stockwell’s SeisUnix GitHub* (https://github.com/JohnWStockwellJr/SeisUnix/wiki/ +Seismic-Unix-install-on-Ubuntu) + + +## Key implementation details + +SYCL implementation explained. + +* DPC++ queues (including device selectors and exception handlers). +* DPC++ buffers and accessors. +* The ability to call a function inside a kernel definition and pass accessor arguments as pointers. A +function called inside the kernel performs a computation (it updates a grid point specified by the +global ID variable) for a single time step. + + +## License + +This code sample is licensed under MIT license. + + +## Building the `iso2dfd` Program for CPU and GPU + +### Running Samples In DevCloud + +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, +FPGA) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI +Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: +1. Build the program using the following `cmake` commands. + + ``` + cd iso2dfd_dpcpp && + mkdir build && + cd build && + cmake .. && + make -j + ``` + +2. Run the program on Gen9 + + ``` + make run + ``` + +3. Clean the program + + ``` + make clean + ``` + +### On a Windows* System Using Visual Studio* Version 2017 or Newer +* Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + +>If you see the following error message when compiling this sample: +> +``` +Error 'dpc_common.hpp' file not found +``` +>You need to add the following directory to the list of include folders, that are required by your project, in your project's Visual Studio project property panel. The missing include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system. + +* Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" + Run - MSBuild iso2dfd.sln /t:Rebuild /p:Configuration="Release" + + +## Running the Sample +### Application Parameters + +You can execute the code with different parameters. For example the following command will run the iso2dfd executable using a 1000x1000 grid size and it will iterate over 2000 time steps. + + ``` + ./iso2dfd 1000 1000 2000 + ``` + + Usage: ./iso2dfd n1 n2 Iterations + + n1 n2 : Grid sizes for the stencil + Iterations : Number of timesteps. + + * Find graphical output for sample execution in the online tutorial at: + https://software.intel.com/en-us/articles/code-sample-two-dimensional-finite-difference-wave-propagation-in-isotropic-media-iso2dfd + +### Example of Output + + ``` + Initializing ... + Grid Sizes: 1000 1000 + Iterations: 2000 + + Computing wavefield in device .. + Running on Intel(R) Gen9 HD Graphics NEO + The Device Max Work Group Size is : 256 + The Device Max EUCount is : 24 + SYCL time: 3282 ms + + Computing wavefield in CPU .. + Initializing ... + CPU time: 8846 ms + + Final wavefields from device and CPU are equivalent: Success + Final wavefields (from device and CPU) written to disk + Finished. + [100%] Built target run + ``` diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.sln b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.sln new file mode 100644 index 0000000000..174faa6896 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.960 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "iso2dfd", "iso2dfd.vcxproj", "{1AE3DD06-C3F0-4746-B126-EEB6A94CF35C}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {1AE3DD06-C3F0-4746-B126-EEB6A94CF35C}.Debug|x64.ActiveCfg = Debug|x64 + {1AE3DD06-C3F0-4746-B126-EEB6A94CF35C}.Debug|x64.Build.0 = Debug|x64 + {1AE3DD06-C3F0-4746-B126-EEB6A94CF35C}.Release|x64.ActiveCfg = Release|x64 + {1AE3DD06-C3F0-4746-B126-EEB6A94CF35C}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {AC4985B6-FFDE-4420-B533-7D4318863288} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj new file mode 100644 index 0000000000..7d258692a9 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj @@ -0,0 +1,151 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + 15.0 + {1ae3dd06-c3f0-4746-b126-eeb6a94cf35c} + Win32Proj + iso2dfd + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + $(ONEAPI_ROOT)dev-utilities\latest\include + + + Console + true + true + true + + + + + + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj.filters b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj.filters new file mode 100644 index 0000000000..1b7c40576f --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj.filters @@ -0,0 +1,22 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj.user b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj.user new file mode 100644 index 0000000000..f5d6e260eb --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/iso2dfd.vcxproj.user @@ -0,0 +1,11 @@ + + + + 1000 1000 2000 + WindowsLocalDebugger + + + 1000 1000 2000 + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/sample.json b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/sample.json new file mode 100644 index 0000000000..f97a3bd596 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/sample.json @@ -0,0 +1,30 @@ +{ + "guid": "9483C0F0-7D63-4E99-86C5-37C40F77B2AE" , + "name": "iso2dfd_dpcpp", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "ISO2DFD: Intel® oneAPI DPC++ Language Basics Using 2D Finite-Difference-Wave Propagation", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows"], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild iso2dfd.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "iso2dfd.exe 1000 1000 2000" + ] + }] + + } +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp new file mode 100644 index 0000000000..e4638ba703 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso2dfd_dpcpp/src/iso2dfd.cpp @@ -0,0 +1,380 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +// ISO2DFD: Intel® oneAPI DPC++ Language Basics Using 2D-Finite-Difference-Wave +// Propagation +// +// ISO2DFD is a finite difference stencil kernel for solving the 2D acoustic +// isotropic wave equation. Kernels in this sample are implemented as 2nd order +// in space, 2nd order in time scheme without boundary conditions. Using Data +// Parallel C++, the sample will explicitly run on the GPU as well as CPU to +// calculate a result. If successful, the output will include GPU device name. +// +// A complete online tutorial for this code sample can be found at : +// https://software.intel.com/en-us/articles/code-sample-two-dimensional-finite-difference-wave-propagation-in-isotropic-media-iso2dfd +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// Basic structures of DPC++: +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Buffers and accessors (communicate data between the host and the device) +// DPC++ Kernels (including parallel_for function and range<2> objects) +// + +#include +#include +#include +#include +#include +#include +#include + +#include "dpc_common.hpp" + +using namespace cl::sycl; +using namespace std; + +/* + * Parameters to define coefficients + * half_length: Radius of the stencil + * Sample source code is tested for half_length=1 resulting in + * 2nd order Stencil finite difference kernel + */ + +constexpr float DT = 0.002f; +constexpr float DXY = 20.0f; +constexpr unsigned int half_length = 1; + +/* + * Host-Code + * Utility function to display input arguments + */ +void Usage(const string &program_name) { + cout << " Incorrect parameters\n"; + cout << " Usage: "; + cout << program_name << " n1 n2 Iterations\n\n"; + cout << " n1 n2 : Grid sizes for the stencil\n"; + cout << " Iterations : No. of timesteps.\n"; +} + +/* + * Host-Code + * Function used for initialization + */ +void Initialize(float* ptr_prev, float* ptr_next, float* ptr_vel, size_t n_rows, + size_t n_cols) { + cout << "Initializing ...\n"; + + // Define source wavelet + float wavelet[12] = {0.016387336, -0.041464937, -0.067372555, 0.386110067, + 0.812723635, 0.416998396, 0.076488599, -0.059434419, + 0.023680172, 0.005611435, 0.001823209, -0.000720549}; + + // Initialize arrays + for (size_t i = 0; i < n_rows; i++) { + size_t offset = i * n_cols; + + for (int k = 0; k < n_cols; k++) { + ptr_prev[offset + k] = 0.0f; + ptr_next[offset + k] = 0.0f; + // pre-compute squared value of sample wave velocity v*v (v = 1500 m/s) + ptr_vel[offset + k] = (1500.0f * 1500.0f); + } + } + // Add a source to initial wavefield as an initial condition + for (int s = 11; s >= 0; s--) { + for (int i = n_rows / 2 - s; i < n_rows / 2 + s; i++) { + size_t offset = i * n_cols; + for (int k = n_cols / 2 - s; k < n_cols / 2 + s; k++) { + ptr_prev[offset + k] = wavelet[s]; + } + } + } +} + +/* + * Host-Code + * Utility function to print device info + */ +void PrintTargetInfo(queue& q) { + auto device = q.get_device(); + auto max_block_size = + device.get_info(); + + auto max_EU_count = + device.get_info(); + + cout<< " Running on " << device.get_info()<<"\n"; + cout<< " The Device Max Work Group Size is : "<< max_block_size<<"\n"; + cout<< " The Device Max EUCount is : " << max_EU_count<<"\n"; +} + +/* + * Host-Code + * Utility function to calculate L2-norm between resulting buffer and reference + * buffer + */ +bool WithinEpsilon(float* output, float* reference, const size_t dim_x, + const size_t dim_y, const unsigned int radius, + const float delta = 0.01f) { + ofstream err_file; + err_file.open("error_diff.txt"); + + bool error = false; + double norm2 = 0; + + for (size_t iy = 0; iy < dim_y; iy++) { + for (size_t ix = 0; ix < dim_x; ix++) { + if (ix >= radius && ix < (dim_x - radius) && iy >= radius && + iy < (dim_y - radius)) { + float difference = fabsf(*reference - *output); + norm2 += difference * difference; + if (difference > delta) { + error = true; + err_file<<" ERROR: "< it, float* next, float* prev, + float* vel, const float dtDIVdxy, int n_rows, + int n_cols) { + float value = 0.0; + + // Compute global id + // We can use the get.global.id() function of the item variable + // to compute global id. The 2D array is laid out in memory in row major + // order. + size_t gid_row = it.get(0); + size_t gid_col = it.get(1); + size_t gid = (gid_row)*n_cols + gid_col; + + // Computation to solve wave equation in 2D + // First check if gid is inside the effective grid (not in halo) + if ((gid_col >= half_length && gid_col < n_cols - half_length) && + (gid_row >= half_length && gid_row < n_rows - half_length)) { + // Stencil code to update grid point at position given by global id (gid) + // New time step for grid point is computed based on the values of the + // the immediate neighbors in both the horizontal and vertical + // directions, as well as the value of grid point at a previous time step + value = 0.0; + value += prev[gid + 1] - 2.0 * prev[gid] + prev[gid - 1]; + value += prev[gid + n_cols] - 2.0 * prev[gid] + prev[gid - n_cols]; + value *= dtDIVdxy * vel[gid]; + next[gid] = 2.0f * prev[gid] - next[gid] + value; + } +} + +int main(int argc, char* argv[]) { + // Arrays used to update the wavefield + float* prev_base; + float* next_base; + float* next_cpu; + // Array to store wave velocity + float* vel_base; + + bool error = false; + + size_t n_rows, n_cols; + unsigned int n_iterations; + + // Read parameters + try { + n_rows = stoi(argv[1]); + n_cols = stoi(argv[2]); + n_iterations = stoi(argv[3]); + } + + catch (...) { + Usage(argv[0]); + return 1; + } + + // Compute the total size of grid + size_t n_size = n_rows * n_cols; + + // Allocate arrays to hold wavefield and velocity + prev_base = new float[n_size]; + next_base = new float[n_size]; + next_cpu = new float[n_size]; + vel_base = new float[n_size]; + + // Compute constant value (delta t)^2 (delta x)^2. To be used in wavefield + // update + float dtDIVdxy = (DT * DT) / (DXY * DXY); + + // Initialize arrays and introduce initial conditions (source) + Initialize(prev_base, next_base, vel_base, n_rows, n_cols); + + cout << "Grid Sizes: " << n_rows << " " << n_cols << "\n"; + cout << "Iterations: " << n_iterations << "\n\n"; + + // Define device selector as 'default' + default_selector device_selector; + + // Create a device queue using DPC++ class queue + queue q(device_selector, dpc_common::exception_handler); + + cout << "Computing wavefield in device ..\n"; + // Display info about device + PrintTargetInfo(q); + + // Start timer + dpc_common::TimeInterval t_offload; + + { // Begin buffer scope + // Create buffers using DPC++ class buffer + buffer b_next(next_base, range(n_size)); + buffer b_prev(prev_base, range(n_size)); + buffer b_vel(vel_base, range(n_size)); + + // Iterate over time steps + for (unsigned int k = 0; k < n_iterations; k += 1) { + // Submit command group for execution + q.submit([&](auto &h) { + // Create accessors + auto next = b_next.get_access(h); + auto prev = b_prev.get_access(h); + auto vel = b_vel.get_access(h); + + // Define local and global range + auto global_range = range<2>(n_rows, n_cols); + + // Send a DPC++ kernel (lambda) for parallel execution + // The function that executes a single iteration is called + // "iso_2dfd_iteration_global" + // alternating the 'next' and 'prev' parameters which effectively + // swaps their content at every iteration. + if (k % 2 == 0) + h.parallel_for(global_range, [=](id<2> it) { + Iso2dfdIterationGlobal(it, next.get_pointer(), + prev.get_pointer(), vel.get_pointer(), + dtDIVdxy, n_rows, n_cols); + }); + else + h.parallel_for(global_range, [=](id<2> it) { + Iso2dfdIterationGlobal(it, prev.get_pointer(), + next.get_pointer(), vel.get_pointer(), + dtDIVdxy, n_rows, n_cols); + }); + }); + + } // end for + + } // buffer scope + + // Wait for commands to complete. Enforce synchronization on the command queue + q.wait_and_throw(); + + // Compute and display time used by device + auto time = t_offload.Elapsed(); + + cout << "Offload time: " << time << " ms\n\n"; + + // Output final wavefield (computed by device) to binary file + ofstream out_file; + out_file.open("wavefield_snapshot.bin", ios::out | ios::binary); + out_file.write(reinterpret_cast(next_base), n_size * sizeof(float)); + out_file.close(); + + // Compute wavefield on CPU (for validation) + + cout << "Computing wavefield in CPU ..\n"; + // Re-initialize arrays + Initialize(prev_base, next_cpu, vel_base, n_rows, n_cols); + + // Compute wavefield on CPU + // Start timer for CPU + dpc_common::TimeInterval t_cpu; + + Iso2dfdIterationCpu(next_cpu, prev_base, vel_base, dtDIVdxy, n_rows, n_cols, + n_iterations); + + // Compute and display time used by CPU + time = t_cpu.Elapsed(); + + cout << "CPU time: " << time << " ms\n\n"; + + // Compute error (difference between final wavefields computed in device and + // CPU) + error = WithinEpsilon(next_base, next_cpu, n_rows, n_cols, half_length, 0.1f); + + // If error greater than threshold (last parameter in error function), report + if (error) + cout << "Final wavefields from device and CPU are different: Error\n"; + else + cout << "Final wavefields from device and CPU are equivalent: Success\n"; + + // Output final wavefield (computed by CPU) to binary file + out_file.open("wavefield_snapshot_cpu.bin", ios::out | ios::binary); + out_file.write(reinterpret_cast(next_cpu), n_size * sizeof(float)); + out_file.close(); + + cout << "Final wavefields (from device and CPU) written to disk\n"; + cout << "Finished.\n"; + + // Cleanup + delete[] prev_base; + delete[] next_base; + delete[] vel_base; + + return error ? 1 : 0; +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt new file mode 100644 index 0000000000..ab666d05d1 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required (VERSION 3.0) +project (ISO3DFD_DPCPP) +if(WIN32) + set(CMAKE_CXX_COMPILER "dpcpp-cl") +else() + set(CMAKE_CXX_COMPILER "dpcpp") +endif() +add_subdirectory (src) diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt new file mode 100644 index 0000000000..148940418d --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/License.txt @@ -0,0 +1,7 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md new file mode 100644 index 0000000000..516f9c1ba6 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/README.md @@ -0,0 +1,145 @@ +# `ISO3DFD DPC++` Sample + +The ISO3DFD sample refers to Three-Dimensional Finite-Difference Wave Propagation in Isotropic Media. It is a three-dimensional stencil to simulate a wave propagating in a 3D isotropic medium and shows some of the more common challenges when targeting SYCL devices (GPU/CPU) in more complex applications. + +For comprehensive instructions regarding DPC++ Programming, go to https://software.intel.com/en-us/oneapi-programming-guide and search based on relevant terms noted in the comments. + +| Optimized for | Description +|:--- |:--- +| OS | Linux* Ubuntu* 18.04; Windows 10 +| Hardware | Skylake with GEN9 or newer +| Software | Intel® oneAPI DPC++ Compiler beta; +| What you will learn | How to offload the computation to GPU using Intel DPC++ compiler +| Time to complete | 15 minutes + +Performance number tabulation [if applicable -- **NO for beta**] + +| iso3dfd sample | Performance data +|:--- |:--- +| Scalar baseline -O2 | 1.0 +| SYCL | 2x speedup + + +## Purpose + +ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic isotropic wave equation which can be used as a proxy for propogating a seismic wave. Kernels in this sample are implemented as 16th order in space, with symmetric coefficients, and 2nd order in time scheme without boundary conditions.. Using Data Parallel C++, the sample can explicitly run on the GPU and/or CPU to propagate a seismic wave which is a compute intensive task. + +The code will attempt first to execute on an available GPU and fallback to the system's CPU if a compatible GPU is not detected. By default, the output will print the device name where the DPC++ code ran along with the grid computation metrics - flops and effective throughput. For validating results, a serial version of the application will be run on CPU and results will be compared to the DPC++ version. + + +## Key Implementation Details + +The basic DPC++ implementation explained in the code includes includes the use of the following : +* DPC++ local buffers and accessors (declare local memory buffers and accessors to be accessed and managed by each DPC++ workgroup) +* Code for Shared Local Memory (SLM) optimizations +* DPC++ kernels (including parallel_for function and nd-range<3> objects) +* DPC++ queues (including custom device selector and exception handlers) + + +## License + +This code sample is licensed under MIT license. + + +## Building the `ISO3DFD` Program for CPU and GPU + +### Include Files +The include folder is located at `%ONEAPI_ROOT%\dev-utilities\latest\include` on your development system". + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU) as well whether to run in batch or interactive mode. For more information see the Intel® oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: +1. Build the program using the following `cmake` commands. +``` +$ mkdir build +$ cd build +$ cmake .. +$ make -j +``` + +> Note: by default, executable is build with kernel with direct global memory usage. You can build the kernel with shared local memory (SLM) buffers with the following: +``` +cmake -DSHARED_KERNEL=1 .. +make -j +``` + +2. Run the program : + ``` + make run + ``` +> Note: for selecting CPU as a SYCL device use `make run_cpu` + +3. Clean the program using: + ``` + make clean + ``` + +### On a Windows* System Using Visual Studio* Version 2017 or Newer +``` +* Build the program using VS2017 or VS2019 + Right click on the solution file and open using either VS2017 or VS2019 IDE. + Right click on the project in Solution explorer and select Rebuild. + From top menu select Debug -> Start without Debugging. + +* Build the program using MSBuild + Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019" + Run - MSBuild mandelbrot.sln /t:Rebuild /p:Configuration="Release" +``` + +## Running the Sample +``` +make run +``` + +### Application Parameters +You can modify the ISO3DFD parameters from the command line. + * Configurable Application Parameters + + Usage: src/iso3dfd.exe n1 n2 n3 b1 b2 b3 Iterations [omp|sycl] [gpu|cpu] + + n1 n2 n3 : Grid sizes for the stencil + b1 b2 b3 OR : cache block sizes for cpu openmp version. + b1 b2 : Thread block sizes in X and Y dimension for SYCL version. + and b3 : size of slice of work in Z dimension for SYCL version. + Iterations : No. of timesteps. + [omp|sycl] : Optional: Run the OpenMP or the SYCL variant. Default is to use both for validation + [gpu|cpu] : Optional: Device to run the SYCL version Default is to use the GPU if available, if not fallback to CPU + +### Example of Output +``` +Grid Sizes: 256 256 256 +Memory Usage: 230 MB + ***** Running C++ Serial variant ***** +Initializing ... +-------------------------------------- +time : 2.92984 secs +throughput : 57.2632 Mpts/s +flops : 3.49306 GFlops +bytes : 0.687159 GBytes/s + +-------------------------------------- + +-------------------------------------- + ***** Running SYCL variant ***** +Initializing ... + Running on Intel(R) Gen9 + The Device Max Work Group Size is : 256 + The Device Max EUCount is : 48 + The blockSize x is : 32 + The blockSize y is : 8 + Using Global Memory Kernel +-------------------------------------- +time : 0.597494 secs +throughput : 280.793 Mpts/s +flops : 17.1284 GFlops +bytes : 3.36952 GBytes/s + +-------------------------------------- + +-------------------------------------- +Final wavefields from SYCL device and CPU are equivalent: Success +-------------------------------------- +``` + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp new file mode 100644 index 0000000000..dcef0afa0e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/device_selector.hpp @@ -0,0 +1,47 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef DEVICESELECTOR_HPP +#define DEVICESELECTOR_HPP + +#include +#include +#include +#include "CL/sycl.hpp" + +// This is the class provided to SYCL runtime by the application to decide +// on which device to run, or whether to run at all. +// When selecting a device, SYCL runtime first takes (1) a selector provided by +// the program or a default one and (2) the set of all available devices. Then +// it passes each device to the '()' operator of the selector. Device, for +// which '()' returned the highest number, is selected. If a negative number +// was returned for all devices, then the selection process will cause an +// exception. +class MyDeviceSelector : public sycl::device_selector { + public: + MyDeviceSelector(const std::string &p) : pattern(p) { + // std::cout << "Looking for \"" << p << "\" devices" << std::endl; + } + + // This is the function which gives a "rating" to devices. + virtual int operator()(const sycl::device &device) const override { + // The template parameter to device.get_info can be a variety of properties + // defined by the SYCL spec's sycl::info:: enum. Properties may have + // different types. Here we query name which is a string. + const std::string name = device.get_info(); + // std::cout << "Trying device: " << name << "..." << std::endl; + // std::cout << " Vendor: " << + // device.get_info() << std::endl; + + // Device with pattern in the name is prioritized: + return (name.find(pattern) != std::string::npos) ? 100 : 1; + } + + private: + std::string pattern; +}; + +#endif diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h new file mode 100644 index 0000000000..50c65cd6f6 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/include/iso3dfd.h @@ -0,0 +1,51 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +using namespace sycl; + +#include +#include +#include +#include +#include +/* + * Parameters to define coefficients + * kHalfLength: Radius of the stencil + * Sample source code is tested for kHalfLength=8 resulting in + * 16th order Stencil finite difference kernel + */ +constexpr float dt = 0.002f; +constexpr float dxyz = 50.0f; +constexpr unsigned int kHalfLength = 8; + +/* + * Padding to test and eliminate shared local memory bank conflicts for + * the shared local memory(slm) version of the kernel executing on GPU + */ +constexpr unsigned int kPad = 0; + +bool Iso3dfdDevice(sycl::queue &q, float *ptr_next, float *ptr_prev, + float *ptr_vel, float *ptr_coeff, size_t n1, size_t n2, + size_t n3, size_t n1_block, size_t n2_block, + size_t n3_block, size_t end_z, unsigned int num_iterations); + +void PrintTargetInfo(sycl::queue &q, unsigned int dim_x, unsigned int dim_y); + +void Usage(const std::string &program_name); + +void PrintStats(double time, size_t n1, size_t n2, size_t n3, + unsigned int num_iterations); + +bool WithinEpsilon(float *output, float *reference, const size_t dim_x, + const size_t dim_y, const size_t dim_z, + const unsigned int radius, const int zadjust, + const float delta); + +bool CheckGridDimension(size_t n1, size_t n2, size_t n3, unsigned int dim_x, + unsigned int dim_y, unsigned int block_z); + +bool CheckBlockDimension(sycl::queue &q, unsigned int dim_x, unsigned int dim_y); diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln new file mode 100755 index 0000000000..33e315d59e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.27130.2010 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "iso3dfd_dpcpp", "iso3dfd_dpcpp.vcxproj", "{07DA0A96-CA76-4446-9586-99A145B9A9C8}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Debug|x64.ActiveCfg = Debug|x64 + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Debug|x64.Build.0 = Debug|x64 + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Release|x64.ActiveCfg = Release|x64 + {07DA0A96-CA76-4446-9586-99A145B9A9C8}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {3F515120-AC09-42A9-97D3-A26B1251EC9D} + EndGlobalSection +EndGlobal diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj new file mode 100755 index 0000000000..17aba91fe9 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj @@ -0,0 +1,157 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {07da0a96-ca76-4446-9586-99a145b9a9c8} + Win32Proj + iso3dfd_dpcpp + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + + + false + + + false + + + + Use + Level3 + Disabled + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + include;$(ONEAPI_ROOT)dev-utilities\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + + + + + + + + + + + + + + + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters new file mode 100755 index 0000000000..e314961ca6 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.filters @@ -0,0 +1,36 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + + + Header Files + + + Header Files + + + diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user new file mode 100755 index 0000000000..5675a6f273 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/iso3dfd_dpcpp.vcxproj.user @@ -0,0 +1,11 @@ + + + + 256 256 256 32 8 64 100 sycl gpu + WindowsLocalDebugger + + + 256 256 256 32 8 64 100 sycl gpu + WindowsLocalDebugger + + \ No newline at end of file diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json new file mode 100755 index 0000000000..9d6ed588ab --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/sample.json @@ -0,0 +1,30 @@ +{ + "guid": "3A7DA713-6083-4CA3-B66E-A3DF21744EB4", + "name": "iso3dfd_dpcpp", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "A finite difference stencil kernel for solving 3D acoustic isotropic wave equation", + "toolchain": [ "dpcpp" ], + "targetDevice": [ "CPU", "GPU" ], + "languages": [ { "cpp": {} } ], + "os": [ "linux", "windows" ], + "builder": [ "ide", "cmake" ], + "ciTests": { + "linux": [{ + "steps": [ + "mkdir build", + "cd build", + "cmake ..", + "make", + "make run" + ] + }], + "windows": [{ + "steps": [ + "MSBuild iso3dfd_dpcpp.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64/Release", + "iso3dfd_dpcpp.exe 256 256 256 32 8 64 10 gpu" + ] + }] + + } +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt new file mode 100644 index 0000000000..4801b32e96 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/CMakeLists.txt @@ -0,0 +1,27 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 --std=c++17") +include_directories("../include/") + +OPTION(SHARED_KERNEL "Use SLM Kernel Version - Only for GPU" OFF) +if(SHARED_KERNEL) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_SHARED") +endif(SHARED_KERNEL) + +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + +add_executable (iso3dfd.exe iso3dfd.cpp iso3dfd_kernels.cpp utils.cpp) +target_link_libraries(iso3dfd.exe OpenCL sycl) +if(WIN32) + add_custom_target (run iso3dfd.exe 256 256 256 32 8 64 10 gpu) + add_custom_target (run_cpu iso3dfd.exe 256 256 256 256 1 1 10 cpu) +else() + add_custom_target (run iso3dfd.exe 256 256 256 32 8 64 10 gpu) + add_custom_target (run_cpu iso3dfd.exe 256 256 256 256 1 1 10 cpu) +endif() diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp new file mode 100644 index 0000000000..e2c4a687d8 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd.cpp @@ -0,0 +1,343 @@ +//============================================================== +// Copyright 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +// ISO3DFD: Intel oneAPI DPC++ Language Basics Using 3D-Finite-Difference-Wave +// Propagation +// +// ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic +// isotropic wave equation. Kernels in this sample are implemented as 16th order +// in space, 2nd order in time scheme without boundary conditions. Using Data +// Parallel C++, the sample can explicitly run on the GPU and/or CPU to +// calculate a result. If successful, the output will print the device name +// where the DPC++ code ran along with the grid computation metrics - flops +// and effective throughput +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Custom device selector +// DPC++ Buffers and accessors (communicate data between the host and the +// device) +// DPC++ Kernels (including parallel_for function and nd-range<3> +// objects) +// Shared Local Memory (SLM) optimizations (DPC++) +// DPC++ Basic synchronization (barrier function) +// +#include "iso3dfd.h" +#include +#include "device_selector.hpp" +#include "dpc_common.hpp" + +/* + * Host-Code + * Function used for initialization + */ +void Initialize(float* ptr_prev, float* ptr_next, float* ptr_vel, size_t n1, + size_t n2, size_t n3) { + std::cout << "Initializing ... \n"; + size_t dim2 = n2 * n1; + + for (size_t i = 0; i < n3; i++) { + for (size_t j = 0; j < n2; j++) { + size_t offset = i * dim2 + j * n1; +#pragma omp simd + for (int k = 0; k < n1; k++) { + ptr_prev[offset + k] = 0.0f; + ptr_next[offset + k] = 0.0f; + ptr_vel[offset + k] = + 2250000.0f * dt * dt; // Integration of the v*v and dt*dt + } + } + } + // Add a source to initial wavefield as an initial condition + float val = 1.f; + for (int s = 5; s >= 0; s--) { + for (int i = n3 / 2 - s; i < n3 / 2 + s; i++) { + for (int j = n2 / 4 - s; j < n2 / 4 + s; j++) { + size_t offset = i * dim2 + j * n1; + for (int k = n1 / 4 - s; k < n1 / 4 + s; k++) { + ptr_prev[offset + k] = val; + } + } + } + val *= 10; + } +} + +/* + * Host-Code + * OpenMP implementation for single iteration of iso3dfd kernel. + * This function is used as reference implementation for verification and + * also to compare performance of OpenMP and DPC++ on CPU + * Additional Details: + * https://software.intel.com/en-us/articles/eight-optimizations-for-3-dimensional-finite-difference-3dfd-code-with-an-isotropic-iso + */ +void Iso3dfdIteration(float* ptr_next_base, float* ptr_prev_base, + float* ptr_vel_base, float* coeff, const size_t n1, + const size_t n2, const size_t n3, const size_t n1_block, + const size_t n2_block, const size_t n3_block) { + size_t dimn1n2 = n1 * n2; + size_t n3End = n3 - kHalfLength; + size_t n2End = n2 - kHalfLength; + size_t n1End = n1 - kHalfLength; + +#pragma omp parallel default(shared) +#pragma omp for schedule(static) collapse(3) + for (size_t bz = kHalfLength; bz < n3End; + bz += n3_block) { // start of cache blocking + for (size_t by = kHalfLength; by < n2End; by += n2_block) { + for (size_t bx = kHalfLength; bx < n1End; bx += n1_block) { + int izEnd = std::min(bz + n3_block, n3End); + int iyEnd = std::min(by + n2_block, n2End); + int ixEnd = std::min(n1_block, n1End - bx); + for (size_t iz = bz; iz < izEnd; iz++) { // start of inner iterations + for (size_t iy = by; iy < iyEnd; iy++) { + float* ptr_next = ptr_next_base + iz * dimn1n2 + iy * n1 + bx; + float* ptr_prev = ptr_prev_base + iz * dimn1n2 + iy * n1 + bx; + float* ptr_vel = ptr_vel_base + iz * dimn1n2 + iy * n1 + bx; +#pragma omp simd + for (size_t ix = 0; ix < ixEnd; ix++) { + float value = 0.0; + value += ptr_prev[ix] * coeff[0]; +#pragma unroll(kHalfLength) + for (unsigned int ir = 1; ir <= kHalfLength; ir++) { + value += coeff[ir] * + ((ptr_prev[ix + ir] + ptr_prev[ix - ir]) + + (ptr_prev[ix + ir * n1] + ptr_prev[ix - ir * n1]) + + (ptr_prev[ix + ir * dimn1n2] + + ptr_prev[ix - ir * dimn1n2])); + } + ptr_next[ix] = + 2.0f * ptr_prev[ix] - ptr_next[ix] + value * ptr_vel[ix]; + } + } + } // end of inner iterations + } + } + } // end of cache blocking +} + +/* + * Host-Code + * Driver function for ISO3DFD OpenMP code + * Uses ptr_next and ptr_prev as ping-pong buffers to achieve + * accelerated wave propogation + */ +void Iso3dfd(float* ptr_next, float* ptr_prev, float* ptr_vel, float* coeff, + const size_t n1, const size_t n2, const size_t n3, + const unsigned int nreps, const size_t n1_block, + const size_t n2_block, const size_t n3_block) { + for (unsigned int it = 0; it < nreps; it += 1) { + Iso3dfdIteration(ptr_next, ptr_prev, ptr_vel, coeff, n1, n2, n3, n1_block, + n2_block, n3_block); + + // here's where boundary conditions and halo exchanges happen + // Swap previous & next between iterations + it++; + if (it < nreps) + Iso3dfdIteration(ptr_prev, ptr_next, ptr_vel, coeff, n1, n2, n3, n1_block, + n2_block, n3_block); + } // time loop +} + +/* + * Host-Code + * Main function to drive the sample application + */ +int main(int argc, char* argv[]) { + // Arrays used to update the wavefield + float* prev_base; + float* next_base; + // Array to store wave velocity + float* vel_base; + // Array to store results for comparison + float* temp; + + bool sycl = true; + bool omp = true; + bool error = false; + bool is_gpu = true; + + size_t n1, n2, n3; + size_t n1_block, n2_block, n3_block; + unsigned int num_iterations; + + // Read Input Parameters + try { + n1 = std::stoi(argv[1]) + (2 * kHalfLength); + n2 = std::stoi(argv[2]) + (2 * kHalfLength); + n3 = std::stoi(argv[3]) + (2 * kHalfLength); + n1_block = std::stoi(argv[4]); + n2_block = std::stoi(argv[5]); + n3_block = std::stoi(argv[6]); + num_iterations = std::stoi(argv[7]); + } + + catch (...) { + Usage(argv[0]); + return 1; + } + + // Read optional arguments to select version and device + for (auto arg = 8; arg < argc; arg++) { + std::string arg_value = argv[arg]; + transform(arg_value.begin(), arg_value.end(), arg_value.begin(), ::tolower); + + if (arg_value == "omp") { + omp = true; + sycl = false; + } else if (arg_value == "sycl") { + omp = false; + sycl = true; + } else if (arg_value == "gpu") { + is_gpu = true; + } else if (arg_value == "cpu") { + is_gpu = false; + } else { + Usage(argv[0]); + return 1; + } + } + + // Validate input sizes for the grid and block dimensions + if (CheckGridDimension(n1 - 2 * kHalfLength, n2 - 2 * kHalfLength, + n3 - 2 * kHalfLength, n1_block, n2_block, n3_block)) { + Usage(argv[0]); + return 1; + } + + // Compute the total size of grid + size_t nsize = n1 * n2 * n3; + + prev_base = new float[nsize]; + next_base = new float[nsize]; + vel_base = new float[nsize]; + + // Compute coefficients to be used in wavefield update + float coeff[kHalfLength + 1] = {-3.0548446, +1.7777778, -3.1111111e-1, + +7.572087e-2, -1.76767677e-2, +3.480962e-3, + -5.180005e-4, +5.074287e-5, -2.42812e-6}; + + // Apply the DX DY and DZ to coefficients + coeff[0] = (3.0f * coeff[0]) / (dxyz * dxyz); + for (int i = 1; i <= kHalfLength; i++) { + coeff[i] = coeff[i] / (dxyz * dxyz); + } + + std::cout << "Grid Sizes: " << n1 - 2 * kHalfLength << " " + << n2 - 2 * kHalfLength << " " << n3 - 2 * kHalfLength << "\n"; + std::cout << "Memory Usage: " << ((3 * nsize * sizeof(float)) / (1024 * 1024)) + << " MB\n"; + + // Check if running OpenMP OR Serial version on CPU + if (omp) { +#if defined(_OPENMP) + std::cout << " ***** Running OpenMP variant *****\n"; +#else + std::cout << " ***** Running C++ Serial variant *****\n"; +#endif + + // Initialize arrays and introduce initial conditions (source) + Initialize(prev_base, next_base, vel_base, n1, n2, n3); + + // Start timer + dpc_common::TimeInterval t_ser; + // Invoke the driver function to perform 3D wave propogation + // using OpenMP/Serial version + Iso3dfd(next_base, prev_base, vel_base, coeff, n1, n2, n3, num_iterations, + n1_block, n2_block, n3_block); + + // End timer + PrintStats(t_ser.Elapsed() * 1e3, n1, n2, n3, num_iterations); + } + + // Check if running both OpenMP/Serial and DPC++ version + // Keeping a copy of output buffer from OpenMP version + // for comparison + if (omp && sycl) { + temp = new float[nsize]; + if (num_iterations % 2) + memcpy(temp, next_base, nsize * sizeof(float)); + else + memcpy(temp, prev_base, nsize * sizeof(float)); + } + + // Check if running DPC++/SYCL version + if (sycl) { + std::cout << " ***** Running SYCL variant *****\n"; + // Initialize arrays and introduce initial conditions (source) + Initialize(prev_base, next_base, vel_base, n1, n2, n3); + + // Initializing a string pattern to allow a custom device selector + // pick a SYCL device as per user's preference and available devices + // Default value of pattern is set to CPU + std::string pattern("CPU"); + std::string pattern_gpu("Gen"); + + // Replacing the pattern string to Gen if running on a GPU + if (is_gpu) { + pattern.replace(0, 3, pattern_gpu); + } + + // Create a custom device selector using DPC++ device selector class + MyDeviceSelector device_sel(pattern); + + // Create a device queue using DPC++ class queue with a custom + // device selector + queue q(device_sel, dpc_common::exception_handler); + + // Validate if the block sizes selected are + // within range for the selected SYCL device + if (CheckBlockDimension(q, n1_block, n2_block)) { + Usage(argv[0]); + return 1; + } + + // Start timer + dpc_common::TimeInterval t_dpc; + + // Invoke the driver function to perform 3D wave propogation + // using DPC++ version on the selected SYCL device + Iso3dfdDevice(q, next_base, prev_base, vel_base, coeff, n1, n2, n3, + n1_block, n2_block, n3_block, n3 - kHalfLength, + num_iterations); + // Wait for the commands to complete. Enforce synchronization on the command + // queue + q.wait_and_throw(); + + // End timer + PrintStats(t_dpc.Elapsed() * 1e3, n1, n2, n3, num_iterations); + } + + // If running both OpenMP/Serial and DPC++ version + // Comparing results + if (omp && sycl) { + if (num_iterations % 2) { + error = WithinEpsilon(next_base, temp, n1, n2, n3, kHalfLength, 0, 0.1f); + } else { + error = WithinEpsilon(prev_base, temp, n1, n2, n3, kHalfLength, 0, 0.1f); + } + if (error) { + std::cout << "Final wavefields from SYCL device and CPU are not " + << "equivalent: Fail\n"; + } else { + std::cout << "Final wavefields from SYCL device and CPU are equivalent:" + << " Success\n"; + } + std::cout << "--------------------------------------\n"; + delete[] temp; + } + + delete[] prev_base; + delete[] next_base; + delete[] vel_base; + + return error ? 1 : 0; +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp new file mode 100644 index 0000000000..1b7bdec23e --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/iso3dfd_kernels.cpp @@ -0,0 +1,419 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= +// ISO3DFD: Intel® oneAPI DPC++ Language Basics Using 3D-Finite-Difference-Wave +// Propagation +// +// ISO3DFD is a finite difference stencil kernel for solving the 3D acoustic +// isotropic wave equation which can be used as a proxy for propogating a +// seismic wave. Kernels in this sample are implemented as 16th order in space, +// with symmetric coefficients, and 2nd order in time scheme without boundary +// conditions.. Using Data Parallel C++, the sample can explicitly run on the +// GPU and/or CPU to propagate a seismic wave which is a compute intensive task. +// If successful, the output will print the device name +// where the DPC++ code ran along with the grid computation metrics - flops +// and effective throughput. +// +// For comprehensive instructions regarding DPC++ Programming, go to +// https://software.intel.com/en-us/oneapi-programming-guide +// and search based on relevant terms noted in the comments. +// +// DPC++ material used in this code sample: +// +// DPC++ Queues (including device selectors and exception handlers) +// DPC++ Custom device selector +// DPC++ Buffers and accessors (communicate data between the host and the +// device) +// DPC++ Kernels (including parallel_for function and nd-range<3> +// objects) +// Shared Local Memory (SLM) optimizations (DPC++) +// DPC++ Basic synchronization (barrier function) +// +#include "iso3dfd.h" + +/* + * Device-Code - Optimized for GPU + * SYCL implementation for single iteration of iso3dfd kernel + * using shared local memory optimizations + * + * ND-Range kernel is used to spawn work-items in x, y dimension + * Each work-item then traverses in the z-dimension + * + * z-dimension slicing can be used to vary the total number + * global work-items. + * + * SLM Padding can be used to eliminate SLM bank conflicts if + * there are any + */ +void Iso3dfdIterationSLM(sycl::nd_item<3> it, float *next, float *prev, + float *vel, const float *coeff, float *tab, size_t nx, + size_t nxy, size_t bx, size_t by, size_t z_offset, + int full_end_z) { + // Compute local-id for each work-item + auto id0 = it.get_local_id(2); + auto id1 = it.get_local_id(1); + + // Compute the position in local memory each work-item + // will fetch data from global memory into shared + // local memory + auto stride = it.get_local_range(2) + 2 * kHalfLength + kPad; + auto identifiant = (id0 + kHalfLength) + (id1 + kHalfLength) * stride; + + // We compute the start and the end position in the grid + // for each work-item. + // Each work-items local value gid is updated to track the + // current cell/grid point it is working with. + // This position is calculated with the help of slice-ID and number of + // grid points each work-item will process. + // Offset of kHalfLength is also used to account for HALO + auto begin_z = it.get_global_id(0) * z_offset + kHalfLength; + auto end_z = begin_z + z_offset; + if (end_z > full_end_z) end_z = full_end_z; + + auto gid = (it.get_global_id(2) + bx) + ((it.get_global_id(1) + by) * nx) + + (begin_z * nxy); + + // front and back temporary arrays are used to ensure + // the grid values in z-dimension are read once, shifted in + // these array and re-used multiple times before being discarded + // + // This is an optimization technique to enable data-reuse and + // improve overall FLOPS to BYTES read ratio + float front[kHalfLength + 1]; + float back[kHalfLength]; + float c[kHalfLength + 1]; + + for (auto iter = 0; iter < kHalfLength; iter++) { + front[iter] = prev[gid + iter * nxy]; + } + c[0] = coeff[0]; + + for (auto iter = 1; iter <= kHalfLength; iter++) { + back[iter - 1] = prev[gid - iter * nxy]; + c[iter] = coeff[iter]; + } + + // Shared Local Memory (SLM) optimizations (DPC++) + // Set some flags to indicate if the current work-item + // should read from global memory to shared local memory buffer + // or not + auto items_x = it.get_local_range(2); + auto items_y = it.get_local_range(1); + + bool copy_halo_y = false, copy_halo_x = false; + if (id1 < kHalfLength) copy_halo_y = true; + if (id0 < kHalfLength) copy_halo_x = true; + + for (auto i = begin_z; i < end_z; i++) { + // Shared Local Memory (SLM) optimizations (DPC++) + // If work-item is flagged to read into SLM buffer + if (copy_halo_y) { + tab[identifiant - kHalfLength * stride] = prev[gid - kHalfLength * nx]; + tab[identifiant + items_y * stride] = prev[gid + items_y * nx]; + } + if (copy_halo_x) { + tab[identifiant - kHalfLength] = prev[gid - kHalfLength]; + tab[identifiant + items_x] = prev[gid + items_x]; + } + tab[identifiant] = front[0]; + + // DPC++ Basic synchronization (barrier function) + // Force synchronization within a work-group + // using barrier function to ensure + // all the work-items have completed reading into the SLM buffer + it.barrier(access::fence_space::local_space); + + // Only one new data-point read from global memory + // in z-dimension (depth) + front[kHalfLength] = prev[gid + kHalfLength * nxy]; + + // Stencil code to update grid point at position given by global id (gid) + // New time step for grid point is computed based on the values of the + // the immediate neighbors - horizontal, vertical and depth + // directions(kHalfLength number of points in each direction), + // as well as the value of grid point at a previous time step + // + // Neighbors in the depth (z-dimension) are read out of + // front and back arrays + // Neighbors in the horizontal and vertical (x, y dimension) are + // read from the SLM buffers + float value = c[0] * front[0]; +#pragma unroll(kHalfLength) + for (auto iter = 1; iter <= kHalfLength; iter++) { + value += c[iter] * + (front[iter] + back[iter - 1] + tab[identifiant + iter] + + tab[identifiant - iter] + tab[identifiant + iter * stride] + + tab[identifiant - iter * stride]); + } + next[gid] = 2.0f * front[0] - next[gid] + value * vel[gid]; + + // Update the gid to advance in the z-dimension + gid += nxy; + + // Input data in front and back are shifted to discard the + // oldest value and read one new value. + for (auto iter = kHalfLength - 1; iter > 0; iter--) { + back[iter] = back[iter - 1]; + } + back[0] = front[0]; + + for (auto iter = 0; iter < kHalfLength; iter++) { + front[iter] = front[iter + 1]; + } + + // DPC++ Basic synchronization (barrier function) + // Force synchronization within a work-group + // using barrier function to ensure that SLM buffers + // are not overwritten by next set of work-items + // (highly unlikely but not impossible) + it.barrier(access::fence_space::local_space); + } +} + +/* + * Device-Code - Optimized for GPU, CPU + * SYCL implementation for single iteration of iso3dfd kernel + * without using any shared local memory optimizations + * + * + * ND-Range kernel is used to spawn work-items in x, y dimension + * Each work-item can then traverse in the z-dimension + * + * z-dimension slicing can be used to vary the total number + * global work-items. + * + */ +void Iso3dfdIterationGlobal(sycl::nd_item<3> it, float *next, float *prev, + float *vel, const float *coeff, int nx, int nxy, + int bx, int by, int z_offset, int full_end_z) { + // We compute the start and the end position in the grid + // for each work-item. + // Each work-items local value gid is updated to track the + // current cell/grid point it is working with. + // This position is calculated with the help of slice-ID and number of + // grid points each work-item will process. + // Offset of kHalfLength is also used to account for HALO + auto begin_z = it.get_global_id(0) * z_offset + kHalfLength; + auto end_z = begin_z + z_offset; + if (end_z > full_end_z) end_z = full_end_z; + + auto gid = (it.get_global_id(2) + bx) + ((it.get_global_id(1) + by) * nx) + + (begin_z * nxy); + + // front and back temporary arrays are used to ensure + // the grid values in z-dimension are read once, shifted in + // these array and re-used multiple times before being discarded + // + // This is an optimization technique to enable data-reuse and + // improve overall FLOPS to BYTES read ratio + float front[kHalfLength + 1]; + float back[kHalfLength]; + float c[kHalfLength + 1]; + + for (auto iter = 0; iter <= kHalfLength; iter++) { + front[iter] = prev[gid + iter * nxy]; + } + c[0] = coeff[0]; + for (auto iter = 1; iter <= kHalfLength; iter++) { + c[iter] = coeff[iter]; + back[iter - 1] = prev[gid - iter * nxy]; + } + + // Stencil code to update grid point at position given by global id (gid) + // New time step for grid point is computed based on the values of the + // the immediate neighbors - horizontal, vertical and depth + // directions(kHalfLength number of points in each direction), + // as well as the value of grid point at a previous time step + + float value = c[0] * front[0]; +#pragma unroll(kHalfLength) + for (auto iter = 1; iter <= kHalfLength; iter++) { + value += c[iter] * + (front[iter] + back[iter - 1] + prev[gid + iter] + + prev[gid - iter] + prev[gid + iter * nx] + prev[gid - iter * nx]); + } + next[gid] = 2.0f * front[0] - next[gid] + value * vel[gid]; + + // Update the gid and position in z-dimension and check if there + // is more work to do + gid += nxy; + begin_z++; + + while (begin_z < end_z) { + // Input data in front and back are shifted to discard the + // oldest value and read one new value. + for (auto iter = kHalfLength - 1; iter > 0; iter--) { + back[iter] = back[iter - 1]; + } + back[0] = front[0]; + + for (auto iter = 0; iter < kHalfLength; iter++) { + front[iter] = front[iter + 1]; + } + + // Only one new data-point read from global memory + // in z-dimension (depth) + front[kHalfLength] = prev[gid + kHalfLength * nxy]; + + // Stencil code to update grid point at position given by global id (gid) + float value = c[0] * front[0]; +#pragma unroll(kHalfLength) + for (auto iter = 1; iter <= kHalfLength; iter++) { + value += c[iter] * (front[iter] + back[iter - 1] + prev[gid + iter] + + prev[gid - iter] + prev[gid + iter * nx] + + prev[gid - iter * nx]); + } + + next[gid] = 2.0f * front[0] - next[gid] + value * vel[gid]; + + gid += nxy; + begin_z++; + } +} + +/* + * Host-side SYCL Code + * + * Driver function for ISO3DFD SYCL code + * Uses ptr_next and ptr_prev as ping-pong buffers to achieve + * accelerated wave propogation + * + * This function uses SYCL buffers to facilitate host to device + * buffer copies + * + */ + +bool Iso3dfdDevice(sycl::queue &q, float *ptr_next, float *ptr_prev, + float *ptr_vel, float *ptr_coeff, size_t n1, size_t n2, + size_t n3, size_t n1_block, size_t n2_block, size_t n3_block, + size_t end_z, unsigned int nIterations) { + auto nx = n1; + auto nxy = n1 * n2; + + auto bx = kHalfLength; + auto by = kHalfLength; + + // Display information about the selected device + PrintTargetInfo(q, n1_block, n2_block); + + auto sizeTotal = nxy * n3; + + { // Begin buffer scope + // Create buffers using DPC++ class buffer + buffer b_ptr_next(ptr_next, sizeTotal); + buffer b_ptr_prev(ptr_prev, sizeTotal); + buffer b_ptr_vel(ptr_vel, sizeTotal); + buffer b_ptr_coeff(ptr_coeff, kHalfLength + 1); + + // Iterate over time steps + for (auto i = 0; i < nIterations; i += 1) { + // Submit command group for execution + q.submit([&](auto &h) { + // Create accessors + auto next = b_ptr_next.get_access(h); + auto prev = b_ptr_prev.get_access(h); + auto vel = b_ptr_vel.get_access(h); + auto coeff = b_ptr_coeff.get_access(h); + + // Define local and global range + + // Define local ND range of work-items + // Size of each DPC++ work-group selected here is a product of + // n2_block and n1_block which can be controlled by the input + // command line arguments + auto local_nd_range = range(1, n2_block, n1_block); + + // Define global ND range of work-items + // Size of total number of work-items is selected based on the + // total grid size in first and second dimensions (XY-plane) + // + // Each of the work-item then works on computing + // one or more grid points. This value can be controlled by the + // input command line argument n3_block + // + // Effectively this implementation enables slicing of the full + // grid into smaller grid slices which can be computed in parallel + // to allow auto-scaling of the total number of work-items + // spawned to achieve full occupancy for small or larger accelerator + // devices + auto global_nd_range = + range((n3 - 2 * kHalfLength) / n3_block, (n2 - 2 * kHalfLength), + (n1 - 2 * kHalfLength)); + +#ifdef USE_SHARED + // Using 3D-stencil kernel with Shared Local Memory (SLM) + // optimizations (DPC++) to improve effective FLOPS to BYTES + // ratio. By default, SLM code path is disabled in this + // code sample. + // SLM code path can be enabled by recompiling the DPC++ source + // as follows: + // cmake -DSHARED_KERNEL=1 .. + // make -j`nproc` + + // Define a range for SLM Buffer + // Padding can be used to avoid SLM bank conflicts + // By default padding is disabled in the sample code + auto local_range = range((n1_block + (2 * kHalfLength) + kPad) * + (n2_block + (2 * kHalfLength))); + + // Create an accessor for SLM buffer + accessor tab( + local_range, h); + + // Send a DPC++ kernel (lambda) for parallel execution + // The function that executes a single iteration is called + // "Iso3dfdIterationSLM" + // alternating the 'next' and 'prev' parameters which effectively + // swaps their content at every iteration. + if (i % 2 == 0) + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationSLM(it, next.get_pointer(), prev.get_pointer(), + vel.get_pointer(), coeff.get_pointer(), + tab.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); + else + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationSLM(it, prev.get_pointer(), next.get_pointer(), + vel.get_pointer(), coeff.get_pointer(), + tab.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); + +#else + + // Use Global Memory version of the 3D-Stencil kernel. + // This code path is enabled by default + + // Send a DPC++ kernel (lambda) for parallel execution + // The function that executes a single iteration is called + // "Iso3dfdIterationGlobal" + // alternating the 'next' and 'prev' parameters which effectively + // swaps their content at every iteration. + if (i % 2 == 0) + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationGlobal(it, next.get_pointer(), + prev.get_pointer(), vel.get_pointer(), + coeff.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); + else + h.parallel_for( + nd_range(global_nd_range, local_nd_range), [=](nd_item<3> it) { + Iso3dfdIterationGlobal(it, prev.get_pointer(), + next.get_pointer(), vel.get_pointer(), + coeff.get_pointer(), nx, nxy, bx, by, + n3_block, end_z); + }); +#endif + }); + } + } // end buffer scope + return true; +} diff --git a/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp new file mode 100644 index 0000000000..680fca2674 --- /dev/null +++ b/DirectProgramming/DPC++/StructuredGrids/iso3dfd_dpcpp/src/utils.cpp @@ -0,0 +1,165 @@ +//============================================================== +// Copyright © 2020 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include "iso3dfd.h" + +/* + * Host-Code + * Utility function to validate grid and block dimensions + */ +bool CheckGridDimension(size_t n1, size_t n2, size_t n3, unsigned int dim_x, + unsigned int dim_y, unsigned int block_z) { + if (n1 % dim_x) { + std::cout << " ERROR: Invalid Grid Size: n1 should be multiple of DIMX - " + << dim_x << "\n"; + return true; + } + if (n2 % dim_y) { + std::cout << " ERROR: Invalid Grid Size: n2 should be multiple of DIMY - " + << dim_y << "\n"; + return true; + } + if (n3 % block_z) { + std::cout << " ERROR: Invalid Grid Size: n3 should be multiple of BLOCKZ - " + << block_z << "\n"; + return true; + } + + return false; +} + +/* + * Host-Code + * Utility function to validate block sizes + */ +bool CheckBlockDimension(sycl::queue& q, unsigned int dim_x, + unsigned int dim_y) { + auto device = q.get_device(); + auto max_block_size = + device.get_info(); + + if ((max_block_size > 1) && (dim_x * dim_y > max_block_size)) { + std::cout << "ERROR: Invalid block sizes: n1_Tblock * n2_Tblock should be " + "less than or equal to " + << max_block_size << "\n"; + return true; + } + + return false; +} + +/* + * Host-Code + * Utility function to print device info + */ +void PrintTargetInfo(sycl::queue& q, unsigned int dim_x, unsigned int dim_y) { + auto device = q.get_device(); + auto max_block_size = + device.get_info(); + + auto max_exec_unit_count = + device.get_info(); + + std::cout << " Running on " << device.get_info() + << "\n"; + std::cout << " The Device Max Work Group Size is : " << max_block_size + << "\n"; + std::cout << " The Device Max EUCount is : " << max_exec_unit_count << "\n"; + std::cout << " The blockSize x is : " << dim_x << "\n"; + std::cout << " The blockSize y is : " << dim_y << "\n"; +#ifdef USE_SHARED + std::cout << " Using Shared Local Memory Kernel\n"; +#else + std::cout << " Using Global Memory Kernel\n"; + +#endif +} + +/* + * Host-Code + * Utility function to get input arguments + */ +void Usage(const std::string& programName) { + std::cout << " Incorrect parameters \n"; + std::cout << " Usage: "; + std::cout << programName + << " n1 n2 n3 b1 b2 b3 Iterations [omp|sycl] [gpu|cpu] \n\n"; + std::cout << " n1 n2 n3 : Grid sizes for the stencil \n"; + std::cout << " b1 b2 b3 : cache block sizes for cpu openmp version.\n"; + std::cout << " Iterations : No. of timesteps. \n"; + std::cout << " [omp|sycl] : Optional: Run the OpenMP or the SYCL variant." + << " Default is to use both for validation \n"; + std::cout + << " [gpu|cpu] : Optional: Device to run the SYCL version" + << " Default is to use the GPU if available, if not fallback to CPU \n\n"; +} + +/* + * Host-Code + * Utility function to print stats + */ +void PrintStats(double time, size_t n1, size_t n2, size_t n3, + unsigned int nIterations) { + float throughput_mpoints = 0.0f, mflops = 0.0f, normalized_time = 0.0f; + double mbytes = 0.0f; + + normalized_time = (double)time / nIterations; + throughput_mpoints = ((n1 - 2 * kHalfLength) * (n2 - 2 * kHalfLength) * + (n3 - 2 * kHalfLength)) / + (normalized_time * 1e3f); + mflops = (7.0f * kHalfLength + 5.0f) * throughput_mpoints; + mbytes = 12.0f * throughput_mpoints; + + std::cout << "--------------------------------------\n"; + std::cout << "time : " << time / 1e3f << " secs\n"; + std::cout << "throughput : " << throughput_mpoints << " Mpts/s\n"; + std::cout << "flops : " << mflops / 1e3f << " GFlops\n"; + std::cout << "bytes : " << mbytes / 1e3f << " GBytes/s\n"; + std::cout << "\n--------------------------------------\n"; + std::cout << "\n--------------------------------------\n"; +} + +/* + * Host-Code + * Utility function to calculate L2-norm between resulting buffer and reference + * buffer + */ +bool WithinEpsilon(float* output, float* reference, const size_t dim_x, + const size_t dim_y, const size_t dim_z, + const unsigned int radius, const int zadjust = 0, + const float delta = 0.01f) { + std::ofstream error_file; + error_file.open("error_diff.txt"); + + bool error = false; + double norm2 = 0; + + for (size_t iz = 0; iz < dim_z; iz++) { + for (size_t iy = 0; iy < dim_y; iy++) { + for (size_t ix = 0; ix < dim_x; ix++) { + if (ix >= radius && ix < (dim_x - radius) && iy >= radius && + iy < (dim_y - radius) && iz >= radius && + iz < (dim_z - radius + zadjust)) { + float difference = fabsf(*reference - *output); + norm2 += difference * difference; + if (difference > delta) { + error = true; + error_file << " ERROR: " << ix << ", " << iy << ", " << iz << " " + << *output << " instead of " << *reference + << " (|e|=" << difference << ")\n"; + } + } + ++output; + ++reference; + } + } + } + + error_file.close(); + norm2 = sqrt(norm2); + if (error) std::cout << "error (Euclidean norm): " << norm2 << "\n"; + return error; +} diff --git a/Libraries/oneDPL/gamma-correction/CMakeLists.txt b/Libraries/oneDPL/gamma-correction/CMakeLists.txt new file mode 100644 index 0000000000..415e459500 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/CMakeLists.txt @@ -0,0 +1,12 @@ +set(CMAKE_CXX_COMPILER "dpcpp") +# Set default build type to RelWithDebInfo if not specified +if (NOT CMAKE_BUILD_TYPE) + message (STATUS "Default CMAKE_BUILD_TYPE not set using Release with Debug Info") + set (CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE + STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +cmake_minimum_required (VERSION 3.0) +project(gamma_correction LANGUAGES CXX) +add_subdirectory (src) diff --git a/Libraries/oneDPL/gamma-correction/License.txt b/Libraries/oneDPL/gamma-correction/License.txt new file mode 100644 index 0000000000..8f608e972a --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/License.txt @@ -0,0 +1,8 @@ +Copyright 2019 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + diff --git a/Libraries/oneDPL/gamma-correction/README.md b/Libraries/oneDPL/gamma-correction/README.md new file mode 100644 index 0000000000..718412cc40 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/README.md @@ -0,0 +1,70 @@ +# Parallel STL 'Gamma Correction' Sample +Gamma correction is a nonlinear operation used to encode and decode the luminance of each pixel of an image. This sample demonstrates use of Parallel STL algorithms from Intel® oneAPI DPC++ Library (oneDPL) to facilitate offload to devices. + +| Optimized for | Description | +|---------------------------------|----------------------------------------------------------------------------------| +| OS | Linux* Ubuntu* 18.04, Windows 10 | +| Hardware | Skylake with GEN9 or newer | +| Software | Intel® oneAPI DPC++ Compiler beta; Intel® oneAPI DPC++ Library (oneDPL) | +| What you will learn | How to offload the computation to GPU using Intel® oneAPI DPC++ Library | +| Time to complete | At most 5 minutes | + +## Purpose + +Gamma correction uses nonlinear operations to encode and decode the luminance of each pixel of an image. See https://en.wikipedia.org/wiki/Gamma_correction for more information. +It does so by creating a fractal image in memory and performs gamma correction on it with `gamma=2`. +A device policy is created and passed to the `std::for_each` Parallel STL algorithm. +This example demonstrates how to use Parallel STL algorithms, Parallel STL is a component of Intel® oneAPI DPC++ library (oneDPL). + +Parallel STL is an implementation of the C++ standard library algorithms with support for execution policies, as specified in ISO/IEC 14882:2017 standard, commonly called C++17. The implementation also supports the unsequenced execution policy specified in the final draft for the C++ 20 standard (N4860). + +Parallel STL offers efficient support for both parallel and vectorized execution of algorithms for Intel® processors. For sequential execution, it relies on an available implementation of the C++ standard library. The implementation also supports the unsequenced execution policy specified in the final draft for the next version of the C++ standard and DPC++ execution policy specified in the oneDPL Spec (https://spec.oneapi.com/versions/latest/elements/oneDPL/source/pstl.html). + +## Key Implementation Details + +`std::for_each` Parallel STL algorithms are used in the code. + +## License + +This code sample is licensed under MIT license. + +## Building the 'Gamma Correction' Program for CPU and GPU + +### Running Samples In DevCloud +If running a sample in the Intel DevCloud, remember that you must specify the compute node (CPU, GPU, FPGA) as well whether to run in batch or interactive mode. For more information see the Intel(R) oneAPI Base Toolkit Get Started Guide (https://devcloud.intel.com/oneapi/get-started/base-toolkit/) + +### On a Linux* System +Perform the following steps: + +1. Build the program using the following `cmake` commands. +``` + $ mkdir build + $ cd build + $ cmake .. + $ make +``` + +2. Run the program: +``` + $ make run +``` + +3. Clean the program using: +``` + $ make clean +``` + +### On a Windows* System Using Visual Studio* Version 2017 or Newer +* Build the program using VS2017 or VS2019. Right click on the solution file and open using either VS2017 or VS2019 IDE. Right click on the project in Solution explorer and select Rebuild. From top menu select Debug -> Start without Debugging. +* Build the program using MSBuild. Open "x64 Native Tools Command Prompt for VS2017" or "x64 Native Tools Command Prompt for VS2019". Run - MSBuild gamma-correction.sln /t:Rebuild /p:Configuration="Release" + +## Running the Sample +### Example of Output + +The output of the example application is a BMP image with corrected luminance. Original image is created by the program. +``` +success +Run on Intel(R) Gen9 +Original image is in the fractal_original.bmp file +Image after applying gamma correction on the device is in the fractal_gamma.bmp file +``` diff --git a/Libraries/oneDPL/gamma-correction/gamma-correction.sln b/Libraries/oneDPL/gamma-correction/gamma-correction.sln new file mode 100644 index 0000000000..4af5eecb16 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/gamma-correction.sln @@ -0,0 +1,25 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.852 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gamma-correction", "gamma-correction.vcxproj", "{79DE2E7B-C214-4962-A20A-B4D680835D62}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {79DE2E7B-C214-4962-A20A-B4D680835D62}.Debug|x64.ActiveCfg = Debug|x64 + {79DE2E7B-C214-4962-A20A-B4D680835D62}.Debug|x64.Build.0 = Debug|x64 + {79DE2E7B-C214-4962-A20A-B4D680835D62}.Release|x64.ActiveCfg = Release|x64 + {79DE2E7B-C214-4962-A20A-B4D680835D62}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {244849C1-825C-4CB7-BC9F-018082AE1225} + EndGlobalSection +EndGlobal diff --git a/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj b/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj new file mode 100644 index 0000000000..65557f1e15 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj @@ -0,0 +1,179 @@ + + + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + 15.0 + {79de2e7b-c214-4962-a20a-b4d680835d62} + Win32Proj + gamma_correction + $(WindowsSDKVersion.Replace("\","")) + + + + Application + true + oneAPI Data Parallel C++ Compiler + Unicode + + + Application + false + oneAPI Data Parallel C++ Compiler + true + Unicode + + + Application + true + Intel(R) oneAPI DPC++ Compiler + Unicode + + + Application + false + Intel(R) oneAPI DPC++ Compiler + true + Unicode + + + + + + + + + + + + + + + + + + + + + true + + + true + C:\Program Files %28x86%29\inteloneapi\tbb\2021.1-beta02\redist\intel64\vc14;$(ExecutablePath) + C:\Program Files %28x86%29\inteloneapi\tbb\2021.1-beta02\include;C:\Program Files %28x86%29\inteloneapi\compiler\2021.1-beta02\windows\include;$(IncludePath) + C:\Program Files %28x86%29\inteloneapi\tbb\2021.1-beta02\lib\intel64\vc14;$(LibraryPath) + + + false + + + false + C:\Program Files %28x86%29\inteloneapi\tbb\2021.1-beta02\include;C:\Program Files %28x86%29\inteloneapi\compiler\2021.1-beta02\windows\include;$(IncludePath) + C:\Program Files %28x86%29\inteloneapi\tbb\2021.1-beta02\redist\intel64\vc14;$(ExecutablePath) + C:\Program Files %28x86%29\inteloneapi\tbb\2021.1-beta02\lib\intel64\vc14;$(LibraryPath) + + + + Use + Level3 + Disabled + true + true + pch.h + + + Console + true + + + + + Use + Level3 + Disabled + true + true + pch.h + DisableAllWarnings + $(ONEAPI_ROOT)\tbb\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + /link /libpath:"$(ONEAPI_ROOT)/tbb/latest/lib/intel64/vc14" %(AdditionalOptions) + + + copy /y "$(ONEAPI_ROOT)\tbb\latest\redist\intel64\vc14\tbb.dll" "$(SolutionDir)$(Platform)\$(Configuration)\" + + + copy tbb.dll to binary folder + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + + + Console + true + true + true + + + + + Use + Level3 + MaxSpeed + true + true + true + true + pch.h + DisableAllWarnings + $(ONEAPI_ROOT)\tbb\latest\include;%(AdditionalIncludeDirectories) + + + Console + true + true + true + /link /libpath:"$(ONEAPI_ROOT)/tbb/latest/lib/intel64/vc14" %(AdditionalOptions) + + + copy /y "$(ONEAPI_ROOT)\tbb\latest\redist\intel64\vc14\tbb.dll" "$(SolutionDir)$(Platform)\$(Configuration)\" + + + copy tbb.dll to binary folder + + + + + + \ No newline at end of file diff --git a/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj.filters b/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj.filters new file mode 100644 index 0000000000..b989fc6291 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj.filters @@ -0,0 +1,45 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + {a188ffd5-6536-4f5e-89ce-605f5c22d477} + + + + + Source Files + + + + + Header Files + + + Header Files\utils + + + Header Files\utils + + + Header Files\utils + + + Header Files\utils + + + Header Files\utils + + + \ No newline at end of file diff --git a/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj.user b/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj.user new file mode 100644 index 0000000000..a7871a057b --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/gamma-correction.vcxproj.user @@ -0,0 +1,11 @@ + + + + $(LocalDebuggerEnvironment) + WindowsLocalDebugger + + + $(LocalDebuggerEnvironment) + WindowsLocalDebugger + + \ No newline at end of file diff --git a/Libraries/oneDPL/gamma-correction/sample.json b/Libraries/oneDPL/gamma-correction/sample.json new file mode 100644 index 0000000000..e2d46465e3 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/sample.json @@ -0,0 +1,33 @@ +{ + "name": "Gamma Correction", + "categories": ["Toolkit/Intel® oneAPI Base Toolkit/oneAPI DPC++ Compiler/oneAPI DPC++ Library/CPU and GPU"], + "description": "gamma correction - a nonlinear operation used to encode and decode the luminance of each image pixel.", + "toolchain": ["dpcpp"], + "languages": [{"cpp":{}}], + "targetDevice": ["CPU", "GPU"], + "os": ["linux", "windows"], + "builder": ["ide", "cmake"], + "guid": "E249AFB8-6580-4CC3-8EF8-5D33C9A41A2C", + "ciTests": { + "linux": [ + { + "steps": [ + "mkdir -p build", + "cd build", + "cmake ..", + "make", + "make run" + ] + } + ], + "windows": [ + { + "steps": [ + "MSBuild gamma-correction.sln /t:Rebuild /p:Configuration=\"Release\"", + "cd x64\\Release", + "gamma-correction.exe" + ] + } + ] + } +} diff --git a/Libraries/oneDPL/gamma-correction/src/CMakeLists.txt b/Libraries/oneDPL/gamma-correction/src/CMakeLists.txt new file mode 100644 index 0000000000..eeadfd839f --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/src/CMakeLists.txt @@ -0,0 +1,12 @@ +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE RelWithDebInfo) +endif() + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") +set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") + +# Add an executable target from source files +add_executable(${PROJECT_NAME} main.cpp) + +# Add custom target for running +add_custom_target(run ./${PROJECT_NAME}) diff --git a/Libraries/oneDPL/gamma-correction/src/main.cpp b/Libraries/oneDPL/gamma-correction/src/main.cpp new file mode 100644 index 0000000000..ca732ca14c --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/src/main.cpp @@ -0,0 +1,92 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#include +#include + +#include + +#include +#include +#include + +#include "utils.hpp" + +using namespace sycl; +using namespace std; + +int main() { + // Image size is width x height + int width = 2560; + int height = 1600; + + Img image{width, height}; + ImgFractal fractal{width, height}; + + // Lambda to process image with gamma = 2 + auto gamma_f = [](ImgPixel &pixel) { + auto v = (0.3f * pixel.r + 0.59f * pixel.g + 0.11f * pixel.b) / 255.0f; + + auto gamma_pixel = static_cast(255 * v * v); + if (gamma_pixel > 255) gamma_pixel = 255; + pixel.set(gamma_pixel, gamma_pixel, gamma_pixel, gamma_pixel); + }; + + // fill image with created fractal + int index = 0; + image.fill([&index, width, &fractal](ImgPixel &pixel) { + int x = index % width; + int y = index / width; + + auto fractal_pixel = fractal(x, y); + if (fractal_pixel < 0) fractal_pixel = 0; + if (fractal_pixel > 255) fractal_pixel = 255; + pixel.set(fractal_pixel, fractal_pixel, fractal_pixel, fractal_pixel); + + ++index; + }); + + string original_image = "fractal_original.bmp"; + string processed_image = "fractal_gamma.bmp"; + Img image2 = image; + image.write(original_image); + + // call standard serial function for correctness check + image.fill(gamma_f); + + // use default policy for algorithms execution + auto policy = oneapi::dpl::execution::dpcpp_default; + // We need to have the scope to have data in image2 after buffer's destruction + { + // create a buffer, being responsible for moving data around and counting + // dependencies + buffer b(image2.data(), image2.width() * image2.height()); + + // create iterator to pass buffer to the algorithm + auto b_begin = oneapi::dpl::begin(b); + auto b_end = oneapi::dpl::end(b); + + // call std::for_each with DPC++ support + std::for_each(policy, b_begin, b_end, gamma_f); + } + + image2.write(processed_image); + // check correctness + if (check(image.begin(), image.end(), image2.begin())) { + cout << "success\n"; + } else { + cout << "fail\n"; + return 1; + } + cout << "Run on " + << policy.queue().get_device().template get_info() + << "\n"; + cout << "Original image is in " << original_image << "\n"; + cout << "Image after applying gamma correction on the device is in " + << processed_image << "\n"; + + return 0; +} diff --git a/Libraries/oneDPL/gamma-correction/src/utils.hpp b/Libraries/oneDPL/gamma-correction/src/utils.hpp new file mode 100644 index 0000000000..3042d46400 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/src/utils.hpp @@ -0,0 +1,17 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef _GAMMA_UTILS_HPP +#define _GAMMA_UTILS_HPP + +#include "utils/Img.hpp" +#include "utils/ImgAlgorithm.hpp" +#include "utils/ImgFormat.hpp" +#include "utils/ImgPixel.hpp" + +#include "utils/Other.hpp" + +#endif // _GAMMA_UTILS_HPP diff --git a/Libraries/oneDPL/gamma-correction/src/utils/Img.hpp b/Libraries/oneDPL/gamma-correction/src/utils/Img.hpp new file mode 100644 index 0000000000..bb02d09736 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/src/utils/Img.hpp @@ -0,0 +1,205 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef _GAMMA_UTILS_IMG_HPP +#define _GAMMA_UTILS_IMG_HPP + +#include "ImgPixel.hpp" + +#include +#include +#include +#include + +using namespace std; + +// Image class definition +template +class Img { + private: + Format _format; + int32_t _width; + int32_t _height; + vector _pixels; + + using Iterator = vector::iterator; + using ConstIterator = vector::const_iterator; + + public: + ///////////////////// + // SPECIAL METHODS // + ///////////////////// + + Img(int32_t width, int32_t height); + + void reset(int32_t width, int32_t height); + + /////////////// + // ITERATORS // + /////////////// + + Iterator begin() noexcept; + Iterator end() noexcept; + ConstIterator begin() const noexcept; + ConstIterator end() const noexcept; + ConstIterator cbegin() const noexcept; + ConstIterator cend() const noexcept; + + ///////////// + // GETTERS // + ///////////// + + int32_t width() const noexcept; + int32_t height() const noexcept; + + ImgPixel const* data() const noexcept; + ImgPixel* data() noexcept; + + /////////////////// + // FUNCTIONALITY // + /////////////////// + + void write(string const& filename) const; + + template + void fill(Functor f); + void fill(ImgPixel pixel); + void fill(ImgPixel pixel, int32_t row, int32_t col); +}; + +/////////////////////////////////////////////// +// IMG CLASS IMPLEMENTATION: SPECIAL METHODS // +/////////////////////////////////////////////// + +template +Img::Img(int32_t width, int32_t height) : _format(width, height) { + _pixels.resize(width * height); + + _width = width; + _height = height; +} + +template +void Img::reset(int32_t width, int32_t height) { + _pixels.resize(width * height); + + _width = width; + _height = height; + + _format.reset(width, height); +} + +///////////////////////////////////////// +// IMG CLASS IMPLEMENTATION: ITERATORS // +///////////////////////////////////////// + +template +typename Img::Iterator Img::begin() noexcept { + return _pixels.begin(); +} + +template +typename Img::Iterator Img::end() noexcept { + return _pixels.end(); +} + +template +typename Img::ConstIterator Img::begin() const noexcept { + return _pixels.begin(); +} + +template +typename Img::ConstIterator Img::end() const noexcept { + return _pixels.end(); +} + +template +typename Img::ConstIterator Img::cbegin() const noexcept { + return _pixels.begin(); +} + +template +typename Img::ConstIterator Img::cend() const noexcept { + return _pixels.end(); +} + +/////////////////////////////////////// +// IMG CLASS IMPLEMENTATION: GETTERS // +/////////////////////////////////////// + +template +int32_t Img::width() const noexcept { + return _width; +} + +template +int32_t Img::height() const noexcept { + return _height; +} + +template +ImgPixel const* Img::data() const noexcept { + return _pixels.data(); +} + +template +ImgPixel* Img::data() noexcept { + return _pixels.data(); +} + +///////////////////////////////////////////// +// IMG CLASS IMPLEMENTATION: FUNCTIONALITY // +///////////////////////////////////////////// + +template +void Img::write(string const& filename) const { + if (_pixels.empty()) { + cerr << "Img::write:: image is empty\n"; + return; + } + + ofstream filestream(filename, ios::binary); + + _format.write(filestream, *this); +} + +template +template +void Img::fill(Functor f) { + if (_pixels.empty()) { + cerr << "Img::fill(Functor): image is empty\n"; + return; + } + + for (auto& pixel : _pixels) f(pixel); +} + +template +void Img::fill(ImgPixel pixel) { + if (_pixels.empty()) { + cerr << "Img::fill(ImgPixel): image is empty\n"; + return; + } + + fill(_pixels.begin(), _pixels.end(), pixel); +} + +template +void Img::fill(ImgPixel pixel, int row, int col) { + if (_pixels.empty()) { + cerr << "Img::fill(ImgPixel): image is empty\n"; + return; + } + + if (row >= _height || row < 0 || col >= _width || col < 0) { + cerr << "Img::fill(ImgPixel, int, int): out of range\n"; + return; + } + + _pixels.at(row * _width + col) = pixel; +} + +#endif // _GAMMA_UTILS_IMG_HPP diff --git a/Libraries/oneDPL/gamma-correction/src/utils/ImgAlgorithm.hpp b/Libraries/oneDPL/gamma-correction/src/utils/ImgAlgorithm.hpp new file mode 100644 index 0000000000..06cd25c198 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/src/utils/ImgAlgorithm.hpp @@ -0,0 +1,51 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef _GAMMA_UTILS_IMGALGORITHM_HPP +#define _GAMMA_UTILS_IMGALGORITHM_HPP + +#include +#include + +using namespace std; + +// struct to store fractal that image will fill from +class ImgFractal { + private: + const int32_t _width; + const int32_t _height; + + double _cx = -0.7436; + double _cy = 0.1319; + + double _magn = 2000000.0; + int _maxIterations = 1000; + + public: + ImgFractal(int32_t width, int32_t height) : _width(width), _height(height) {} + + double operator()(int32_t x, int32_t y) const { + double fx = (double(x) - double(_width) / 2) * (1 / _magn) + _cx; + double fy = (double(y) - double(_height) / 2) * (1 / _magn) + _cy; + + double res = 0; + double nx = 0; + double ny = 0; + double val = 0; + + for (int i = 0; nx * nx + ny * ny <= 4 && i < _maxIterations; ++i) { + val = nx * nx - ny * ny + fx; + ny = 2 * nx * ny + fy; + nx = val; + + res += exp(-sqrt(nx * nx + ny * ny)); + } + + return res; + } +}; + +#endif // _GAMMA_UTILS_IMGALGORITHM_HPP diff --git a/Libraries/oneDPL/gamma-correction/src/utils/ImgFormat.hpp b/Libraries/oneDPL/gamma-correction/src/utils/ImgFormat.hpp new file mode 100644 index 0000000000..2c827fe3b1 --- /dev/null +++ b/Libraries/oneDPL/gamma-correction/src/utils/ImgFormat.hpp @@ -0,0 +1,98 @@ +//============================================================== +// Copyright © 2019 Intel Corporation +// +// SPDX-License-Identifier: MIT +// ============================================================= + +#ifndef _GAMMA_UTILS_IMGFORMAT_HPP +#define _GAMMA_UTILS_IMGFORMAT_HPP + +#include "ImgPixel.hpp" + +#include + +using namespace std; + +namespace ImgFormat { + +// struct to store an image in BMP format +struct BMP { + private: + using FileHeader = struct { + // not from specification + // was added for alignemt + // store size of rest of the fields + uint16_t sizeRest; // file header size in bytes + + uint16_t type; + uint32_t size; // file size in bytes + uint32_t reserved; + uint32_t offBits; // cumulative header size in bytes + }; + + using InfoHeader = struct { + // from specification + // store size of rest of the fields + uint32_t size; // info header size in bytes + + int32_t width; // image width in pixels + int32_t height; // image height in pixels + uint16_t planes; + uint16_t bitCount; // color depth + uint32_t compression; // compression + uint32_t sizeImage; // image map size in bytes + int32_t xPelsPerMeter; // pixel per metre (y axis) + int32_t yPelsPerMeter; // pixel per metre (y axis) + uint32_t clrUsed; // color pallete (0 is default) + uint32_t clrImportant; + }; + + FileHeader _fileHeader; + InfoHeader _infoHeader; + + public: + BMP(int32_t width, int32_t height) noexcept { reset(width, height); } + + void reset(int32_t width, int32_t height) noexcept { + uint32_t padSize = (4 - (width * sizeof(ImgPixel)) % 4) % 4; + uint32_t mapSize = width * height * sizeof(ImgPixel) + height * padSize; + uint32_t allSize = mapSize + _fileHeader.sizeRest + _infoHeader.size; + + _fileHeader.sizeRest = 14; // file header size in bytes + _fileHeader.type = 0x4d42; + _fileHeader.size = allSize; // file size in bytes + _fileHeader.reserved = 0; + _fileHeader.offBits = 54; // sizeRest + size -> 14 + 40 -> 54 + + _infoHeader.size = 40; // info header size in bytes + _infoHeader.width = width; // image width in pixels + _infoHeader.height = height; // image height in pixels + _infoHeader.planes = 1; + _infoHeader.bitCount = 32; // color depth + _infoHeader.compression = 0; // compression + _infoHeader.sizeImage = mapSize; // image map size in bytes + _infoHeader.xPelsPerMeter = 0; // pixel per metre (x axis) + _infoHeader.yPelsPerMeter = 0; // pixel per metre (y axis) + _infoHeader.clrUsed = 0; // color pallete (0 is default) + _infoHeader.clrImportant = 0; + } + + template