diff --git a/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/License.txt b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/License.txt new file mode 100644 index 0000000000..6e9524bd74 --- /dev/null +++ b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/License.txt @@ -0,0 +1,7 @@ +Copyright 2020 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/Makefile b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/Makefile
new file mode 100644
index 0000000000..baefe44f8c
--- /dev/null
+++ b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/Makefile
@@ -0,0 +1,46 @@
+## =============================================================
+## Copyright © 2020 Intel Corporation
+##
+## SPDX-License-Identifier: MIT
+## =============================================================
+##
+##
+##******************************************************************************
+## Content:
+##
+## Build for openmp_sample
+##******************************************************************************
+
+FC = ifort
+
+## release, debug, run, debug_run and clean name actions, not files. Declaring
+## them phony keeps a stray file with one of those names from masking them.
+.PHONY: release debug run debug_run clean
+
+## Default goal: the optimized executable.
+release: openmp_sample.exe
+
+debug: openmp_sample_dbg.exe
+
+## DYLD_LIBRARY_PATH is exported in the same shell line that starts the program
+## (presumably so the loader can locate the compiler runtime libraries).
+run: release ; @export DYLD_LIBRARY_PATH="$(LIBRARY_PATH)" ; ./openmp_sample.exe
+
+debug_run: debug ; @export DYLD_LIBRARY_PATH="$(LIBRARY_PATH)" ; ./openmp_sample_dbg.exe
+
+openmp_sample.exe: openmp_sample.o
+	$(FC) -O2 -fpp -qopenmp $^ -o $@
+
+openmp_sample_dbg.exe: openmp_sample_dbg.o
+	$(FC) -O0 -g -fpp -qopenmp $^ -o $@
+
+## NAME.o is the optimized object built from src/NAME.f90.
+%.o: src/%.f90
+	$(FC) -O2 -c -fpp -qopenmp -o $@ $<
+
+## NAME_dbg.o is the unoptimized, debuggable object built from the same source.
+%_dbg.o: src/%.f90
+	$(FC) -O0 -g -c -fpp -qopenmp -o $@ $<
+
+clean:
+	/bin/rm -f core.* *.o *.exe
diff --git a/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/README.md b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/README.md
new file mode 100644
index 0000000000..b8b7de039e
--- /dev/null
+++ b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/README.md
@@ -0,0 +1,98 @@
+# `OpenMP Primes`
+This sample is designed to illustrate how to use
+the OpenMP* API with the Intel® Fortran Compiler.
+
+This program finds all primes in the first 40,000,000 integers,
+the number of 4n+1 primes, and the number of 4n-1 primes in the same range.
+It illustrates two OpenMP* directives to help speed up the code. 
+ + +| Optimized for | Description +|:--- |:--- +| OS | macOS* with Xcode* installed +| Software | Intel® oneAPI Intel Fortran Compiler (Beta) +| What you will learn | How to build and run a Fortran OpenMP application using Intel Fortran compiler +| Time to complete | 10 minutes + +## Purpose + +This program finds all primes in the first 40,000,000 integers, the number of 4n+1 primes, +and the number of 4n-1 primes in the same range. It illustrates two OpenMP* directives +to help speed up the code. + +First, a dynamic schedule clause is used with the OpenMP* for directive. +Because the DO loop's workload increases as its index gets bigger, +the default static scheduling does not work well. Instead, dynamic scheduling +is used to account for the increasing workload. +But dynamic scheduling itself has more overhead than static scheduling, +so a chunk size of 10 is used to reduce the overhead for dynamic scheduling. + +Second, a reduction clause is used instead of an OpenMP* critical directive +to eliminate lock overhead. A critical directive would cause excessive lock overhead +due to the one-thread-at-time update of the shared variables each time through the DO loop. +Instead the reduction clause causes only one update of the shared variables once at the end of the loop. + +The sample can be compiled unoptimized (-O0 ), or at any level of +optimization (-O1 through -O3 ). In addition, the following compiler options are needed. + +The option -qopenmp enables compiler recognition of OpenMP* directives. +This option can also be omitted, in which case the generated executable will be a serial program. + +The option -fpp enables the Fortran preprocessor. +Read the Intel® Fortran Compiler Documentation for more information about these options. + +## Key Implementation Details +The Intel® oneAPI Intel Fortran Compiler (Beta) includes all libraries and headers necessary to compile and run OpenMP* enabled Fortran applications. 
Users simply use the -qopenmp compiler option to compile and link their OpenMP enabled applications. + +## License +This code sample is licensed under MIT license + +## Building the `Fortran OpenMP*` sample + +### Experiment 1: Unoptimized build and run +* Build openmp_samples + + cd openmp_samples + make clean + make debug + + * Run the program + + make debug_run + + * What did you see? + + Did the debug, unoptimized code run slower? + +### Experiment 2: Default Optimized build and run + + * Build openmp_samples + + make + * Run the program + + make run + +### Experiment 3: Controlling number of threads +By default an OpenMP application creates and uses as many threads as there are "processors" in a system. Here a "processor" means a logical processor; on hyperthreaded cores there are twice as many logical processors as physical cores. + +OpenMP uses the environment variable 'OMP_NUM_THREADS' to set the number of threads to use. Try this! + + export OMP_NUM_THREADS=1 + make run +Note the number of threads reported by the application. Now try 2 threads: + + export OMP_NUM_THREADS=2 + make run +Did this make the application run faster? Experiment with the number of threads and see how it affects performance. + +### Clean up + * Clean the program + make clean + +## Further Reading +Interested in learning more? 
We have a wealth of information +on using OpenMP with the Intel Fortran Compiler in our +[OpenMP section of Developer Guide and Reference][1] + +[1]: https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-developer-guide-and-reference/top/optimization-and-programming-guide/openmp-support.html "Developer Guide and Reference" diff --git a/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/sample.json b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/sample.json new file mode 100644 index 0000000000..67c356fe59 --- /dev/null +++ b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/sample.json @@ -0,0 +1,30 @@ +{ + "name": "openmp-primes", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "Fortran Tutorial - Using OpenMP", + "toolchain": [ "ifort" ], + "languages": [ { "fortran": {} } ], + "targetDevice": [ "CPU" ], + "os": [ "darwin" ], + "builder": [ "make" ], + "ciTests":{ + "darwin": [ + { + "id": "fort_release_cpu", + "steps": [ + "make release", + "make run", + "make clean" + ] + }, + { + "id": "fort_debug_cpu", + "steps": [ + "make debug", + "make debug_run", + "make clean" + ] + } + ] + } +} diff --git a/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/src/openmp_sample.f90 b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/src/openmp_sample.f90 new file mode 100644 index 0000000000..fb0a9ebacf --- /dev/null +++ b/DirectProgramming/Fortran/CombinationalLogic/openmp-primes/src/openmp_sample.f90 @@ -0,0 +1,117 @@ +! ============================================================== +! Copyright © 2020 Intel Corporation +! +! SPDX-License-Identifier: MIT +! ============================================================= +! +! [DESCRIPTION] +! This code finds all primes in the first 40,000,000 integers, the number of +! 4n+1 primes, and the number of 4n-1 primes in the same range. +! +! This source illustrates two OpenMP directives to help speed up +! the code. 
First, a dynamic "schedule" clause is used with the OpenMP "do"
+! directive. Because the "do" loop's workload increases as its index
+! gets bigger, the default "static" scheduling does not work well.
+! Instead dynamic scheduling is used to account for the increasing
+! workload. But dynamic scheduling itself has more overhead than
+! static scheduling, so a "chunk size" of 10 is used to reduce the
+! overhead for dynamic scheduling. Second, a "reduction" clause is
+! used instead of an OpenMP "critical" directive to eliminate lock overhead.
+! A "critical" directive would cause excessive lock overhead due to
+! the one-thread-at-time update of the shared variables each
+! time through the "do" loop. Instead the reduction clause causes only
+! one update of the shared variables once at the end of the loop.
+!
+! [COMPILE]
+! Use the following compiler options to compile both multi- and
+! single-threaded versions.
+!
+! Parallel compilation:
+!
+! Windows*: /Qopenmp /fpp
+!
+! Linux* and macOS*: -qopenmp -fpp
+!
+! Serial compilation:
+!
+! Use the same command, but omit the -qopenmp (Linux* and macOS*)
+! or /Qopenmp (Windows) option.
+!
+
+program ompPrime
+implicit none
+#ifdef _OPENMP
+ include 'omp_lib.h' !needed for OMP_GET_NUM_THREADS()
+#endif
+
+integer :: start = 1
+integer :: end = 40000000
+integer :: number_of_primes = 0
+integer :: number_of_41primes = 0
+integer :: number_of_43primes = 0
+integer :: index, factor, limit, nthr
+logical :: prime, print_primes
+
+print_primes = .false.
+nthr = 1 ! assume just one thread
+print *, ' Range to check for Primes:',start,end
+! The loop below visits odd candidates only, so count the prime 2 here.
+if (start .le. 2 .and. end .ge. 2) number_of_primes = 1
+
+#ifdef _OPENMP
+!$omp parallel
+
+!$omp single
+ nthr = OMP_GET_NUM_THREADS()
+ print *, ' We are using',nthr,' thread(s)'
+!$omp end single
+! In free-form source every continuation line of a directive must repeat
+! the sentinel; without it the clauses are parsed as Fortran statements.
+!$omp do private(factor, limit, prime) &
+!$omp schedule(dynamic,10) &
+!$omp reduction(+:number_of_primes,number_of_41primes,number_of_43primes)
+#else
+ print *, ' We are using',nthr,' thread(s)'
+#endif
+
+do index = start, end, 2 !workshared loop
+
+  limit = int(sqrt(real(index)))
+  prime = index .gt. 1 ! 1 is not prime; other odd numbers start as candidates
+  factor = 3
+
+  do
+    if(prime .and. factor .le. limit) then
+      if(mod(index,factor) .eq. 0) then
+        prime = .false.
+      endif
+      factor = factor + 2
+    else
+      exit ! we can jump out of non-workshared loop
+    endif
+  enddo
+
+  if(prime) then
+    if(print_primes) then
+      print *, index, ' is prime'
+    endif
+
+    number_of_primes = number_of_primes + 1
+
+    if(mod(index,4) .eq. 1) then
+      number_of_41primes = number_of_41primes + 1
+    endif
+
+    if(mod(index,4) .eq. 3) then
+      number_of_43primes = number_of_43primes + 1
+    endif
+
+  endif ! if(prime)
+enddo
+!$omp end do
+!$omp end parallel
+
+print *, ' Number of primes found:',number_of_primes
+print *, ' Number of 4n+1 primes found:',number_of_41primes
+print *, ' Number of 4n-1 primes found:',number_of_43primes
+end program ompPrime
diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/License.txt b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/License.txt
new file mode 100644
index 0000000000..6e9524bd74
--- /dev/null
+++ b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/License.txt
@@ -0,0 +1,7 @@
+Copyright 2020 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial 
portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/Makefile b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/Makefile
new file mode 100644
index 0000000000..2960bf3f8f
--- /dev/null
+++ b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/Makefile
@@ -0,0 +1,43 @@
+## =============================================================
+## Copyright © 2020 Intel Corporation
+##
+## SPDX-License-Identifier: MIT
+## =============================================================
+##
+##
+##******************************************************************************
+## Content:
+##
+## Build for optimize_sample
+##******************************************************************************
+#
+# >>>>> SET OPTIMIZATION LEVEL BELOW <<<<<
+#
+#Uncomment one of the following with which you wish to compile
+
+FC = ifort -O0
+#FC = ifort -O1
+#FC = ifort -O2
+#FC = ifort -O3
+
+OBJS = int_sin.o
+
+# all, run and clean name actions, not files.
+.PHONY: all run clean
+
+all: int_sin
+
+run: int_sin
+	./int_sin
+
+# Link step: uses the bare driver, so the -O level chosen in FC above is
+# applied only when compiling (pattern rule below).
+int_sin: $(OBJS)
+	ifort $^ -o $@
+
+%.o: src/%.f90
+	$(FC) -c $< -o $@
+
+clean:
+	/bin/rm -f core.* $(OBJS) int_sin
+
diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/README.md b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/README.md
new file mode 100644
index 0000000000..e576ecb8af
--- /dev/null
+++ b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/README.md
@@ -0,0 +1,194 @@
+# `Optimization Integral`
+
+This sample is designed to illustrate compiler 
optimization features and programming concepts. + +This program computes the integral (area under the curve) of a user-supplied function +over an interval in a stepwise fashion. +The interval is split into segments, and at each segment position the area of a rectangle +is computed whose height is the value of the function (here |sin x|) at that point and the width is the segment width. +The areas of the rectangles are then summed. + +The process is repeated with smaller and smaller width rectangles, +more closely approximating the true value. + +The source for this program also demonstrates recommended Fortran coding practices. + +| Optimized for | Description +|:--- |:--- +| OS | macOS* with Xcode* installed +| Software | Intel® oneAPI Intel® Fortran Compiler (Beta) +| What you will learn | Optimization using the Intel® Fortran compiler +| Time to complete | 15 minutes + +## Purpose + +The Intel® Fortran Compiler can optimize applications for performance. The primary compiler option is -O followed by a numeric optimization "level" from 0, which requests no optimization, to 3, which requests all compiler optimizations for the application. The -O optimization levels are: + + * O0 - No optimizations + * O1 - Enables optimizations for speed and disables some optimizations that increase code size and affect speed. + * O2 - Enables optimizations for speed. This is the generally recommended optimization level. Vectorization is enabled at O2 and higher levels. + * O3 - Performs O2 optimizations and enables more aggressive loop transformations such as Fusion, Block-Unroll-and-Jam, and collapsing IF statements. + +Read the [Intel® Fortran Compiler Developer Guide and Reference][1] +[1]: https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-developer-guide-and-reference/top.html "Intel® Fortran Compiler Developer Guide and Reference" +for more information about these options. 
+ +Some of these compiler optimizations use features and options that can +restrict program execution to specific architectures. + + +## License +This code sample is licensed under MIT license + +## Building the `Fortran Optimization` sample + +Use one of the following compiler options: + + +### macOS* : -O0, -O1, -O2, -O3 + +### STEP 1: Build and run with -O0 +cd optimize_samples + +Edit 'Makefile' using your favorite editor + +To set optimization level uncomment FC = ifort -O0 like this + + FC = ifort -O0 + #FC = ifort -O1 + #FC = ifort -O2 + #FC = ifort -O3 + * Build the executable with 'make' + + make + + * Run the program + + make run + + * Note the final run time (example) + CPU Time = 3.776983 seconds + + * Clean the files we built + + make clean + + +### STEP 2: Build and run with -O1 +Edit 'Makefile' using your favorite editor + +To set optimization level uncomment FC = ifort -O1 like this + + #FC = ifort -O0 + FC = ifort -O1 + #FC = ifort -O2 + #FC = ifort -O3 + * Build the executable with 'make' + + make + + * Run the program + + make run + + * Note the final run time (example) + CPU Time = 1.444569 seconds + + * Clean the files we built + + make clean + + +### STEP 3: Build and run with -O2 +Edit 'Makefile' using your favorite editor + +To set optimization level uncomment FC = ifort -O2 like this + + #FC = ifort -O0 + #FC = ifort -O1 + FC = ifort -O2 + #FC = ifort -O3 + * Build the executable with 'make' + + make + + * Run the program + + make run + + * Note the final run time (example) + CPU Time = 0.5143980 seconds + + * Clean the files we built + + make clean + +### STEP 4: Build and run with -O3 +Edit 'Makefile' using your favorite editor + +To set optimization level uncomment FC = ifort -O3 like this + + #FC = ifort -O0 + #FC = ifort -O1 + #FC = ifort -O2 + FC = ifort -O3 + * Build the executable with 'make' + + make + + * Run the program + + make run + + * Note the final run time (example) + CPU Time = 0.5133380 seconds + + * Clean the files we 
built + + make clean + +## What did we learn? +There are big jumps going from O0 to O1, and from O1 to O2. +But we see very little performance gain going from O2 to O3. +This does vary by application but generally with Intel® Compilers +O2 already enables most optimizations. Sometimes O3 can help, of course, +but generally O2 is sufficient for most applications. + +### Further Exploration +The Intel® Fortran Compiler has many options for optimization. +If you have a genuine Intel® Architecture processor, try these additional options + + edit 'Makefile' using your favorite editor. To set additional optimizations uncomment FC = ifort -O3 and add additional options shown: + + #FC = ifort -O0 + #FC = ifort -O1 + #FC = ifort -O2 + FC = ifort -O3 -xhost -align array64byte + * Build the executable with the new options -xhost -align array64byte + + make + + * Run the program + + make run + + * Note the final run time (example) + CPU Time = 0.2578490 seconds + + * Clean the program + + make clean + +There are 2 additional compiler options here that are worth mentioning: Read the online +[Developer Guide and Reference][3] for more information about +these options +[3]: https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-developer-guide-and-reference/top.html "Developer Guide and Reference" + 1. -xhost (sub option of -x option): [-x][4] + [4]: https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-developer-guide-and-reference/top/compiler-reference/compiler-options/compiler-option-details/code-generation-options/x-qx.html "-x option" + 2. 
-align array64byte: [-align][5] + [5]: https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-developer-guide-and-reference/top/compiler-reference/compiler-options/compiler-option-details/data-options/align.html "-align option" + +### Clean up + * Clean the program + make clean + diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/sample.json b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/sample.json new file mode 100644 index 0000000000..1e0a458b35 --- /dev/null +++ b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/sample.json @@ -0,0 +1,22 @@ +{ + "name": "optimization-integral", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "Fortran Sample - Simple Compiler Optimizations", + "toolchain": [ "ifort" ], + "languages": [ { "fortran": {} } ], + "targetDevice": [ "CPU" ], + "os": [ "darwin" ], + "builder": [ "make" ], + "ciTests":{ + "darwin": [ + { + "id": "fort_optsample_cpu", + "steps": [ + "make", + "make run", + "make clean" + ] + } + ] + } +} diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/src/int_sin.f90 b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/src/int_sin.f90 new file mode 100644 index 0000000000..d1519820f3 --- /dev/null +++ b/DirectProgramming/Fortran/DenseLinearAlgebra/optimize-integral/src/int_sin.f90 @@ -0,0 +1,96 @@ + ! ============================================================== + ! Copyright © 2020 Intel Corporation + ! + ! SPDX-License-Identifier: MIT + ! ============================================================= + ! + ! [DESCRIPTION] + ! This program computes the integral (area under the curve) of a user-supplied + ! function over an interval in a stepwise fashion. The interval is split into + ! segments, and at each segment position the area of a rectangle is computed + ! whose height is the value of sine at that point and the width is the segment + ! width. 
The areas of the rectangles are then summed.
+ !
+ ! The process is repeated with smaller and smaller width rectangles, more
+ ! closely approximating the true value.
+ !
+ ! The source for this program also demonstrates recommended Fortran
+ ! coding practices.
+ !
+ ! Compile the sample several times using different optimization options.
+ !
+ ! Read the Intel(R) Fortran Compiler Documentation for more information about these options.
+ !
+ ! Some of these automatic optimizations use features and options
+ ! that can restrict program execution to specific architectures.
+ !
+ ! [COMPILE]
+ ! Use one of the following compiler options:
+ !
+ ! Windows*: /O1, /O2, /O3
+ !
+ ! Linux* and macOS*: -O1, -O2, -O3
+ !
+
+program int_sin
+implicit none
+
+! Create a value DP that is the "kind" number of a double precision value
+! We will use this value in our declarations and constants.
+integer, parameter :: DP = kind(0.0D0)
+
+! Declare a named constant for pi, specifying the kind type
+real(DP), parameter :: pi = 3.141592653589793238_DP
+
+! Declare interval begin and end
+real(DP), parameter :: interval_begin = 0.0_DP
+real(DP), parameter :: interval_end = 2.0_DP * pi
+
+real(DP) :: step, area, x_i
+integer :: N, i, j
+real :: clock_start, clock_finish
+
+write (*,'(A)') " "
+write (*,'(A)') " Number of | Computed Integral |"
+write (*,'(A)') " Interior Points | |"
+call cpu_time (clock_start)
+
+do j=2,26
+  write (*,'(A)') "--------------------------------------"
+  N = 2**j
+  ! Width of each of the N equal segments (there are N-1 interior points)
+  step = (interval_end - interval_begin) / real(N,DP)
+
+  ! Left end point carries half weight: f(x0) * (step/2)
+  area = INTEG_FUNC(interval_begin) * (step / 2.0_DP)
+
+  do i=1,N-1
+    x_i = interval_begin + real(i,DP) * step
+    ! Composite trapezoidal rule: interior points carry full weight.
+    ! Measuring x_i from interval_begin keeps the sample points inside
+    ! the interval even when it does not start at zero.
+    area = area + (INTEG_FUNC(x_i) * step)
+  end do
+
+  ! Right end point carries half weight: f(xN) * (step/2)
+  area = area + (INTEG_FUNC(interval_end) * (step / 2.0_DP))
+
+  write (*,'(T5,I10,T18,"|",2X,1P,E14.7,T38,"|")') N, area
+end do
+
+call cpu_time(clock_finish)
+write (*,'(A)') "--------------------------------------"
+write (*,'(A)') " "
+write (*,*) "CPU Time = ",(clock_finish - clock_start), " seconds"
+
+contains
+
+! Function to integrate: |sin(x)|. It has no side effects, hence pure.
+pure real(DP) function INTEG_FUNC (x)
+  real(DP), intent(IN) :: x
+
+  INTEG_FUNC = abs(sin(x))
+  return
+end function INTEG_FUNC
+
+end program int_sin
diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/License.txt b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/License.txt
new file mode 100644
index 0000000000..6e9524bd74
--- /dev/null
+++ b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/License.txt
@@ -0,0 +1,7 @@
+Copyright 2020 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/Makefile b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/Makefile
new file mode 100644
index 0000000000..3641610d5f
--- /dev/null
+++ b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/Makefile
@@ -0,0 +1,34 @@
+## =============================================================
+## Copyright © 2020 Intel Corporation
+##
+## SPDX-License-Identifier: MIT
+## =============================================================
+##
+##
+##******************************************************************************
+## Content:
+##
+## Build for vec_sample
+##******************************************************************************
+#
+FC=ifort
+FFLAGS=-O2 -qopt-report-phase=vec -qopt-report=2
+OBJ=src/driver.o src/matvec.o
+
+# all, run and clean name actions, not files.
+.PHONY: all run clean
+
+all : matvec
+
+run : matvec
+	./matvec
+
+# Each object is compiled with the vectorization report flags in FFLAGS.
+src/%.o: src/%.f90
+	$(FC) $(FFLAGS) -c $< -o $@
+
+matvec: $(OBJ)
+	$(FC) -V $^ -o $@
+
+clean:
+	-rm -f matvec $(OBJ) src/*.optrpt
diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/README.md b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/README.md
new file mode 100644
index 0000000000..1e08cb7caf
--- /dev/null
+++ b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/README.md
@@ -0,0 +1,224 @@
+# `Vectorize VecMatMult`
+
+In this sample, you will use the auto-vectorizer to improve the performance
+of the sample application. You will compare the performance of the
+serial version and the version that was compiled with the auto-vectorizer. 
+ +| Optimized for | Description +|:--- |:--- +| OS | macOS* with Xcode* installed +| Hardware | Intel-based Mac* +| Software | Intel® oneAPI Intel Fortran Compiler (beta) +| What you will learn | Vectorization using Intel Fortran compiler +| Time to complete | 15 minutes + + +## Purpose +The Intel® Compiler has an auto-vectorizer that detects operations in the application +that can be done in parallel and converts sequential operations +to parallel operations by using the +Single Instruction Multiple Data (SIMD) instruction set. + +For the Intel® compiler, vectorization is the unrolling of a loop combined with the generation of packed SIMD instructions. Because the packed instructions operate on more than one data element at a time, the loop can execute more efficiently. It is sometimes referred to as auto-vectorization to emphasize that the compiler automatically identifies and optimizes suitable loops on its own. + +Intel® Advisor can assist with vectorization and show optimization report messages with your source code. See [Intel Advisor][1] for details. +[1]: https://software.intel.com/content/www/us/en/develop/tools/advisor.html "Intel Advisor" + +Vectorization may call library routines that can result in a greater performance gain on Intel microprocessors than on non-Intel microprocessors. The vectorization can also be affected by certain options, such as m or x. + +Vectorization is enabled with the compiler at optimization levels of O2 (default level) and higher for both Intel® microprocessors and non-Intel® microprocessors. Many loops are vectorized automatically, but in cases where this doesn't happen, you may be able to vectorize loops by making simple code modifications. In this sample, you will: + +1. establish a performance baseline + +2. generate a vectorization report + +3. improve performance by aligning data + +4. 
improve performance using Interprocedural Optimization + +## Key Implementation Details + +In this sample, you will use the following files: + + driver.f90 + + matvec.f90 + + +## License +This code sample is licensed under MIT license + + +## Building the `Fortran Vectorization` sample + +This sample contains 2 Fortran source files, in subdirectory 'src/' under the main sample root directory oneAPI-samples/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult + +1. matvec.f90 is a Fortran source file with a matrix-times-vector algorithm +2. driver.f90 is a Fortran source file with the main program calling matvec + +## Running the `Fortran Vectorization` sample + +### Step 1 Establishing a Performance Baseline + +To set a performance baseline for the improvements that follow in this sample, compile your sources from the src directory with these compiler options: + + ifort -real-size 64 -O1 matvec.f90 driver.f90 -o MatVector + +Execute 'MatVector' + + ./MatVector +and record the execution time reported in the output. This is the baseline against which subsequent improvements will be measured. + + +### Step 2 Generating a Vectorization Report + +A vectorization report shows what loops in your code were vectorized and explains why other loops were not vectorized. To generate a vectorization report, use the **qopt-report-phase=vec** compiler option together with **qopt-report=1** or **qopt-report=2**. + +Together with **qopt-report-phase=vec**, **qopt-report=1** generates a report with the loops in your code that were vectorized while **qopt-report-phase=vec** with **qopt-report=2** generates a report with both the loops in your code that were vectorized and the reason that other loops were not vectorized. + +Because vectorization is turned off with the **O1** option, the compiler does not generate a vectorization report. 
To generate a vectorization report, compile your project with the **O2**, **qopt-report-phase=vec**, **qopt-report=1** options: + + ifort -real-size 64 -O2 -qopt-report=1 -qopt-report-phase=vec matvec.f90 driver.f90 -o MatVector + +Recompile the program and then execute MatVector. Record the new execution time. The reduction in time is mostly due to auto-vectorization of the inner loop at line 32 noted in the vectorization report **matvec.optrpt** : + + Begin optimization report for: matvec_ + + Report from: Vector optimizations [vec] + + + LOOP BEGIN at matvec.f90(26,3) + remark #25460: No loop optimizations reported + + LOOP BEGIN at matvec.f90(26,3) + remark #15300: LOOP WAS VECTORIZED + LOOP END + + LOOP BEGIN at matvec.f90(26,3) + + LOOP END + LOOP END + + LOOP BEGIN at matvec.f90(27,3) + remark #25460: No loop optimizations reported + + LOOP BEGIN at matvec.f90(32,6) + + LOOP END + + LOOP BEGIN at matvec.f90(32,6) + remark #15300: LOOP WAS VECTORIZED + LOOP END + + LOOP BEGIN at matvec.f90(32,6) + + LOOP END + + LOOP BEGIN at matvec.f90(32,6) + + LOOP END + LOOP END + +Note + +Your line and column numbers may be different. + +**qopt-report=2** with **qopt-report-phase=vec,loop** returns a list that also includes loops that were not vectorized or multi-versioned, along with the reason that the compiler did not vectorize them or multi-version the loop. + +Recompile your project with the **qopt-report=2** and **qopt-report-phase=vec,loop** options. + + ifort -real-size 64 -O2 -qopt-report-phase=vec -qopt-report=2 matvec.f90 driver.f90 -o MatVector + +The vectorization report matvec.optrpt indicates that the loop at line 33 in matvec.f90 did not vectorize because it is not the innermost loop of the loop nest. 
+ + LOOP BEGIN at matvec.f90(27,3) + remark #15542: loop was not vectorized: inner loop was already vectorized + + LOOP BEGIN at matvec.f90(32,6) + + LOOP END + + LOOP BEGIN at matvec.f90(32,6) + remark #15300: LOOP WAS VECTORIZED + LOOP END + + LOOP BEGIN at matvec.f90(32,6) + + LOOP END + + LOOP BEGIN at matvec.f90(32,6) + + remark #15335: remainder loop was not vectorized: vectorization possible but seems inefficient. Use vector always directive or -vec-threshold0 to override + LOOP END + LOOP END + +Note: Your line and column numbers may be different. + +For more information on the **qopt-report** and **qopt-report-phase** compiler options, see the +[Compiler Options section][3] in the Intel® Fortran Compiler Developer Guide and Reference. + +[3]: https://software.intel.com/content/www/us/en/develop/documentation/fortran-compiler-developer-guide-and-reference/top/compiler-reference/compiler-options/alphabetical-list-of-compiler-options.html "Options" + + +### Step 3 Improving Performance by Aligning Data + +The vectorizer can generate faster code when operating on aligned data. In this activity you will improve the vectorizer performance by aligning the arrays a, b, and c in **driver.f90** on a 16-byte boundary so the vectorizer can use aligned load instructions for all arrays rather than the slower unaligned load instructions and can avoid runtime tests of alignment. Using the ALIGNED macro will insert an alignment directive for a, b, and c in driver.f90 with the following syntax: + + !dir$ attributes align : 16 :: a,b,c + +This instructs the compiler to create arrays that are aligned on a 16-byte boundary, which should facilitate the use of SSE aligned load instructions. + +In addition, the column height of the matrix a needs to be padded out to be a multiple of 16 bytes, so that each individual column of a maintains the same 16-byte alignment. 
In practice, maintaining a constant alignment between columns is much more important than aligning the start of the arrays. + +To derive the maximum benefit from this alignment, we also need to tell the vectorizer it can safely assume that the arrays in matvec.f90 are aligned by using the directive + + !dir$ vector aligned + +Note: If you use **!dir$ vector aligned**, you must be sure that all the arrays or subarrays in the loop are 16-byte aligned. Otherwise, you may get a runtime error. Aligning data may still give a performance benefit even if **!dir$ vector aligned** is not used. See the code under the ALIGNED macro in **matvec.f90**. + +If your compilation targets the Intel® AVX-512 instruction set, you should try to align data on a 64-byte boundary. This may result in improved performance. In this case, **!dir$ vector aligned** advises the compiler that the data is 64-byte aligned. + +Recompile the program after adding the ALIGNED macro to ensure consistently aligned data: + + ifort -real-size 64 -qopt-report=2 -qopt-report-phase=vec -D ALIGNED matvec.f90 driver.f90 -o MatVector + + +### Step 4 Improving Performance with Interprocedural Optimization + +The compiler may be able to perform additional optimizations if it is able to optimize across source line boundaries. These may include, but are not limited to, function inlining. This is enabled with the **-ipo** option. + +Recompile the program using the **-ipo** option to enable interprocedural optimization. + + ifort -real-size 64 -qopt-report=2 -qopt-report-phase=vec -D ALIGNED -ipo matvec.f90 driver.f90 -o MatVector + +Note that the vectorization messages now appear at the point of inlining in **driver.f90** (line 70) and this is found in the file **ipo_out.optrpt**. 
+ + LOOP BEGIN at driver.f90(73,16) + remark #15541: loop was not vectorized: inner loop was already vectorized + + LOOP BEGIN at matvec.f90(32,3) inlined into driver.f90(70,14) + remark #15398: loop was not vectorized: loop was transformed to memset or memcpy + LOOP END + + LOOP BEGIN at matvec.f90(33,3) inlined into driver.f90(70,14) + remark #15541: loop was not vectorized: inner loop was already vectorized + + LOOP BEGIN at matvec.f90(38,6) inlined into driver.f90(70,14) + remark #15399: vectorization support: unroll factor set to 4 + remark #15300: LOOP WAS VECTORIZED + LOOP END + LOOP END + LOOP END + + +Note: Your line and column numbers may be different. + +Now, run the executable and record the execution time. + +### Additional Exercises + +The previous examples made use of double precision arrays. They may be built instead with single precision arrays by changing the command-line option **-real-size 64** to **-real-size 32**. The non-vectorized versions of the loop execute only slightly faster than the double precision version; however, the vectorized versions are substantially faster. This is because a packed SIMD instruction operating on a 32-byte vector register operates on eight single precision data elements at once instead of four double precision data elements. + +Note: In the example with data alignment, you will need to set ROWBUF=3 to ensure 16-byte alignment for each column of the matrix a. Otherwise, the directive **!dir$ vector aligned** will cause the program to fail. + +This completes the sample that shows how the compiler can optimize performance with various vectorization techniques. 
+ diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/sample.json b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/sample.json new file mode 100644 index 0000000000..a573f6b037 --- /dev/null +++ b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/sample.json @@ -0,0 +1,22 @@ +{ + "name": "vectorize-vecmatmult", + "categories": [ "Toolkit/Intel® oneAPI HPC Toolkit" ], + "description": "Fortran Tutorial - Using Auto Vectorization", + "toolchain": [ "ifort" ], + "languages": [ { "fortran": {} } ], + "targetDevice": [ "CPU" ], + "os": [ "darwin" ], + "builder": [ "make" ], + "ciTests":{ + "darwin": [ + { + "id": "fort_vecsample_cpu", + "steps": [ + "make", + "make run", + "make clean" + ] + } + ] + } +} diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/src/driver.f90 b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/src/driver.f90 new file mode 100644 index 0000000000..d4e24e6d71 --- /dev/null +++ b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/src/driver.f90 @@ -0,0 +1,69 @@ +! ============================================================== +! Copyright © 2020 Intel Corporation +! +! SPDX-License-Identifier: MIT +! ============================================================= +! +! Part of the vec_samples tutorial. For information, please read +! Tutorial: Auto-vectorization in the Getting Started Tutorials document +! + + +program driver + implicit none + + integer, parameter :: ROW=101 + integer, parameter :: COL=101 + +! Using ROWBUF=3 makes each column of 'a' be aligned at 16-byte intervals by +! adding three elements of padding to each column. 
+ +!DIR$ IF DEFINED(ALIGNED) + integer, parameter :: ROWBUF=3 +!DIR$ ELSE + integer, parameter :: ROWBUF=0 +!DIR$ END IF + + integer, parameter :: TOTROW = ROW + ROWBUF + integer, parameter :: REPEATNTIMES = 1000000 + + integer :: i, j + integer :: size1=TOTROW, size2=COL + real, dimension(TOTROW,COL) :: a + real, dimension(COL) :: b + real, dimension(TOTROW) :: c + real :: sum + real(8) :: cptim1, cptim2 + +!DIR$ IF DEFINED(ALIGNED) +! aligning the start of each array is unimportant in this simple example. +! preserving the same alignment for all rows of the matrix is much more important. +!DIR$ attributes align : 32 :: a,b,c +!DIR$ ENDIF + +! initialize the matrix and vector + + a = reshape( (/((mod(i*j+1,10), i=0,size1-1), j=0,size2-1)/), & +& (/size1, size2/) ) + b = (/(mod(j+3,10), j=0,size2-1)/) + + if(ROWBUF.gt.0) a(ROW+1:TOTROW,:) = 0. + +! initialize timing + call cpu_time(cptim1) + +! just do it + do i=1,REPEATNTIMES + call matvec(size1, size2, a, b, c) +! this line so that each iteration is different, so that +! the compiler can't optimize away every iteration except one. + b(1) = b(1) + 0.000001 + enddo + +! print cpu time taken and a simple checksum +! (use a different timer for threaded programs) + call cpu_time(cptim2) + print '(''time taken '',f8.3,'' sum='',6pe20.12/)', & +& (cptim2 - cptim1), sum(c) + + end program driver diff --git a/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/src/matvec.f90 b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/src/matvec.f90 new file mode 100644 index 0000000000..56ab6da14f --- /dev/null +++ b/DirectProgramming/Fortran/DenseLinearAlgebra/vectorize-vecmatmult/src/matvec.f90 @@ -0,0 +1,30 @@ +! ============================================================== +! Copyright © 2020 Intel Corporation +! +! SPDX-License-Identifier: MIT +! ============================================================= +! +! Part of the vec_samples tutorial. For information, please read +! 
! Tutorial: Auto-vectorization in the Getting Started Tutorials document
!

! matvec: compute the matrix-times-vector product c = a * b.
!
! Arguments:
!   size1 (in)  - first extent of a and the number of elements of c
!   size2 (in)  - second extent of a and the number of elements of b
!   a     (in)  - matrix, dimension (size1,size2)
!   b     (in)  - vector, dimension (size2)
!   c     (out) - result vector, dimension (size1); it is zeroed and then
!                 fully recomputed on every call
!
! The reals are default kind on purpose: the tutorial selects the precision
! at compile time with the -real-size option.
!
! When the ALIGNED macro is defined at compile time, the inner loop is
! preceded by the "vector aligned" directive. The caller is then responsible
! for making sure the arrays really are aligned.
subroutine matvec(size1,size2,a,b,c)
  implicit none
  integer, intent(in)                       :: size1,size2
  real, dimension(size1,size2), intent(in)  :: a
  real, dimension(size2), intent(in)        :: b
  real, dimension(size1), intent(out)       :: c
  integer                                   :: i,j

  c=0.

  ! The outer loop walks the second index of a and the inner loop walks the
  ! first index, so the inner loop accesses a(:,j) and c(:) with unit stride.
  do j=1,size2

!DIR$ IF DEFINED(ALIGNED)
!DIR$ vector aligned
!DIR$ END IF
    do i=1,size1
      c(i) = c(i) + a(i,j) * b(j)
    enddo
  enddo

end subroutine matvec