28 changes: 28 additions & 0 deletions posts/gups/LICENSE
@@ -0,0 +1,28 @@
# SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
12 changes: 12 additions & 0 deletions posts/gups/LICENSE.gups.cu
@@ -0,0 +1,12 @@
Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

Copyright (c) 2012 NISHIMURA Ryohei.
Copyright (c) 2012 The University of Tennessee.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
· Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
· Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution.
· Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. in no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage.
95 changes: 95 additions & 0 deletions posts/gups/Makefile
@@ -0,0 +1,95 @@
# SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#

# The CUDA compiler.
CUDA_HOME ?= /usr/local/cuda

# The compiler.
CXX = $(CUDA_HOME)/bin/nvcc

# Optimization and Debugging
OPTFLAGS ?= -O3

# Set target GPU CC (only sm_80 and sm_90 are currently supported for STATIC_SHMEM)
GPU_ARCH ?= 80 90

# Default to using compile time NSHMEM
DYNAMIC_SHMEM ?= -DSTATIC_SHMEM

# Source files
SRC_FILES = gups.cu

# Object Files
OBJ_FILES = $(SRC_FILES:.cu=.o)

# CU flags
CU_FLAGS = -std=c++14 -Xcompiler -std=c++14 -lineinfo

CU_FLAGS += $(foreach cc,$(GPU_ARCH), \
--generate-code arch=compute_$(cc),code=sm_$(cc) )

# CXX flags
CXXFLAGS = $(OPTFLAGS) $(CU_FLAGS) -Xcompiler -Wall $(DYNAMIC_SHMEM)


LINKFLAGS = $(CXXFLAGS)


DEFAULT: gups

all = gups

gups: $(OBJ_FILES)

# Include the dependencies that were created by %.d rule.
#
ifneq ($(MAKECMDGOALS),clean)
-include $(SRC_FILES:.cu=.d)
endif
#

# Prepare file holding dependencies, to be included in this file.
#

%.d: %.cu Makefile
@set -e; rm -f $@; \
$(CXX) -DMAKE_DEPEND -M $(CXXFLAGS) $< > $@.$$$$; \
sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
rm -f $@.$$$$

%.o: %.cu Makefile
$(CXX) $(CXXFLAGS) -c $*.cu

$(all):%:
$(CXX) $(LINKFLAGS) -o $@ $^

clean:
rm -f $(OBJ_FILES) *.o *.d gups \
*.d.[0-9][0-9][0-9][0-9][0-9] *.d.[0-9][0-9][0-9][0-9] \
*.d.[0-9][0-9][0-9] *.d.[0-9][0-9][0-9][0-9][0-9][0-9] *~
70 changes: 70 additions & 0 deletions posts/gups/README.md
@@ -0,0 +1,70 @@
## GUPS Benchmark

### How to build the benchmark
Build with the Makefile using the following options:

`GPU_ARCH=xx` where `xx` is the Compute Capability of the device(s) being tested (default: 80 90). Users can check the CC of a specific GPU in the tables [here](https://developer.nvidia.com/cuda-gpus#compute). The generated executable (called `gups`) supports both global memory GUPS and shared memory GUPS modes; global memory mode is the default. Refer to the next section for the runtime option to switch between modes.

Notes on shared memory GUPS:
1. For shared memory GUPS, unless dynamic allocation is forced (see below), only CC 80 and CC 90 are supported; for any other CC, the shared memory GUPS code falls back to dynamic allocation mode.
2. To force dynamic shared memory allocation, build with `DYNAMIC_SHMEM=`. This is NOT recommended and will result in incorrect shared memory GUPS numbers, as the kernel becomes instruction bound.

For example, `make GPU_ARCH="70 80" DYNAMIC_SHMEM=` builds the executable `gups`, which supports global memory GUPS and shared memory GUPS with dynamic shared memory allocation, for both CC 70 (e.g., NVIDIA V100 GPU) and CC 80 (e.g., NVIDIA A100 GPU).
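
As an additional sketch, the default recommended build (static shared memory size, CC 80 and 90) and cleanup might look like the following; the Makefile defaults `CUDA_HOME` to `/usr/local/cuda`, and the alternate path shown here is purely illustrative.
```
# Default build: static shared memory size, targets CC 80 and CC 90
make

# Same build, pointing the Makefile at a non-default CUDA install (path is illustrative)
make CUDA_HOME=/opt/cuda GPU_ARCH="80 90"

# Remove objects, dependency files, and the gups executable
make clean
```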

### How to run the benchmark
Besides GUPS (updates (loop)), the benchmark code supports other random access tests: reads, writes, reads+writes, and updates (no loop).
You can choose the benchmark type with the `-t` runtime option. You may need to fine-tune the accesses-per-element option (`-a`) to achieve the best performance.
Note that correctness verification is only available for the default updates (loop) test.

You can use `./gups -h` to get the list of runtime arguments.
```
Usage:
-n <int> input data size = 2^n [default: 29]
-o <int> occupancy percentage, 100/occupancy how much larger the working set is compared to the requested bytes [default: 100]
-r <int> number of kernel repetitions [default: 1]
-a <int> number of random accesses per input element [default: 32 (r, w) or 8 (u, unl, rw) for gmem, 65536 for shmem]
-t <int> test type (0 - update (u), 1 - read (r), 2 - write (w), 3 - read write (rw), 4 - update no loop (unl)) [default: 0]
-d <int> device ID to use [default: 0]
-s <int> enable input in shared memory instead of global memory for shared memory GUPS benchmark if s>=0. The benchmark will use max available shared memory if s=0 (for ideal GUPS conditions this must be done at compile time, check README.md for build options). This tool does allow setting the shmem data size with = 2^s (for s>0), however this will also result in an instruction bound kernel that fails to reach hardware limitations of GUPS. [default: -1 (disabled)]
```
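
For illustration, a few example invocations using the options above (the sizes and repetition counts here are arbitrary starting points, not tuned recommendations):
```
# Default global memory GUPS (updates with loop), 2^29 input elements, device 0
./gups -n 29 -t 0 -d 0

# Global memory random reads, 3 kernel repetitions
./gups -n 29 -t 1 -r 3

# Shared memory GUPS using the maximum available shared memory
./gups -s 0
```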

You can also use the provided Python script to run multiple tests with a single command and get a CSV report. By default, the script runs all the random access tests. Run `python run.py --help` for the usage options.
```
usage: run.py [-h] [--device-id DEVICE_ID]
[--input-size-begin INPUT_SIZE_BEGIN]
[--input-size-end INPUT_SIZE_END] [--occupancy OCCUPANCY]
[--repeats REPEATS]
[--test {reads,writes,reads_writes,updates,updates_no_loop,all}]
[--memory-loc {global,shared}]

Benchmark GUPS. Store results in results.csv file.

optional arguments:
-h, --help show this help message and exit
--device-id DEVICE_ID
GPU ID to run the test
--input-size-begin INPUT_SIZE_BEGIN
exponent of the input data size begin range, base is 2
(input size = 2^n). [Default: 29 for global GUPS,
max_shmem for shared GUPS. Global/shared is controlled
by --memory-loc
--input-size-end INPUT_SIZE_END
exponent of the input data size end range, base is 2
(input size = 2^n). [Default: 29 for global GUPS,
max_shmem for shared GUPS. Global/shared is controlled
by --memory-loc
--occupancy OCCUPANCY
100/occupancy is how much larger the working set is
compared to the requested bytes
--repeats REPEATS number of kernel repetitions
--test {reads,writes,reads_writes,updates,updates_no_loop,all}
test to run
--memory-loc {global,shared}
memory buffer in global memory or shared memory
```
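
For example, two illustrative invocations of the script (flag values are examples only); results are written to `results.csv` as noted above:
```
# Run all random access tests on GPU 0 with the buffer in global memory
python run.py --device-id 0 --test all --memory-loc global

# Sweep input sizes 2^26 through 2^29 for the updates test, 5 repetitions each
python run.py --test updates --input-size-begin 26 --input-size-end 29 --repeats 5
```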

### LICENSE

`gups.cu` is a modified version of the `randomaccess.cu` file from [this GitHub repository](https://github.com/nattoheaven/cuda_randomaccess). The LICENSE file of that repository is preserved as `LICENSE.gups.cu`.

`run.py` and `Makefile` are implemented from scratch by NVIDIA. For the license information of these two files, please refer to the `LICENSE` file.