diff --git a/posts/gups/LICENSE b/posts/gups/LICENSE new file mode 100644 index 0000000..0666b9f --- /dev/null +++ b/posts/gups/LICENSE @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# \ No newline at end of file diff --git a/posts/gups/LICENSE.gups.cu b/posts/gups/LICENSE.gups.cu new file mode 100644 index 0000000..d41f0ab --- /dev/null +++ b/posts/gups/LICENSE.gups.cu @@ -0,0 +1,12 @@ +Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +Copyright (c) 2012 NISHIMURA Ryohei. +Copyright (c) 2012 The University of Tennessee. +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: +· Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. +· Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer listed in this license in the documentation and/or other materials provided with the distribution. +· Neither the name of the copyright holders nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +This software is provided by the copyright holders and contributors "as is" and any express or implied warranties, including, but not limited to, the implied warranties of merchantability and fitness for a particular purpose are disclaimed. 
in no event shall the copyright owner or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage. \ No newline at end of file diff --git a/posts/gups/Makefile b/posts/gups/Makefile new file mode 100644 index 0000000..f9cf6b4 --- /dev/null +++ b/posts/gups/Makefile @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +# The CUDA compiler. +CUDA_HOME ?= /usr/local/cuda + +# The compiler. +CXX = $(CUDA_HOME)/bin/nvcc + +# Optimization and Debugging +OPTFLAGS ?= -O3 + +# Set target GPU CC (only sm_80 and sm_90 are currently supported for STATIC_SHMEM) +GPU_ARCH ?= 80 90 + +# Default to using compile time NSHMEM +DYNAMIC_SHMEM ?= -DSTATIC_SHMEM + +# Source files +SRC_FILES = gups.cu + +# Object Files +OBJ_FILES = $(SRC_FILES:.cu=.o) + +# CU flags +CU_FLAGS = -std=c++14 -Xcompiler -std=c++14 -lineinfo + +CU_FLAGS += $(foreach cc,$(GPU_ARCH), \ + --generate-code arch=compute_$(cc),code=sm_$(cc) ) + +# CXX flags +CXXFLAGS = $(OPTFLAGS) $(CU_FLAGS) -Xcompiler -Wall $(DYNAMIC_SHMEM) + + +LINKFLAGS = $(CXXFLAGS) + + +DEFAULT: gups + +all = gups + +gups: $(OBJ_FILES) + +# Include the dependencies that were created by %.d rule. +# +ifneq ($(MAKECMDGOALS),clean) +-include $(SRC_FILES:.cu=.d) +endif +# + +# Prepare file holding dependencies, to be included in this file. 
+#
+
+%.d: %.cu Makefile
+	@set -e; rm -f $@; \
+	$(CXX) -DMAKE_DEPEND -M $(CXXFLAGS) $< > $@.$$$$; \
+	sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
+	rm -f $@.$$$$
+
+%.o: %.cu Makefile
+	$(CXX) $(CXXFLAGS) -c $*.cu
+
+$(all):%:
+	$(CXX) $(LINKFLAGS) -o $@ $^
+
+clean:
+	rm -f $(OBJ_FILES) *.o *.d gups \
+	*.d.[0-9][0-9][0-9][0-9][0-9] *.d.[0-9][0-9][0-9][0-9] \
+	*.d.[0-9][0-9][0-9] *.d.[0-9][0-9][0-9][0-9][0-9][0-9] *~
diff --git a/posts/gups/README.md b/posts/gups/README.md
new file mode 100644
index 0000000..5f8f75c
--- /dev/null
+++ b/posts/gups/README.md
@@ -0,0 +1,70 @@
+## GUPS Benchmark
+
+### How to build the benchmark
+Build with the Makefile using the following options:
+
+`GPU_ARCH=xx` where `xx` is the Compute Capability of the device(s) being tested (default: 80 90). Users can check the CC of a specific GPU using the tables [here](https://developer.nvidia.com/cuda-gpus#compute). The generated executable (called `gups`) supports both global memory GUPS and shared memory GUPS modes. Global memory mode is the default. Please refer to the next section for the runtime option to switch between modes.
+
+Notes on shared memory GUPS:
+1. For shared memory GUPS, unless dynamic allocation is forced (see below), only CC 80 and CC 90 are supported; for other CCs, the shared memory GUPS code falls back to dynamic allocation mode.
+2. To force dynamic shared memory allocation, build with `DYNAMIC_SHMEM=`. Note that this is NOT recommended and will result in incorrect shared memory GUPS numbers, as the kernel becomes instruction bound.
+
+For example, `make GPU_ARCH="70 80" DYNAMIC_SHMEM=` will build the executable `gups`, which supports global memory GUPS and shared memory GUPS with dynamic shared memory allocation, for both CC 70 (e.g., the NVIDIA V100 GPU) and CC 80 (e.g., the NVIDIA A100 GPU).
+
+### How to run the benchmark
+Note that besides GUPS (updates (loop)), our benchmark code supports other random access tests, including reads, writes, reads+writes, and updates (no loop).
+You can choose the benchmark type using the `-t` runtime option. Users may need to fine-tune the accesses-per-element option (`-a`) to achieve the best performance.
+Note that correctness verification is only available for the updates (loop)/default test.
+
+You can use `./gups -h` to get a list of runtime arguments.
+```
+Usage:
+  -n    input data size = 2^n [default: 29]
+  -o    occupancy percentage, 100/occupancy how much larger the working set is compared to the requested bytes [default: 100]
+  -r    number of kernel repetitions [default: 1]
+  -a    number of random accesses per input element [default: 32 (r, w) or 8 (u, unl, rw) for gmem, 65536 for shmem]
+  -t    test type (0 - update (u), 1 - read (r), 2 - write (w), 3 - read write (rw), 4 - update no loop (unl)) [default: 0]
+  -d    device ID to use [default: 0]
+  -s    enable input in shared memory instead of global memory for shared memory GUPS benchmark if s>=0. The benchmark will use max available shared memory if s=0 (for ideal GUPS conditions this must be done at compile time, check README.md for build options). This tool does allow setting the shmem data size with = 2^s (for s>0), however this will also result in an instruction bound kernel that fails to reach hardware limitations of GUPS. [default: -1 (disabled)]
+```
+
+You can also use the provided Python script to run multiple tests with a single command and get a CSV report. By default, the script runs all the random access tests. Run `python run.py --help` for the usage options.
+``` +usage: run.py [-h] [--device-id DEVICE_ID] + [--input-size-begin INPUT_SIZE_BEGIN] + [--input-size-end INPUT_SIZE_END] [--occupancy OCCUPANCY] + [--repeats REPEATS] + [--test {reads,writes,reads_writes,updates,updates_no_loop,all}] + [--memory-loc {global,shared}] + +Benchmark GUPS. Store results in results.csv file. + +optional arguments: + -h, --help show this help message and exit + --device-id DEVICE_ID + GPU ID to run the test + --input-size-begin INPUT_SIZE_BEGIN + exponent of the input data size begin range, base is 2 + (input size = 2^n). [Default: 29 for global GUPS, + max_shmem for shared GUPS. Global/shared is controlled + by --memory-loc + --input-size-end INPUT_SIZE_END + exponent of the input data size end range, base is 2 + (input size = 2^n). [Default: 29 for global GUPS, + max_shmem for shared GUPS. Global/shared is controlled + by --memory-loc + --occupancy OCCUPANCY + 100/occupancy is how much larger the working set is + compared to the requested bytes + --repeats REPEATS number of kernel repetitions + --test {reads,writes,reads_writes,updates,updates_no_loop,all} + test to run + --memory-loc {global,shared} + memory buffer in global memory or shared memory +``` + +### LICENSE + +`gups.cu` is modified based on `randomaccess.cu` file from [link to Github repository](https://github.com/nattoheaven/cuda_randomaccess). The LICENSE file of the Github repository is preserved as `LICENSE.gups.cu`. + +`run.py` and `Makefile` are implemented from scratch by NVIDIA. For the license information of these two files, please refer to the `LICENSE` file. \ No newline at end of file diff --git a/posts/gups/gups.cu b/posts/gups/gups.cu new file mode 100644 index 0000000..5636a32 --- /dev/null +++ b/posts/gups/gups.cu @@ -0,0 +1,598 @@ +// Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + +/* -*- mode: C; tab-width: 2; indent-tabs-mode: nil; -*- */ + +/* + * This code has been contributed by the DARPA HPCS program. Contact + * David Koester or Bob Lucas + * if you have questions. + * + * + * GUPS (Giga UPdates per Second) is a measurement that profiles the memory + * architecture of a system and is a measure of performance similar to MFLOPS. + * The HPCS HPCchallenge RandomAccess benchmark is intended to exercise the + * GUPS capability of a system, much like the LINPACK benchmark is intended to + * exercise the MFLOPS capability of a computer. In each case, we would + * expect these benchmarks to achieve close to the "peak" capability of the + * memory system. The extent of the similarities between RandomAccess and + * LINPACK are limited to both benchmarks attempting to calculate a peak system + * capability. + * + * GUPS is calculated by identifying the number of memory locations that can be + * randomly updated in one second, divided by 1 billion (1e9). The term "randomly" + * means that there is little relationship between one address to be updated and + * the next, except that they occur in the space of one half the total system + * memory. An update is a read-modify-write operation on a table of 64-bit words. + * An address is generated, the value at that address read from memory, modified + * by an integer operation (add, and, or, xor) with a literal value, and that + * new value is written back to memory. 
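+ *
+ * As a rough sketch (illustrative only; `Table` and `TableSize` are placeholder
+ * names, mirroring the update kernel later in this file), one update is:
+ *
+ *   ran = (ran << 1) ^ ((int64_t)ran < 0 ? POLY : 0);   // next pseudo-random value
+ *   Table[ran & (TableSize - 1)] ^= ran;                // read-modify-write via XOR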
+ * + * We are interested in knowing the GUPS performance of both entire systems and + * system subcomponents --- e.g., the GUPS rating of a distributed memory + * multiprocessor the GUPS rating of an SMP node, and the GUPS rating of a + * single processor. While there is typically a scaling of FLOPS with processor + * count, a similar phenomenon may not always occur for GUPS. + * + * Select the memory size to be the power of two such that 2^n <= 1/2 of the + * total memory. Each CPU operates on its own address stream, and the single + * table may be distributed among nodes. The distribution of memory to nodes + * is left to the implementer. A uniform data distribution may help balance + * the workload, while non-uniform data distributions may simplify the + * calculations that identify processor location by eliminating the requirement + * for integer divides. A small (less than 1%) percentage of missed updates + * are permitted. + * + * When implementing a benchmark that measures GUPS on a distributed memory + * multiprocessor system, it may be required to define constraints as to how + * far in the random address stream each node is permitted to "look ahead". + * Likewise, it may be required to define a constraint as to the number of + * update messages that can be stored before processing to permit multi-level + * parallelism for those systems that support such a paradigm. The limits on + * "look ahead" and "stored updates" are being implemented to assure that the + * benchmark meets the intent to profile memory architecture and not induce + * significant artificial data locality. For the purpose of measuring GUPS, + * we will stipulate that each thread is permitted to look ahead no more than + * 1024 random address stream samples with the same number of update messages + * stored before processing. + * + * The supplied MPI-1 code generates the input stream {A} on all processors + * and the global table has been distributed as uniformly as possible to + * balance the workload and minimize any Amdahl fraction. This code does not + * exploit "look-ahead". Addresses are sent to the appropriate processor + * where the table entry resides as soon as each address is calculated. + * Updates are performed as addresses are received. Each message is limited + * to a single 64 bit long integer containing element ai from {A}. + * Local offsets for T[ ] are extracted by the destination processor. + * + * If the number of processors is equal to a power of two, then the global + * table can be distributed equally over the processors. In addition, the + * processor number can be determined from that portion of the input stream + * that identifies the address into the global table by masking off log2(p) + * bits in the address. + * + * If the number of processors is not equal to a power of two, then the global + * table cannot be equally distributed between processors. In the MPI-1 + * implementation provided, there has been an attempt to minimize the differences + * in workloads and the largest difference in elements of T[ ] is one. The + * number of values in the input stream generated by each processor will be + * related to the number of global table entries on each processor. + * + * The MPI-1 version of RandomAccess treats the potential instance where the + * number of processors is a power of two as a special case, because of the + * significant simplifications possible because processor location and local + * offset can be determined by applying masks to the input stream values. 
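+ *
+ * For illustration (a sketch under the power-of-two assumption, with p = 2^k
+ * processors and a global table of 2^m words; variable names are placeholders):
+ *
+ *   which_pe     = (addr >> (m - k)) & (p - 1);        // log2(p) bits select the processor
+ *   local_offset = addr & ((1ULL << (m - k)) - 1);     // remaining bits index the local table
+ *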
+ * The non power of two case uses an integer division to determine the processor
+ * location. The integer division will be more costly in terms of machine
+ * cycles to perform than the bit masking operations.
+ *
+ * For additional information on the GUPS metric, the HPCchallenge RandomAccess
+ * Benchmark, and the rules to run RandomAccess or modify it to optimize
+ * performance -- see http://icl.cs.utk.edu/hpcc/
+ *
+ */
+
+// Headers used by this file: printf/fprintf, atoi, getopt, fixed-width integers, assert.
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#define DEFAULT_LOGN 29
+#define ACCESS_PER_ELEM_GMEM_U_UNL_RW 8
+#define ACCESS_PER_ELEM_GMEM_R_W 32
+#define ACCESS_PER_ELEM_SHMEM 65536
+#define NUM_THREADS_PER_BLOCK_GMEM 1024
+// Number of threads for shared memory GUPS may not be changed for best perf
+#define NUM_THREADS_PER_BLOCK_SHMEM 1024
+#define POLY 0x0000000000000007ULL
+
+// Switch to compile-time shared memory allocation otherwise collected numbers
+// from shmem are instruction bound (not real gups)
+#ifdef STATIC_SHMEM
+
+// Need to manually set correct value for compile time shmem allocation
+// Handled through makefile (currently only SM80 and SM90 are supported)
+// Must satisfy NSHMEM = prop.sharedMemPerBlockOptin / sizeof(benchtype);
+#if __CUDA_ARCH__ == 800
+#define NSHMEM 20864
+#elif __CUDA_ARCH__ == 900
+#define NSHMEM 29056
+#else
+// not supported, fall back to dynamic shmem allocation
+#undef STATIC_SHMEM
+#endif
+
+#endif
+
+union benchtype
+{
+  uint64_t data;
+};
+static __constant__ uint64_t c_m2[64];
+
+#ifndef CUDA_RT_CALL
+#define CUDA_RT_CALL(call) \
+  { \
+    cudaError_t cudaStatus = call; \
+    if (cudaSuccess != cudaStatus) \
+      fprintf( \
+          stderr, \
+          "ERROR: CUDA RT call \"%s\" in line %d of file %s failed with %s " \
+          "(%d).\n", \
+          #call, \
+          __LINE__, \
+          __FILE__, \
+          cudaGetErrorString(cudaStatus), \
+          cudaStatus); \
+  }
+#endif
+
+static __global__ void d_init(size_t n, benchtype* t)
+{
+  for (ptrdiff_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
+       i += gridDim.x * blockDim.x) {
+    t[i].data = i;
+  }
+}
+
+static __device__ uint64_t d_starts(size_t n)
+{
+  if (n == 0) {
+    return 1;
+  }
+
+  int i = 63 - __clzll(n);
+
+  uint64_t ran = 2;
+  while (i > 0) {
+    uint64_t temp = 0;
+    for (int j = 0; j < 64; j++) {
+      if ((ran >> j) & 1) {
+        temp ^= c_m2[j];
+      }
+    }
+    ran = temp;
+    i -= 1;
+    if ((n >> i) & 1) {
+      ran = (ran << 1) ^ ((int64_t)ran < 0 ? POLY : 0);
+    }
+  }
+
+  return ran;
+}
+
+enum test_t
+{
+  UPDATE = 0,
+  READ,
+  WRITE,
+  READ_WRITE,
+  UPDATE_NO_LOOP,
+};
+
+const char* test_name[] = {
+    "Gupdates/s ATOM.CAS.64(loop)",
+    "Greads/s LD.64",
+    "Gwrites/s STG.64",
+    "Greads+writes/s LDG.64+STG.64",
+    "Gupdates/s ATOM.CAS.64(no_loop)",
+};
+
+// T_TYPE selects the access pattern (read, write, read+write, or update).
+template <test_t T_TYPE>
+__global__ void
+d_bench(size_t n, size_t working_set, benchtype* t, int accesses_per_elem)
+{
+  size_t num_threads = gridDim.x * blockDim.x;
+  size_t thread_num = blockIdx.x * blockDim.x + threadIdx.x;
+  size_t start = thread_num * accesses_per_elem * n / num_threads;
+  size_t end = (thread_num + 1) * accesses_per_elem * n / num_threads;
+  benchtype ran;
+  ran.data = d_starts(start);
+
+#pragma unroll
+  for (ptrdiff_t i = start; i < end; ++i) {
+    ran.data = (ran.data << 1) ^ ((int64_t)ran.data < 0 ?
POLY : 0); + unsigned long long int *address, old, assumed; + address = (unsigned long long int*)&t[ran.data & (working_set - 1)].data; + switch (T_TYPE) { + case READ: + old = *address; + if (old == n) { // basically never executes + *address = n + 1; + } + break; + case WRITE: + *address = 1; + break; + case READ_WRITE: + *address += 1; + break; + case UPDATE_NO_LOOP: + old = *address; + assumed = old; + old = atomicCAS(address, assumed, assumed ^ ran.data); + break; + case UPDATE: + old = *address; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed ^ ran.data); + } while (assumed != old); + break; + } + } +} + +template +__global__ void d_bench_shmem(size_t n_shmem, int accesses_per_elem_sh) +{ + extern __shared__ benchtype extern_mem[]; + benchtype* t = (benchtype*)&extern_mem; + + size_t num_threads = gridDim.x * blockDim.x; + size_t thread_num = blockIdx.x * blockDim.x + threadIdx.x; +#ifdef STATIC_SHMEM + // ignore any compiler warnings about not using n_shmem! + const size_t n_shmem_l = NSHMEM; +#else + const size_t n_shmem_l = n_shmem; +#endif + size_t start = thread_num * accesses_per_elem_sh * n_shmem_l / num_threads; + size_t end = (thread_num + 1) * accesses_per_elem_sh * n_shmem_l / num_threads; + benchtype ran; + ran.data = d_starts(start); + +#pragma unroll + for (ptrdiff_t i = start; i < end; ++i) { + ran.data = (ran.data << 1) ^ ((int64_t)ran.data < 0 ? POLY : 0); + unsigned long long int *address, old, assumed; + address = (unsigned long long int*)&t[ran.data % n_shmem_l].data; + switch (T_TYPE) { + case READ: + old = *address; + if (old == n_shmem_l) { // basically never executes + *address = n_shmem_l + 1; + } + break; + case WRITE: + *address = 1; + break; + case READ_WRITE: + *address += 1; + break; + case UPDATE_NO_LOOP: + old = *address; + assumed = old; + old = atomicCAS(address, assumed, assumed ^ ran.data); + break; + case UPDATE: + old = *address; + do { + assumed = old; + old = atomicCAS(address, assumed, assumed ^ ran.data); + } while (assumed != old); + break; + } + } +} + +static __global__ void d_check(size_t n, benchtype* t, uint32_t* d_error) +{ + for (ptrdiff_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; + i += gridDim.x * blockDim.x) { + if (t[i].data != i) { + atomicAdd(d_error, 1); + } + } +} + +static void starts() +{ + uint64_t m2[64]; + uint64_t temp = 1; + for (ptrdiff_t i = 0; i < 64; i++) { + m2[i] = temp; + temp = (temp << 1) ^ ((int64_t)temp < 0 ? POLY : 0); + temp = (temp << 1) ^ ((int64_t)temp < 0 ? POLY : 0); + } + CUDA_RT_CALL(cudaMemcpyToSymbol(c_m2, m2, sizeof(m2))); +} + +int main(int argc, char* argv[]) +{ + int logn = DEFAULT_LOGN; + test_t test_type = UPDATE; + int dev = 0; + bool shared_mem = false; + int logn_shmem = -1; + + int occupancy = 100; + int repeats = 1; + int accesses_per_elem = -1; + int accesses_per_elem_sh = -1; + + const char* opts_desc[7] = { + " -n input data size = 2^n [default: 29]", + " -o occupancy percentage, 100/occupancy how much larger the " + "working set is compared to the requested bytes [default: 100]", + " -r number of kernel repetitions [default: 1]", + " -a number of random accesses per input element [default: " + " 32 (r, w) or 8 (u, unl, rw) for gmem, 65536 for shmem]", + " -t test type (0 - update (u), 1 - read (r), 2 - write (w), 3 - read " + "write (rw), " + "4 - update no loop (unl)) [default: 0]", + " -d device ID to use [default: 0]", + " -s enable input in shared memory instead of global memory for " + "shared memory GUPS benchmark if s>=0. 
The benchmark will use max available " + "shared memory if s=0 (for ideal GUPS conditions this must be done at " + "compile time, check README.md for build options). This tool does allow " + "setting the shmem data size with = 2^s (for s>0), however this will " + "also result in an instruction bound kernel that fails to reach " + "hardware limitations of GUPS. [default: -1 (disabled)]", + }; + + int c; + while ((c = getopt(argc, argv, "n:o:r:a:t:d:s:h")) != -1) { + switch (c) { + case 'n': + logn = atoi(optarg); + break; + case 'o': + occupancy = atoi(optarg); + break; + case 'r': + repeats = atoi(optarg); + break; + case 'a': + accesses_per_elem = accesses_per_elem_sh = atoi(optarg); + break; + case 't': + test_type = static_cast(atoi(optarg)); + break; + case 'd': + dev = atoi(optarg); + break; + case 's': + shared_mem = true; + logn_shmem = atoi(optarg); + break; + case '?': + printf("Please use -h to get option list.\n"); + return 1; + case 'h': + printf("Usage:\n"); + for (int i = 0; i < 7; i++) { + printf("%s\n", opts_desc[i]); + } + return 0; + default: + break; + } + } + + size_t n_shmem = (size_t)1 << logn_shmem; + + int ndev; + CUDA_RT_CALL(cudaGetDeviceCount(&ndev)); + if (dev < 0 || dev >= ndev) { + dev = 0; + } + + cudaDeviceProp prop; + CUDA_RT_CALL(cudaGetDeviceProperties(&prop, dev)); + CUDA_RT_CALL(cudaSetDevice(dev)); + printf("Using GPU %d of %d GPUs.\n", dev, ndev); + printf("Warp size = %d.\n", prop.warpSize); + printf("Multi-processor count = %d.\n", prop.multiProcessorCount); + printf( + "Max threads per multi-processor = %d.\n", prop.maxThreadsPerMultiProcessor); + + if (shared_mem) { + if (n_shmem == 1) { + printf("Using max shared memory\n"); +#ifdef STATIC_SHMEM + n_shmem = NSHMEM; +#else + n_shmem = prop.sharedMemPerBlockOptin / sizeof(benchtype); +#endif + } else { + printf( + "Shared memory size = %zu (%zu bytes.)\n", + n_shmem, + n_shmem * sizeof(benchtype)); + } + // using the dynamic allocation doesn't appear to impact performance so will + // always use max. 
+ printf( + "Max shared memory per block = %zu Bytes.\n", prop.sharedMemPerBlockOptin); + if (n_shmem * sizeof(benchtype) > prop.sharedMemPerBlockOptin) { + fprintf(stderr, "Requested shmem size not supported!\n"); + exit(-1); + } +#ifdef STATIC_SHMEM + assert(prop.sharedMemPerBlockOptin / sizeof(benchtype) == NSHMEM); +#endif + CUDA_RT_CALL(cudaFuncSetAttribute( + d_bench_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + prop.sharedMemPerBlockOptin)); + CUDA_RT_CALL(cudaFuncSetAttribute( + d_bench_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + prop.sharedMemPerBlockOptin)); + CUDA_RT_CALL(cudaFuncSetAttribute( + d_bench_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + prop.sharedMemPerBlockOptin)); + CUDA_RT_CALL(cudaFuncSetAttribute( + d_bench_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + prop.sharedMemPerBlockOptin)); + CUDA_RT_CALL(cudaFuncSetAttribute( + d_bench_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + prop.sharedMemPerBlockOptin)); + CUDA_RT_CALL(cudaFuncSetAttribute( + d_bench_shmem, + cudaFuncAttributeMaxDynamicSharedMemorySize, + prop.sharedMemPerBlockOptin)); + } + + size_t n = (size_t)(1 << logn); + size_t working_set = (size_t)n * 100 / occupancy; + if (accesses_per_elem == -1) { + if (test_type == UPDATE || test_type == UPDATE_NO_LOOP + || test_type == READ_WRITE) { + accesses_per_elem = ACCESS_PER_ELEM_GMEM_U_UNL_RW; + } else { + accesses_per_elem = ACCESS_PER_ELEM_GMEM_R_W; + } + } + if (accesses_per_elem_sh == -1) { + accesses_per_elem_sh = ACCESS_PER_ELEM_SHMEM; + } + size_t thread, grid; + if (!shared_mem) { + thread = NUM_THREADS_PER_BLOCK_GMEM; + // determine numebr of blocks based on input size + grid = ceil(1.0 * n / thread); + } else { + thread = NUM_THREADS_PER_BLOCK_SHMEM; + // ensure max occupancy + grid = prop.multiProcessorCount + * floor(prop.sharedMemPerBlockOptin / (n_shmem * sizeof(benchtype))); + } + size_t total_num_thread = thread * grid; + + printf( + "Table size = %zu (%lf GB.)\nTotal number of threads %zu\nEach thread " + "access %d locations.\nNumber of iterations = %d\n", + working_set, + working_set * sizeof(benchtype) / 1e9, + total_num_thread, + accesses_per_elem, + repeats); + + benchtype* d_t; + if (!shared_mem) + CUDA_RT_CALL(cudaMalloc((void**)&d_t, working_set * sizeof(benchtype))); + + cudaEvent_t begin, end; + CUDA_RT_CALL(cudaEventCreate(&begin)); + CUDA_RT_CALL(cudaEventCreate(&end)); + + if (!shared_mem) { + d_init<<>>(working_set, d_t); + } + starts(); + + CUDA_RT_CALL(cudaEventRecord(begin)); + CUDA_RT_CALL(cudaEventSynchronize(begin)); + + if (!shared_mem) { + for (int i = 0; i < repeats; i++) { + if (test_type == READ) { + d_bench<<>>(n, working_set, d_t, accesses_per_elem); + } else if (test_type == WRITE) { + d_bench<<>>(n, working_set, d_t, accesses_per_elem); + } else if (test_type == UPDATE) { + d_bench<<>>(n, working_set, d_t, accesses_per_elem); + } else if (test_type == READ_WRITE) { + d_bench + <<>>(n, working_set, d_t, accesses_per_elem); + } else if (test_type == UPDATE_NO_LOOP) { + d_bench + <<>>(n, working_set, d_t, accesses_per_elem); + } else { + printf("Test currently not supported."); + } + } + } else { + printf( + "Shmem launch config: %zu %zu %zu\n", + grid, + thread, + n_shmem * sizeof(benchtype)); + for (int i = 0; i < repeats; i++) { + if (test_type == READ) { + d_bench_shmem<<>>( + n_shmem, accesses_per_elem_sh); + } else if (test_type == WRITE) { + d_bench_shmem<<>>( + n_shmem, accesses_per_elem_sh); + } else if (test_type == UPDATE) { + d_bench_shmem<<>>( + 
n_shmem, accesses_per_elem_sh); + } else if (test_type == READ_WRITE) { + d_bench_shmem<<>>( + n_shmem, accesses_per_elem_sh); + } else if (test_type == UPDATE_NO_LOOP) { + d_bench_shmem + <<>>( + n_shmem, accesses_per_elem_sh); + } else { + printf("Test currently not supported."); + } + } + } + + CUDA_RT_CALL(cudaEventRecord(end)); + CUDA_RT_CALL(cudaEventSynchronize(end)); + + float ms; + CUDA_RT_CALL(cudaEventElapsedTime(&ms, begin, end)); + CUDA_RT_CALL(cudaEventDestroy(end)); + CUDA_RT_CALL(cudaEventDestroy(begin)); + double time = ms * 1.0e-3; + printf("Elapsed time = %.6f seconds\n", time); + if (!shared_mem) { + double result = accesses_per_elem * n * repeats / (double)ms * 1.0e-6; + printf("Result %s = %.6f\n", test_name[test_type], result); + } else { + double result = accesses_per_elem_sh * n_shmem * repeats / (double)ms * 1.0e-6; + printf("Result %s(shmem_combined_SMs) = %.6f\n", test_name[test_type], result); + printf( + "Result %s(shmem_single_SM) = %.6f\n", + test_name[test_type], + result / prop.multiProcessorCount); + } + + uint32_t* d_error; + CUDA_RT_CALL(cudaMalloc((void**)&d_error, sizeof(uint32_t))); + CUDA_RT_CALL(cudaMemset(d_error, 0, sizeof(uint32_t))); + + if (test_type == UPDATE && !shared_mem) { + // d_check only works with UPDATE operation + d_bench<<>>(n, working_set, d_t, accesses_per_elem); + d_check<<>>(working_set, d_t, d_error); + uint32_t h_error; + CUDA_RT_CALL( + cudaMemcpy(&h_error, d_error, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + printf("Verification: Found %u errors.\n", h_error); + } + + CUDA_RT_CALL(cudaFree(d_error)); + if (!shared_mem) + CUDA_RT_CALL(cudaFree(d_t)); + return 0; +} diff --git a/posts/gups/run.py b/posts/gups/run.py new file mode 100644 index 0000000..829b0b4 --- /dev/null +++ b/posts/gups/run.py @@ -0,0 +1,178 @@ +# SPDX-FileCopyrightText: Copyright (c) 2020-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# + +import subprocess +import csv + +test_type_to_id = { + "updates": 0, + "reads": 1, + "writes": 2, + "reads_writes": 3, + "updates_no_loop": 4 +} + +DEFAULT_GLOBAL_GUPS_SIZE=29 + +def run_test(input_size, device_id, test_type, occupancy, repeats, memory_loc): + tests_to_run = [] + if test_type == "all": + tests_to_run = [0, 1, 2, 3, 4] + else: + tests_to_run.append(test_type_to_id[test_type]) + + results = [] + + if memory_loc == "global": + for t in tests_to_run: + proc = subprocess.run( + [ + "./gups", + "-n", str(input_size), + "-t", str(t), + "-d", str(device_id), + "-o", str(occupancy), + "-r", str(repeats) + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + if proc.returncode != 0: + print(proc.stdout) + raise RuntimeError("Failed to run GUPS") + + output = proc.stdout + for output_line in output.splitlines(): + if "Result" in output_line: + output_line_split = output_line.split() + assert output_line_split[0] == "Result" + + value = output_line_split[-1] + res_unit = output_line_split[1] + op_name = output_line_split[2] + results.append([res_unit, op_name, value]) + else: + for t in tests_to_run: + proc = subprocess.run( + [ + "./gups", + "-s", str(input_size), + "-t", str(t), + "-d", str(device_id), + "-r", str(repeats) + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + universal_newlines=True + ) + + if proc.returncode != 0: + print(proc.stdout) + raise RuntimeError("Failed to run GUPS") + + output = proc.stdout + for output_line in output.splitlines(): + if "Result" in output_line: + output_line_split = output_line.split() + assert output_line_split[0] == "Result" + + value = output_line_split[-1] + res_unit = output_line_split[1] + op_name = output_line_split[2] + results.append([res_unit, op_name, value]) + + with open('results.csv', "a") as csv_file: + csv_writer = csv.writer(csv_file, delimiter=',') + row = [2**input_size] + for result in results: + row += [result[2]] + csv_writer.writerow(row) + +def main(): + # Parse command line arguments + args = parse_commandline_argument() + device_id = int(args.device_id) + test_type = args.test + occupancy = args.occupancy + repeats = args.repeats + input_size_begin = int(args.input_size_begin) + input_size_end = int(args.input_size_end) + memory_loc = args.memory_loc + + if memory_loc == "global" and input_size_begin == 0: + input_size_begin = DEFAULT_GLOBAL_GUPS_SIZE + if memory_loc == "global" and input_size_end == 0: + input_size_end = DEFAULT_GLOBAL_GUPS_SIZE + + # Write header + with open('results.csv', "w") as csv_file: + csv_writer = csv.writer(csv_file, delimiter=',') + header = ["Size"] + if test_type == "all": + if memory_loc == "global": + for t in test_type_to_id: + header += [t] + else: + for t in test_type_to_id: + header += ['shmem_GPU_'+t] + header += ['shmem_SM_'+t] + else: + if memory_loc == "global": + header += [test_type] + else: + header += ['shmem_GPU_'+test_type] + header += ['shmem_SM_'+test_type] + csv_writer.writerow(header) + # Test different sizes + for input_size in range(input_size_begin, input_size_end+1): + run_test(input_size, device_id, test_type, occupancy, repeats, memory_loc) + +def parse_commandline_argument(): + import argparse + parser = argparse.ArgumentParser(description='Benchmark GUPS. Store results in results.csv file.') + + parser.add_argument("--device-id", help="GPU ID to run the test", default="0") + parser.add_argument("--input-size-begin", help="exponent of the input data size begin range, base is 2 (input size = 2^n). 
"\ + "[Default: 29 for global GUPS, max_shmem for shared GUPS. Global/shared is controlled by --memory-loc", default="0") + parser.add_argument("--input-size-end", help="exponent of the input data size end range, base is 2 (input size = 2^n). "\ + "[Default: 29 for global GUPS, max_shmem for shared GUPS. Global/shared is controlled by --memory-loc", default="0") + parser.add_argument("--occupancy", help="100/occupancy is how much larger the working set is compared to the requested bytes", default="100") + parser.add_argument("--repeats", help="number of kernel repetitions", default="1") + parser.add_argument("--test", help="test to run", + choices=["reads", "writes", "reads_writes", "updates", "updates_no_loop", "all"], + default="all") + parser.add_argument("--memory-loc", help="memory buffer in global memory or shared memory", + choices=["global", "shared"], default="global") + + return parser.parse_args() + +if __name__ == '__main__': + main() \ No newline at end of file