From b6b47fb3b6d7560d667efb0841710740be3db714 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Sat, 11 May 2024 17:19:54 -0400 Subject: [PATCH 01/18] initial nvptx microbenchmarking infrastructure --- libc/benchmarks/CMakeLists.txt | 416 +++++++++--------- libc/benchmarks/gpu/BenchmarkLogger.cpp | 89 ++++ libc/benchmarks/gpu/BenchmarkLogger.h | 27 ++ libc/benchmarks/gpu/CMakeLists.txt | 183 ++++++++ libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 70 +++ libc/benchmarks/gpu/LibcGpuBenchmark.h | 122 +++++ libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 6 + libc/benchmarks/gpu/src/CMakeLists.txt | 2 + libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 21 + .../gpu/src/ctype/isalnum_benchmark.cpp | 22 + .../gpu/src/ctype/isalpha_benchmark.cpp | 9 + libc/benchmarks/gpu/src/math/CMakeLists.txt | 0 libc/benchmarks/gpu/timing/CMakeLists.txt | 12 + .../gpu/timing/nvptx/CMakeLists.txt | 7 + libc/benchmarks/gpu/timing/nvptx/timing.h | 108 +++++ libc/benchmarks/gpu/timing/timing.h | 22 + 16 files changed, 911 insertions(+), 205 deletions(-) create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.cpp create mode 100644 libc/benchmarks/gpu/BenchmarkLogger.h create mode 100644 libc/benchmarks/gpu/CMakeLists.txt create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.cpp create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmark.h create mode 100644 libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp create mode 100644 libc/benchmarks/gpu/src/CMakeLists.txt create mode 100644 libc/benchmarks/gpu/src/ctype/CMakeLists.txt create mode 100644 libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp create mode 100644 libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp create mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt create mode 100644 libc/benchmarks/gpu/timing/CMakeLists.txt create mode 100644 libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt create mode 100644 libc/benchmarks/gpu/timing/nvptx/timing.h create mode 100644 libc/benchmarks/gpu/timing/timing.h diff --git 
a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 4978da65850cc..a802e653a091e 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -1,205 +1,211 @@ -find_package(Threads) - -set(LLVM_LINK_COMPONENTS - Support - TargetParser - ) - -#============================================================================== -# Add Unit Testing Support -#============================================================================== - -function(add_libc_benchmark_unittest target_name) - if(NOT LLVM_INCLUDE_TESTS) - return() - endif() - - cmake_parse_arguments( - "LIBC_BENCHMARKS_UNITTEST" - "" # No optional arguments - "SUITE" # Single value arguments - "SRCS;DEPENDS" # Multi-value arguments - ${ARGN} - ) - - add_executable(${target_name} - EXCLUDE_FROM_ALL - ${LIBC_BENCHMARKS_UNITTEST_SRCS} - ) - target_link_libraries(${target_name} - PRIVATE - llvm_gtest_main - llvm_gtest - ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} - ) - llvm_update_compile_flags(${target_name}) - - add_custom_command( - TARGET ${target_name} - POST_BUILD - COMMAND $ - ) - add_dependencies(libc-benchmark-util-tests ${target_name}) -endfunction() - -#============================================================================== -# Build Google Benchmark for libc -#============================================================================== - -include(ExternalProject) -ExternalProject_Add(google-benchmark-libc - EXCLUDE_FROM_ALL ON - PREFIX google-benchmark-libc - SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark - INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc - CMAKE_CACHE_ARGS - -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF - -DBENCHMARK_ENABLE_LTO:BOOL=OFF - -DBENCHMARK_ENABLE_TESTING:BOOL=OFF - -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} - -DBENCHMARK_FORCE_WERROR:BOOL=OFF - -DBENCHMARK_USE_LIBCXX:BOOL=OFF - -DCMAKE_BUILD_TYPE:STRING=Release - - -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} - 
-DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} - -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} - - -DBUILD_SHARED_LIBS:BOOL=OFF - -DCMAKE_EXE_LINKER_FLAGS:STRING=-static - - -DCMAKE_CXX_STANDARD:STRING=14 - -DCMAKE_INSTALL_PREFIX:PATH= - ) - -add_custom_target(libc-benchmark-util-tests) - -# libc-benchmark -add_library(libc-benchmark - STATIC - EXCLUDE_FROM_ALL - LibcBenchmark.cpp - LibcBenchmark.h -) - -target_include_directories(libc-benchmark - PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} -) -target_link_libraries(libc-benchmark - PUBLIC - benchmark::benchmark - LLVMSupport - LLVMTargetParser - Threads::Threads -) -add_dependencies(libc-benchmark google-benchmark-libc) -llvm_update_compile_flags(libc-benchmark) - -add_libc_benchmark_unittest(libc-benchmark-test - SRCS LibcBenchmarkTest.cpp - DEPENDS libc-benchmark -) - -# libc-memory-benchmark -add_library(libc-memory-benchmark - STATIC - EXCLUDE_FROM_ALL - LibcMemoryBenchmark.cpp - LibcMemoryBenchmark.h - LibcFunctionPrototypes.h - MemorySizeDistributions.cpp - MemorySizeDistributions.h -) -target_include_directories(libc-memory-benchmark - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} -) -target_link_libraries(libc-memory-benchmark - PUBLIC - libc-benchmark -) -llvm_update_compile_flags(libc-memory-benchmark) - -add_libc_benchmark_unittest(libc-memory-benchmark-test - SRCS LibcMemoryBenchmarkTest.cpp - DEPENDS libc-memory-benchmark -) - -# json -add_library(json - STATIC - EXCLUDE_FROM_ALL - JSON.cpp - JSON.h -) -target_link_libraries(json PUBLIC libc-memory-benchmark) -llvm_update_compile_flags(json) - -add_libc_benchmark_unittest(json-test - SRCS JSONTest.cpp - DEPENDS json -) - -#============================================================================== -# Benchmarking tool 
-#============================================================================== - -# Benchmark all implementations that can run on the target CPU. -function(add_libc_multi_impl_benchmark name) - get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) - foreach(fq_config_name IN LISTS fq_implementations) - get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) - cpu_supports(can_run "${required_cpu_features}") - if(can_run) - set(benchmark_name ${fq_config_name}_benchmark) - add_executable(${benchmark_name} - EXCLUDE_FROM_ALL - LibcMemoryBenchmarkMain.cpp - ) - get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") - target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) - string(TOUPPER ${name} name_upper) - target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") - llvm_update_compile_flags(${benchmark_name}) - else() - message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") - endif() - endforeach() -endfunction() - -add_libc_multi_impl_benchmark(bcmp) -add_libc_multi_impl_benchmark(bzero) -add_libc_multi_impl_benchmark(memcmp) -add_libc_multi_impl_benchmark(memcpy) -add_libc_multi_impl_benchmark(memmove) -add_libc_multi_impl_benchmark(memset) - -#============================================================================== -# Google Benchmarking tool -#============================================================================== - -# This target uses the Google Benchmark facility to report throughput for llvm -# libc memory functions compiled for the host machine. This is useful to -# continuously monitor the performance of the memory functions. 
-add_executable(libc.benchmarks.memory_functions.opt_host - EXCLUDE_FROM_ALL - LibcMemoryGoogleBenchmarkMain.cpp - LibcDefaultImplementations.cpp -) -target_link_libraries(libc.benchmarks.memory_functions.opt_host - PRIVATE - libc-memory-benchmark - libc.src.string.memcmp_opt_host.__internal__ - libc.src.string.bcmp_opt_host.__internal__ - libc.src.string.memcpy_opt_host.__internal__ - libc.src.string.memset_opt_host.__internal__ - libc.src.string.bzero_opt_host.__internal__ - libc.src.string.memmove_opt_host.__internal__ - benchmark_main -) -llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host) - -add_subdirectory(automemcpy) +if(NOT LIBC_TARGET_OS_IS_GPU) + find_package(Threads) + + set(LLVM_LINK_COMPONENTS + Support + TargetParser + ) + + #============================================================================== + # Add Unit Testing Support + #============================================================================== + + function(add_libc_benchmark_unittest target_name) + if(NOT LLVM_INCLUDE_TESTS) + return() + endif() + + cmake_parse_arguments( + "LIBC_BENCHMARKS_UNITTEST" + "" # No optional arguments + "SUITE" # Single value arguments + "SRCS;DEPENDS" # Multi-value arguments + ${ARGN} + ) + + add_executable(${target_name} + EXCLUDE_FROM_ALL + ${LIBC_BENCHMARKS_UNITTEST_SRCS} + ) + target_link_libraries(${target_name} + PRIVATE + llvm_gtest_main + llvm_gtest + ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} + ) + llvm_update_compile_flags(${target_name}) + + add_custom_command( + TARGET ${target_name} + POST_BUILD + COMMAND $ + ) + add_dependencies(libc-benchmark-util-tests ${target_name}) + endfunction() + + #============================================================================== + # Build Google Benchmark for libc + #============================================================================== + + include(ExternalProject) + ExternalProject_Add(google-benchmark-libc + EXCLUDE_FROM_ALL ON + PREFIX google-benchmark-libc + SOURCE_DIR 
${LLVM_THIRD_PARTY_DIR}/benchmark + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc + CMAKE_CACHE_ARGS + -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF + -DBENCHMARK_ENABLE_LTO:BOOL=OFF + -DBENCHMARK_ENABLE_TESTING:BOOL=OFF + -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} + -DBENCHMARK_FORCE_WERROR:BOOL=OFF + -DBENCHMARK_USE_LIBCXX:BOOL=OFF + -DCMAKE_BUILD_TYPE:STRING=Release + + -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} + -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} + -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} + + -DBUILD_SHARED_LIBS:BOOL=OFF + -DCMAKE_EXE_LINKER_FLAGS:STRING=-static + + -DCMAKE_CXX_STANDARD:STRING=14 + -DCMAKE_INSTALL_PREFIX:PATH= + ) + + add_custom_target(libc-benchmark-util-tests) + + # libc-benchmark + add_library(libc-benchmark + STATIC + EXCLUDE_FROM_ALL + LibcBenchmark.cpp + LibcBenchmark.h + ) + + target_include_directories(libc-benchmark + PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} + ) + target_link_libraries(libc-benchmark + PUBLIC + benchmark::benchmark + LLVMSupport + LLVMTargetParser + Threads::Threads + ) + add_dependencies(libc-benchmark google-benchmark-libc) + llvm_update_compile_flags(libc-benchmark) + + add_libc_benchmark_unittest(libc-benchmark-test + SRCS LibcBenchmarkTest.cpp + DEPENDS libc-benchmark + ) + + # libc-memory-benchmark + add_library(libc-memory-benchmark + STATIC + EXCLUDE_FROM_ALL + LibcMemoryBenchmark.cpp + LibcMemoryBenchmark.h + LibcFunctionPrototypes.h + MemorySizeDistributions.cpp + MemorySizeDistributions.h + ) + target_include_directories(libc-memory-benchmark + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} + ) + target_link_libraries(libc-memory-benchmark + PUBLIC + libc-benchmark + ) + llvm_update_compile_flags(libc-memory-benchmark) + + add_libc_benchmark_unittest(libc-memory-benchmark-test + SRCS 
LibcMemoryBenchmarkTest.cpp + DEPENDS libc-memory-benchmark + ) + + # json + add_library(json + STATIC + EXCLUDE_FROM_ALL + JSON.cpp + JSON.h + ) + target_link_libraries(json PUBLIC libc-memory-benchmark) + llvm_update_compile_flags(json) + + add_libc_benchmark_unittest(json-test + SRCS JSONTest.cpp + DEPENDS json + ) + + #============================================================================== + # Benchmarking tool + #============================================================================== + + # Benchmark all implementations that can run on the target CPU. + function(add_libc_multi_impl_benchmark name) + get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) + foreach(fq_config_name IN LISTS fq_implementations) + get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) + cpu_supports(can_run "${required_cpu_features}") + if(can_run) + set(benchmark_name ${fq_config_name}_benchmark) + add_executable(${benchmark_name} + EXCLUDE_FROM_ALL + LibcMemoryBenchmarkMain.cpp + ) + get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") + target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) + string(TOUPPER ${name} name_upper) + target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") + llvm_update_compile_flags(${benchmark_name}) + else() + message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") + endif() + endforeach() + endfunction() + + add_libc_multi_impl_benchmark(bcmp) + add_libc_multi_impl_benchmark(bzero) + add_libc_multi_impl_benchmark(memcmp) + add_libc_multi_impl_benchmark(memcpy) + add_libc_multi_impl_benchmark(memmove) + add_libc_multi_impl_benchmark(memset) + + #============================================================================== + # Google Benchmarking tool + 
#============================================================================== + + # This target uses the Google Benchmark facility to report throughput for llvm + # libc memory functions compiled for the host machine. This is useful to + # continuously monitor the performance of the memory functions. + add_executable(libc.benchmarks.memory_functions.opt_host + EXCLUDE_FROM_ALL + LibcMemoryGoogleBenchmarkMain.cpp + LibcDefaultImplementations.cpp + ) + target_link_libraries(libc.benchmarks.memory_functions.opt_host + PRIVATE + libc-memory-benchmark + libc.src.string.memcmp_opt_host.__internal__ + libc.src.string.bcmp_opt_host.__internal__ + libc.src.string.memcpy_opt_host.__internal__ + libc.src.string.memset_opt_host.__internal__ + libc.src.string.bzero_opt_host.__internal__ + libc.src.string.memmove_opt_host.__internal__ + benchmark_main + ) + llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host) + + add_subdirectory(automemcpy) +endif() + +if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(gpu) +endif() diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp new file mode 100644 index 0000000000000..94a0d897c9585 --- /dev/null +++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp @@ -0,0 +1,89 @@ +#include "benchmarks/gpu/BenchmarkLogger.h" +#include "src/__support/CPP/string.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/OSUtil/io.h" // write_to_stderr +#include "src/__support/big_int.h" // is_big_int +#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 +#include "src/__support/uint128.h" + +#include + +namespace LIBC_NAMESPACE { +namespace libc_gpu_benchmarks { + +// cpp::string_view specialization +template <> +BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string_view str) { + LIBC_NAMESPACE::write_to_stderr(str); + return *this; +} + +// cpp::string specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string str) { + return *this << 
static_cast(str); +} + +// const char* specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<< (const char *str) { + return *this << cpp::string_view(str); +} + +// char* specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<< (char *str) { + return *this << cpp::string_view(str); +} + +// char specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) { + return *this << cpp::string_view(&ch, 1); +} + +// bool specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) { + return *this << (cond ? "true" : "false"); +} + +// void * specialization +template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) { + return *this << "0x" << cpp::to_string(reinterpret_cast(addr)); +} + +template BenchmarkLogger &BenchmarkLogger::operator<<(T t) { + if constexpr (is_big_int_v || + (cpp::is_integral_v && cpp::is_unsigned_v && + (sizeof(T) > sizeof(uint64_t)))) { + static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt"); + const IntegerToString buffer(t); + return *this << buffer.view(); + } else { + return *this << cpp::to_string(t); + } +} + +// is_integral specializations +// char is already specialized to handle character +template BenchmarkLogger &BenchmarkLogger::operator<< (short); +template BenchmarkLogger &BenchmarkLogger::operator<< (int); +template BenchmarkLogger &BenchmarkLogger::operator<< (long); +template BenchmarkLogger &BenchmarkLogger::operator<< (long long); +template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned char); +template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned short); +template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned int); +template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned long); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned long long); + +#ifdef LIBC_TYPES_HAS_INT128 +template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); +#endif // 
LIBC_TYPES_HAS_INT128 +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<128>); +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<192>); +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<256>); +template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<320>); + +// TODO: Add floating point formatting once it's supported by StringStream. + +BenchmarkLogger blog; + +} // namespace libc_gpu_benchmarks +} // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h new file mode 100644 index 0000000000000..ed3cc97e59c6d --- /dev/null +++ b/libc/benchmarks/gpu/BenchmarkLogger.h @@ -0,0 +1,27 @@ +//===-- Utilities to log to standard error during benchmarks ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H +#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H + +namespace LIBC_NAMESPACE { +namespace libc_gpu_benchmarks { + +// A class to log to standard error in the context of hermetic benchmarks. +struct BenchmarkLogger { + constexpr BenchmarkLogger() = default; + template BenchmarkLogger &operator<<(T); +}; + +// A global BenchmarkLogger instance to be used in benchmarks.
+extern BenchmarkLogger blog; + +} // namespace libc_gpu_benchmarks +} // namespace LIBC_NAMESPACE + +#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt new file mode 100644 index 0000000000000..a18be27e33573 --- /dev/null +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -0,0 +1,183 @@ +add_subdirectory(timing) + +add_custom_target(gpu-benchmark) + +function (add_gpu_benchmark test_name) + if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1) + message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.") + return() + endif() + + cmake_parse_arguments( + "GPU_BENCHMARK" + "" # No optional arguments + "SUITE" # Single value arguments + "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments + ${ARGN} + ) + + if(NOT GPU_BENCHMARK_SUITE) + message(FATAL_ERROR "SUITE not specified for ${fq_target_name}") + endif() + if(NOT GPU_BENCHMARK_SRCS) + message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is missing.") + endif() + + get_fq_target_name(${test_name} fq_target_name) + get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in + get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS}) + list(APPEND fq_deps_list + # Hermetic tests use the platform's startup object. So, their deps also + # have to be collected. + libc.startup.${LIBC_TARGET_OS}.crt1 + # We always add the memory functions objects. This is because the + # compiler's codegen can emit calls to the C memory functions. + libc.src.string.bcmp + libc.src.string.bzero + libc.src.string.memcmp + libc.src.string.memcpy + libc.src.string.memmove + libc.src.string.memset + libc.src.__support.StringUtil.error_to_string + ) + + list(REMOVE_DUPLICATES fq_deps_list) + + # TODO: Instead of gathering internal object files from entrypoints, + # collect the object files with public names of entrypoints. 
+ get_object_files_for_test( + link_object_files skipped_entrypoints_list ${fq_deps_list}) + if(skipped_entrypoints_list) + if(LIBC_CMAKE_VERBOSE_LOGGING) + set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: " + "${skipped_entrypoints_list}.") + endif() + return() + endif() + list(REMOVE_DUPLICATES link_object_files) + + # Make a library of all deps + add_library( + ${fq_target_name}.__libc__ + STATIC + EXCLUDE_FROM_ALL + ${link_object_files} + ) + set_target_properties(${fq_target_name}.__libc__ + PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set_target_properties(${fq_target_name}.__libc__ + PROPERTIES ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc) + + set(fq_build_target_name ${fq_target_name}.__build__) + add_executable( + ${fq_build_target_name} + EXCLUDE_FROM_ALL + $<$:${link_object_files}> + ${GPU_BENCHMARK_SRCS} + ${GPU_BENCHMARK_HDRS} + ) + set_target_properties(${fq_build_target_name} + PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}") + target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) + target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) + _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}") + target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) + + set(link_libraries "") + foreach(lib in LISTS GPU_BENCHMARK_LINK_LIBRARIES) + if(TARGET ${lib}.hermetic) + list(APPEND link_libraries ${lib}.hermetic) + else() + list(APPEND link_libraries ${lib}) + endif() + endforeach() + + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + target_link_options(${fq_build_target_name} PRIVATE + ${LIBC_COMPILE_OPTIONS_DEFAULT} + -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu + "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static + 
"-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") + elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) + # We need to use the internal object versions for NVPTX. + set(internal_suffix ".__internal__") + target_link_options(${fq_build_target_name} PRIVATE + ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu + "-Wl,--suppress-stack-size-warning" + -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static + "--cuda-path=${LIBC_CUDA_ROOT}") + elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) + target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) + else() + # Older version of gcc does not support `nostdlib++` flag. We use + # `nostdlib` and link against libgcc_s, which cannot be linked statically. + target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib) + list(APPEND link_libraries ${LIBGCC_S_LOCATION}) + endif() + + # link libraries for the BUILD target (i.e. to compile the test) + target_link_libraries( + ${fq_build_target_name} + PRIVATE + libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} + ${link_libraries} + # LibcTest.hermetic + LibcGpuBenchmark.hermetic + # LibcHermeticTestSupport.hermetic + LibcHermeticTestSupport.hermetic + # The NVIDIA 'nvlink' linker does not currently support static libraries. + $<$>:${fq_target_name}.__libc__>) + + add_dependencies(${fq_build_target_name} + LibcGpuBenchmark.hermetic + ${fq_deps_list}) + + # Tests on the GPU require an external loader utility to launch the kernel. 
+ if(TARGET libc.utils.gpu.loader) + add_dependencies(${fq_build_target_name} libc.utils.gpu.loader) + get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE") + endif() + + set(test_cmd ${GPU_BENCHMARK_ENV} + $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS} + $ ${GPU_BENCHMARK_ARGS}) + add_custom_target( + ${fq_target_name} + COMMAND ${test_cmd} + COMMAND_EXPAND_LISTS + COMMENT "Running GPU benchmark ${fq_target_name}" + ) + + # Make this benchmark part of its suite + add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name}) + # Remember to make this benchmark part of the umbrella command + add_dependencies(gpu-benchmark ${fq_target_name}) +endfunction(add_gpu_benchmark) + +add_unittest_framework_library( + LibcGpuBenchmark + SRCS + LibcGpuBenchmark.cpp + LibcGpuBenchmarkMain.cpp + BenchmarkLogger.cpp + HDRS + LibcGpuBenchmark.h + BenchmarkLogger.h + DEPENDS + libc.src.__support.big_int + libc.src.__support.c_string + libc.src.__support.CPP.string + libc.src.__support.CPP.string_view + libc.src.__support.CPP.type_traits + libc.src.__support.fixed_point.fx_rep + libc.src.__support.macros.properties.types + libc.src.__support.OSUtil.osutil + libc.src.__support.uint128 + libc.benchmarks.gpu.timing.timing +) + +add_subdirectory(src) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp new file mode 100644 index 0000000000000..d37f5a0a53a70 --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -0,0 +1,70 @@ +#include "LibcGpuBenchmark.h" + +namespace LIBC_NAMESPACE { +namespace libc_gpu_benchmarks { + +Benchmark *Benchmark::Start = nullptr; +Benchmark *Benchmark::End = nullptr; + +void Benchmark::addBenchmark(Benchmark *B) { + if (End == nullptr) { + Start = B; + End = B; + return; + } + + End->Next = B; + End = B; +} + +int Benchmark::runBenchmarks() { + for (Benchmark *B = Start; B != nullptr; B = B->Next) { + B->Run(); + } + + return 0; +} + +BenchmarkResult 
benchmark(const BenchmarkOptions &Options, + uint64_t (*WrapperFunc)()) { + BenchmarkResult Result; + RuntimeEstimationProgression REP; + size_t TotalIterations = 0; + size_t Iterations = Options.InitialIterations; + if (Iterations < (uint32_t)1) { + Iterations = 1; + } + size_t Samples = 0; + uint64_t BestGuess = 0; + uint64_t TotalCycles = 0; + for (;;) { + uint64_t SampleCycles = 0; + for (uint32_t i = 0; i < Iterations; i++) { + auto overhead = LIBC_NAMESPACE::overhead(); + uint64_t result = WrapperFunc() - overhead; + SampleCycles += result; + } + + Samples++; + TotalCycles += SampleCycles; + TotalIterations += Iterations; + const double ChangeRatio = + REP.ComputeImprovement({Iterations, SampleCycles}); + BestGuess = REP.CurrentEstimation; + + if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) { + break; + } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) { + break; + } + + Iterations *= Options.ScalingFactor; + } + Result.Cycles = BestGuess; + Result.Samples = Samples; + Result.TotalIterations = TotalIterations; + return Result; +}; + +} // namespace libc_gpu_benchmarks +} // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h new file mode 100644 index 0000000000000..ccbbe3629dbda --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -0,0 +1,122 @@ +#ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H +#define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H + +#include "benchmarks/gpu/timing/timing.h" + +#include "benchmarks/gpu/BenchmarkLogger.h" + +#include +#include + +namespace LIBC_NAMESPACE { + +namespace libc_gpu_benchmarks { + +struct BenchmarkOptions { + uint32_t InitialIterations = 1; + uint32_t MaxIterations = 10000000; + uint32_t MinSamples = 4; + uint32_t MaxSamples = 1000; + double Epsilon = 0.01; + double ScalingFactor = 1.4; +}; + +struct Measurement { + size_t Iterations = 0; + uint64_t ElapsedCycles = 0; +}; + +class 
// One benchmark sample: how many iterations were run and how many cycles they
// took in total.
struct Measurement {
  size_t Iterations = 0;
  uint64_t ElapsedCycles = 0;
};

// Accumulates samples and yields the running mean cost per iteration.
class RefinableRuntimeEstimation {
  uint64_t TotalCycles = 0;
  size_t TotalIterations = 0;

public:
  // Folds M into the running totals and returns the mean number of cycles per
  // iteration seen so far (integer division, rounded down). Returns 0 while no
  // iteration has been recorded instead of dividing by zero.
  uint64_t Update(const Measurement &M) {
    TotalCycles += M.ElapsedCycles;
    TotalIterations += M.Iterations;
    if (TotalIterations == 0)
      return 0;
    return TotalCycles / TotalIterations;
  }
};

// Tracks the progression of the runtime estimation
class RuntimeEstimationProgression {
  RefinableRuntimeEstimation RRE;

public:
  // Latest mean cycles-per-iteration estimate; 0 before the first sample.
  uint64_t CurrentEstimation = 0;

  // Feeds M into the estimator and returns the magnitude of the relative
  // change, |previous / new - 1|. The very first non-zero sample therefore
  // reports 1.0.
  //
  // When the new estimate is 0 the ratio is undefined. Report 0.0 if the
  // estimate was already 0 (nothing changed) and 1.0 otherwise, so callers
  // comparing against a small epsilon never see NaN or infinity.
  double ComputeImprovement(const Measurement &M) {
    const uint64_t NewEstimation = RRE.Update(M);

    double Ratio = 0.0;
    if (NewEstimation == 0) {
      Ratio = CurrentEstimation == 0 ? 0.0 : 1.0;
    } else {
      Ratio = static_cast<double>(CurrentEstimation) /
                  static_cast<double>(NewEstimation) -
              1.0;
      // Get absolute value
      if (Ratio < 0)
        Ratio = -Ratio;
    }

    CurrentEstimation = NewEstimation;
    return Ratio;
  }
};
Name; } +}; +} // namespace libc_gpu_benchmarks +} // namespace LIBC_NAMESPACE + +#define BENCHMARK(SuiteName, TestName, Func) \ + LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark \ + SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName); + +#endif diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp new file mode 100644 index 0000000000000..c971b00cc9a1b --- /dev/null +++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp @@ -0,0 +1,6 @@ +#include "LibcGpuBenchmark.h" + +extern "C" int main(int argc, char **argv, char **envp) { + LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks(); + return 0; +} diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt new file mode 100644 index 0000000000000..f15d082e4dd2b --- /dev/null +++ b/libc/benchmarks/gpu/src/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(ctype) +add_subdirectory(math) diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt new file mode 100644 index 0000000000000..ab2f6cdf0c7fd --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -0,0 +1,21 @@ +add_custom_target(libc-gpu-ctype-benchmarks) + +add_gpu_benchmark( + isalnum_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalnum_benchmark.cpp + DEPENDS + libc.src.ctype.isalnum +) + +add_gpu_benchmark( + isalpha_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalpha_benchmark.cpp + DEPENDS + libc.src.ctype.isalpha +) diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp new file mode 100644 index 0000000000000..8d9c958bb7ed4 --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp @@ -0,0 +1,22 @@ +#include "benchmarks/gpu/LibcGpuBenchmark.h" + +#include "src/ctype/isalnum.h" + +uint64_t BM_IsAlnum() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x); +} 
+BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum); + +[[gnu::noinline]] static uint64_t single_input_function(int x) { + asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x + return x; +} + +uint64_t BM_IsAlnumWithOverhead() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) - + LIBC_NAMESPACE::latency(single_input_function, 0); +} +BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead, + BM_IsAlnumWithOverhead); diff --git a/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp new file mode 100644 index 0000000000000..2038eb89bc77b --- /dev/null +++ b/libc/benchmarks/gpu/src/ctype/isalpha_benchmark.cpp @@ -0,0 +1,9 @@ +#include "benchmarks/gpu/LibcGpuBenchmark.h" + +#include "src/ctype/isalpha.h" + +uint64_t BM_IsAlpha() { + char x = 'c'; + return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalpha, x); +} +BENCHMARK(LlvmLibcIsAlphaGpuBenchmark, IsAlpha, BM_IsAlpha); diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt new file mode 100644 index 0000000000000..0e6a5a6b47968 --- /dev/null +++ b/libc/benchmarks/gpu/timing/CMakeLists.txt @@ -0,0 +1,12 @@ +foreach(target nvptx) + add_subdirectory(${target}) + list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing) +endforeach() + +add_header_library( + timing + HDRS + timing.h + DEPENDS + ${target_gpu_timing} +) diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt new file mode 100644 index 0000000000000..9958e16206a41 --- /dev/null +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -0,0 +1,7 @@ +add_header_library( + nvptx_timing + HDRS + timing.h + DEPENDS + libc.src.__support.common +) diff --git 
a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h new file mode 100644 index 0000000000000..008432e6aa1d2 --- /dev/null +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -0,0 +1,108 @@ +//===------------- NVPTX implementation of timing utils ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_TIMING_NVPTX +#define LLVM_LIBC_UTILS_GPU_TIMING_NVPTX + +#include "src/__support/GPU/utils.h" +#include "src/__support/common.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" + +#include <stdint.h> + +namespace LIBC_NAMESPACE { + +// Returns the overhead associated with calling the profiling region. This +// allows us to subtract the constant-time overhead from the latency to +// obtain a true result. This can vary with system load. +[[gnu::noinline]] static uint64_t overhead() { + volatile uint32_t x = 1; + uint32_t y = x; + gpu::sync_threads(); + uint64_t start = gpu::processor_clock(); + asm volatile("" ::"r"(y), "llr"(start)); + uint32_t result = y; + asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + uint64_t stop = gpu::processor_clock(); + gpu::sync_threads(); + volatile auto storage = result; + return stop - start; +} + +// Stimulate a simple function and obtain its latency in clock cycles on the +// system. This function cannot be inlined or else it will disturb the very +// delicate balance of hard-coded dependencies. +// +// FIXME: This does not work in general on NVPTX because of further +// optimizations ptxas performs. The only way to get consistent results is to +// pass an extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag.
This +// negatively impacts performance but it is at least stable. +template <typename F, typename T> +[[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) { + // We need to store the input somewhere to guarantee that the compiler will + // not constant propagate it and remove the profiling region. + volatile T storage = t; + T arg = storage; + asm volatile("" ::"r"(arg)); + + // Get the current timestamp from the clock. + gpu::sync_threads(); + __nvvm_membar_sys(); + uint64_t start = gpu::processor_clock(); + + // This forces the compiler to load the input argument and run the clock cycle + // counter before the profiling region. + asm volatile("" ::"r"(arg), "llr"(start)); + + // Run the function under test and return its value. + auto result = f(arg); + + // This inline assembly performs a no-op which forces the result to both be + // used and prevents us from exiting this region before it's complete. + asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + + // Obtain the current timestamp after running the calculation and force + // ordering. + uint64_t stop = gpu::processor_clock(); + __nvvm_membar_sys(); + gpu::sync_threads(); + asm volatile("" ::"r"(stop)); + volatile T output = result; + + // Return the time elapsed.
+ return stop - start; +} + +template <typename F, typename T1, typename T2> +static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { + volatile T1 storage = t1; + volatile T2 storage2 = t2; + T1 arg = storage; + T2 arg2 = storage2; + asm volatile("" ::"r"(arg), "r"(arg2)); + + gpu::sync_threads(); + uint64_t start = gpu::processor_clock(); + + asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start)); + + auto result = f(arg, arg2); + + asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + + uint64_t stop = gpu::processor_clock(); + gpu::sync_threads(); + asm volatile("" ::"r"(stop)); + volatile auto output = result; + + return stop - start; +} +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_UTILS_GPU_TIMING_NVPTX diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h new file mode 100644 index 0000000000000..c47bb0d9ebb55 --- /dev/null +++ b/libc/benchmarks/gpu/timing/timing.h @@ -0,0 +1,22 @@ +//===------------- Implementation of GPU timing utils -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_UTILS_GPU_TIMING_H +#define LLVM_LIBC_UTILS_GPU_TIMING_H + +#include "src/__support/macros/properties/architectures.h" + +#if defined(LIBC_TARGET_ARCH_IS_AMDGPU) +#error "amdgpu not yet supported +#elif defined(LIBC_TARGET_ARCH_IS_NVPTX) +#include "nvptx/timing.h" +#else +#error "unsupported platform" +#endif + +#endif // LLVM_LIBC_UTILS_GPU_TIMING_H From f8291e91be692061ab3d240d78a2112b89cbc342 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Wed, 15 May 2024 12:47:40 -0400 Subject: [PATCH 02/18] refactor cmake rules --- libc/benchmarks/CMakeLists.txt | 397 +++++++++---------- libc/benchmarks/gpu/BenchmarkLogger.cpp | 24 +- libc/benchmarks/gpu/CMakeLists.txt | 162 +------- libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 32 +- libc/benchmarks/gpu/timing/CMakeLists.txt | 14 +- libc/benchmarks/gpu/timing/timing.h | 2 +- libc/cmake/modules/LLVMLibCTestRules.cmake | 10 +- 7 files changed, 259 insertions(+), 382 deletions(-) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index a802e653a091e..8b51511e3b5cf 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -1,211 +1,210 @@ -if(NOT LIBC_TARGET_OS_IS_GPU) - find_package(Threads) - - set(LLVM_LINK_COMPONENTS - Support - TargetParser - ) - - #============================================================================== - # Add Unit Testing Support - #============================================================================== - - function(add_libc_benchmark_unittest target_name) - if(NOT LLVM_INCLUDE_TESTS) - return() - endif() - - cmake_parse_arguments( - "LIBC_BENCHMARKS_UNITTEST" - "" # No optional arguments - "SUITE" # Single value arguments - "SRCS;DEPENDS" # Multi-value arguments - ${ARGN} - ) - - add_executable(${target_name} - EXCLUDE_FROM_ALL - ${LIBC_BENCHMARKS_UNITTEST_SRCS} - ) - 
target_link_libraries(${target_name} - PRIVATE - llvm_gtest_main - llvm_gtest - ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} - ) - llvm_update_compile_flags(${target_name}) - - add_custom_command( - TARGET ${target_name} - POST_BUILD - COMMAND $ - ) - add_dependencies(libc-benchmark-util-tests ${target_name}) - endfunction() - - #============================================================================== - # Build Google Benchmark for libc - #============================================================================== - - include(ExternalProject) - ExternalProject_Add(google-benchmark-libc - EXCLUDE_FROM_ALL ON - PREFIX google-benchmark-libc - SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark - INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc - CMAKE_CACHE_ARGS - -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF - -DBENCHMARK_ENABLE_LTO:BOOL=OFF - -DBENCHMARK_ENABLE_TESTING:BOOL=OFF - -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} - -DBENCHMARK_FORCE_WERROR:BOOL=OFF - -DBENCHMARK_USE_LIBCXX:BOOL=OFF - -DCMAKE_BUILD_TYPE:STRING=Release - - -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} - -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} - -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} - - -DBUILD_SHARED_LIBS:BOOL=OFF - -DCMAKE_EXE_LINKER_FLAGS:STRING=-static - - -DCMAKE_CXX_STANDARD:STRING=14 - -DCMAKE_INSTALL_PREFIX:PATH= - ) - - add_custom_target(libc-benchmark-util-tests) - - # libc-benchmark - add_library(libc-benchmark - STATIC - EXCLUDE_FROM_ALL - LibcBenchmark.cpp - LibcBenchmark.h - ) +if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(gpu) + return() +endif() - target_include_directories(libc-benchmark - PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} - ) - target_link_libraries(libc-benchmark - PUBLIC - benchmark::benchmark - LLVMSupport - LLVMTargetParser - Threads::Threads - ) - 
add_dependencies(libc-benchmark google-benchmark-libc) - llvm_update_compile_flags(libc-benchmark) +find_package(Threads) - add_libc_benchmark_unittest(libc-benchmark-test - SRCS LibcBenchmarkTest.cpp - DEPENDS libc-benchmark +set(LLVM_LINK_COMPONENTS + Support + TargetParser ) - # libc-memory-benchmark - add_library(libc-memory-benchmark - STATIC - EXCLUDE_FROM_ALL - LibcMemoryBenchmark.cpp - LibcMemoryBenchmark.h - LibcFunctionPrototypes.h - MemorySizeDistributions.cpp - MemorySizeDistributions.h - ) - target_include_directories(libc-memory-benchmark - PUBLIC - ${CMAKE_CURRENT_SOURCE_DIR} +#============================================================================== +# Add Unit Testing Support +#============================================================================== + +function(add_libc_benchmark_unittest target_name) + if(NOT LLVM_INCLUDE_TESTS) + return() + endif() + + cmake_parse_arguments( + "LIBC_BENCHMARKS_UNITTEST" + "" # No optional arguments + "SUITE" # Single value arguments + "SRCS;DEPENDS" # Multi-value arguments + ${ARGN} ) - target_link_libraries(libc-memory-benchmark - PUBLIC - libc-benchmark - ) - llvm_update_compile_flags(libc-memory-benchmark) - add_libc_benchmark_unittest(libc-memory-benchmark-test - SRCS LibcMemoryBenchmarkTest.cpp - DEPENDS libc-memory-benchmark + add_executable(${target_name} + EXCLUDE_FROM_ALL + ${LIBC_BENCHMARKS_UNITTEST_SRCS} ) - - # json - add_library(json - STATIC - EXCLUDE_FROM_ALL - JSON.cpp - JSON.h + target_link_libraries(${target_name} + PRIVATE + llvm_gtest_main + llvm_gtest + ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} ) - target_link_libraries(json PUBLIC libc-memory-benchmark) - llvm_update_compile_flags(json) + llvm_update_compile_flags(${target_name}) - add_libc_benchmark_unittest(json-test - SRCS JSONTest.cpp - DEPENDS json + add_custom_command( + TARGET ${target_name} + POST_BUILD + COMMAND $ ) - - #============================================================================== - # Benchmarking tool - 
#============================================================================== - - # Benchmark all implementations that can run on the target CPU. - function(add_libc_multi_impl_benchmark name) - get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) - foreach(fq_config_name IN LISTS fq_implementations) - get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) - cpu_supports(can_run "${required_cpu_features}") - if(can_run) - set(benchmark_name ${fq_config_name}_benchmark) - add_executable(${benchmark_name} - EXCLUDE_FROM_ALL - LibcMemoryBenchmarkMain.cpp - ) - get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") - target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) - string(TOUPPER ${name} name_upper) - target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") - llvm_update_compile_flags(${benchmark_name}) - else() - message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") - endif() - endforeach() - endfunction() - - add_libc_multi_impl_benchmark(bcmp) - add_libc_multi_impl_benchmark(bzero) - add_libc_multi_impl_benchmark(memcmp) - add_libc_multi_impl_benchmark(memcpy) - add_libc_multi_impl_benchmark(memmove) - add_libc_multi_impl_benchmark(memset) - - #============================================================================== - # Google Benchmarking tool - #============================================================================== - - # This target uses the Google Benchmark facility to report throughput for llvm - # libc memory functions compiled for the host machine. This is useful to - # continuously monitor the performance of the memory functions. 
- add_executable(libc.benchmarks.memory_functions.opt_host - EXCLUDE_FROM_ALL - LibcMemoryGoogleBenchmarkMain.cpp - LibcDefaultImplementations.cpp - ) - target_link_libraries(libc.benchmarks.memory_functions.opt_host - PRIVATE - libc-memory-benchmark - libc.src.string.memcmp_opt_host.__internal__ - libc.src.string.bcmp_opt_host.__internal__ - libc.src.string.memcpy_opt_host.__internal__ - libc.src.string.memset_opt_host.__internal__ - libc.src.string.bzero_opt_host.__internal__ - libc.src.string.memmove_opt_host.__internal__ - benchmark_main + add_dependencies(libc-benchmark-util-tests ${target_name}) +endfunction() + +#============================================================================== +# Build Google Benchmark for libc +#============================================================================== + +include(ExternalProject) +ExternalProject_Add(google-benchmark-libc + EXCLUDE_FROM_ALL ON + PREFIX google-benchmark-libc + SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc + CMAKE_CACHE_ARGS + -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF + -DBENCHMARK_ENABLE_LTO:BOOL=OFF + -DBENCHMARK_ENABLE_TESTING:BOOL=OFF + -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} + -DBENCHMARK_FORCE_WERROR:BOOL=OFF + -DBENCHMARK_USE_LIBCXX:BOOL=OFF + -DCMAKE_BUILD_TYPE:STRING=Release + + -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} + -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} + -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} + + -DBUILD_SHARED_LIBS:BOOL=OFF + -DCMAKE_EXE_LINKER_FLAGS:STRING=-static + + -DCMAKE_CXX_STANDARD:STRING=14 + -DCMAKE_INSTALL_PREFIX:PATH= ) - llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host) - - add_subdirectory(automemcpy) -endif() -if(LIBC_TARGET_OS_IS_GPU) - add_subdirectory(gpu) -endif() 
+add_custom_target(libc-benchmark-util-tests) + +# libc-benchmark +add_library(libc-benchmark + STATIC + EXCLUDE_FROM_ALL + LibcBenchmark.cpp + LibcBenchmark.h +) + +target_include_directories(libc-benchmark + PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} +) +target_link_libraries(libc-benchmark + PUBLIC + benchmark::benchmark + LLVMSupport + LLVMTargetParser + Threads::Threads +) +add_dependencies(libc-benchmark google-benchmark-libc) +llvm_update_compile_flags(libc-benchmark) + +add_libc_benchmark_unittest(libc-benchmark-test + SRCS LibcBenchmarkTest.cpp + DEPENDS libc-benchmark +) + +# libc-memory-benchmark +add_library(libc-memory-benchmark + STATIC + EXCLUDE_FROM_ALL + LibcMemoryBenchmark.cpp + LibcMemoryBenchmark.h + LibcFunctionPrototypes.h + MemorySizeDistributions.cpp + MemorySizeDistributions.h +) +target_include_directories(libc-memory-benchmark + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} +) +target_link_libraries(libc-memory-benchmark + PUBLIC + libc-benchmark +) +llvm_update_compile_flags(libc-memory-benchmark) + +add_libc_benchmark_unittest(libc-memory-benchmark-test + SRCS LibcMemoryBenchmarkTest.cpp + DEPENDS libc-memory-benchmark +) + +# json +add_library(json + STATIC + EXCLUDE_FROM_ALL + JSON.cpp + JSON.h +) +target_link_libraries(json PUBLIC libc-memory-benchmark) +llvm_update_compile_flags(json) + +add_libc_benchmark_unittest(json-test + SRCS JSONTest.cpp + DEPENDS json +) + +#============================================================================== +# Benchmarking tool +#============================================================================== + +# Benchmark all implementations that can run on the target CPU. 
+function(add_libc_multi_impl_benchmark name) + get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) + foreach(fq_config_name IN LISTS fq_implementations) + get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) + cpu_supports(can_run "${required_cpu_features}") + if(can_run) + set(benchmark_name ${fq_config_name}_benchmark) + add_executable(${benchmark_name} + EXCLUDE_FROM_ALL + LibcMemoryBenchmarkMain.cpp + ) + get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") + target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) + string(TOUPPER ${name} name_upper) + target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") + llvm_update_compile_flags(${benchmark_name}) + else() + message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") + endif() + endforeach() +endfunction() + +add_libc_multi_impl_benchmark(bcmp) +add_libc_multi_impl_benchmark(bzero) +add_libc_multi_impl_benchmark(memcmp) +add_libc_multi_impl_benchmark(memcpy) +add_libc_multi_impl_benchmark(memmove) +add_libc_multi_impl_benchmark(memset) + +#============================================================================== +# Google Benchmarking tool +#============================================================================== + +# This target uses the Google Benchmark facility to report throughput for llvm +# libc memory functions compiled for the host machine. This is useful to +# continuously monitor the performance of the memory functions. 
+add_executable(libc.benchmarks.memory_functions.opt_host + EXCLUDE_FROM_ALL + LibcMemoryGoogleBenchmarkMain.cpp + LibcDefaultImplementations.cpp +) +target_link_libraries(libc.benchmarks.memory_functions.opt_host + PRIVATE + libc-memory-benchmark + libc.src.string.memcmp_opt_host.__internal__ + libc.src.string.bcmp_opt_host.__internal__ + libc.src.string.memcpy_opt_host.__internal__ + libc.src.string.memset_opt_host.__internal__ + libc.src.string.bzero_opt_host.__internal__ + libc.src.string.memmove_opt_host.__internal__ + benchmark_main +) +llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host) + +add_subdirectory(automemcpy) diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp index 94a0d897c9585..4f70d23a1e95e 100644 --- a/libc/benchmarks/gpu/BenchmarkLogger.cpp +++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp @@ -13,18 +13,21 @@ namespace libc_gpu_benchmarks { // cpp::string_view specialization template <> -BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string_view str) { +BenchmarkLogger & + BenchmarkLogger::operator<< (cpp::string_view str) { LIBC_NAMESPACE::write_to_stderr(str); return *this; } // cpp::string specialization -template <> BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string str) { +template <> +BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string str) { return *this << static_cast(str); } // const char* specialization -template <> BenchmarkLogger &BenchmarkLogger::operator<< (const char *str) { +template <> +BenchmarkLogger &BenchmarkLogger::operator<< (const char *str) { return *this << cpp::string_view(str); } @@ -66,15 +69,20 @@ template BenchmarkLogger &BenchmarkLogger::operator<< (short); template BenchmarkLogger &BenchmarkLogger::operator<< (int); template BenchmarkLogger &BenchmarkLogger::operator<< (long); template BenchmarkLogger &BenchmarkLogger::operator<< (long long); -template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned char); -template BenchmarkLogger 
&BenchmarkLogger::operator<< (unsigned short); -template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned int); -template BenchmarkLogger &BenchmarkLogger::operator<< (unsigned long); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned char); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned short); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned int); +template BenchmarkLogger & + BenchmarkLogger::operator<< (unsigned long); template BenchmarkLogger & BenchmarkLogger::operator<< (unsigned long long); #ifdef LIBC_TYPES_HAS_INT128 -template BenchmarkLogger &BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); +template BenchmarkLogger & + BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); #endif // LIBC_TYPES_HAS_INT128 template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<128>); template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<192>); diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index a18be27e33573..5dafe66bbd738 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -2,161 +2,25 @@ add_subdirectory(timing) add_custom_target(gpu-benchmark) -function (add_gpu_benchmark test_name) - if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1) - message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.") - return() - endif() - +function(add_benchmark benchmark_name) cmake_parse_arguments( - "GPU_BENCHMARK" - "" # No optional arguments - "SUITE" # Single value arguments - "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments + "BENCHMARK" + "LINK_LIBRARIES" # Optional arguments + "" # Single value arguments + "" # Multi-value arguments ${ARGN} ) - - if(NOT GPU_BENCHMARK_SUITE) - message(FATAL_ERROR "SUITE not specified for ${fq_target_name}") - endif() - if(NOT GPU_BENCHMARK_SRCS) - message(FATAL_ERROR "The SRCS list for add_gpu_benchmark is 
missing.") - endif() - - get_fq_target_name(${test_name} fq_target_name) - get_fq_target_name(${test_name}.libc fq_libc_target_name) # Stores the compiled libc + infrastructure archive to link in - get_fq_deps_list(fq_deps_list ${GPU_BENCHMARK_DEPENDS}) - list(APPEND fq_deps_list - # Hermetic tests use the platform's startup object. So, their deps also - # have to be collected. - libc.startup.${LIBC_TARGET_OS}.crt1 - # We always add the memory functions objects. This is because the - # compiler's codegen can emit calls to the C memory functions. - libc.src.string.bcmp - libc.src.string.bzero - libc.src.string.memcmp - libc.src.string.memcpy - libc.src.string.memmove - libc.src.string.memset - libc.src.__support.StringUtil.error_to_string - ) - - list(REMOVE_DUPLICATES fq_deps_list) - - # TODO: Instead of gathering internal object files from entrypoints, - # collect the object files with public names of entrypoints. - get_object_files_for_test( - link_object_files skipped_entrypoints_list ${fq_deps_list}) - if(skipped_entrypoints_list) - if(LIBC_CMAKE_VERBOSE_LOGGING) - set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: " - "${skipped_entrypoints_list}.") - endif() - return() - endif() - list(REMOVE_DUPLICATES link_object_files) - - # Make a library of all deps - add_library( - ${fq_target_name}.__libc__ - STATIC - EXCLUDE_FROM_ALL - ${link_object_files} - ) - set_target_properties(${fq_target_name}.__libc__ - PROPERTIES ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - set_target_properties(${fq_target_name}.__libc__ - PROPERTIES ARCHIVE_OUTPUT_NAME ${fq_target_name}.libc) - - set(fq_build_target_name ${fq_target_name}.__build__) - add_executable( - ${fq_build_target_name} - EXCLUDE_FROM_ALL - $<$:${link_object_files}> - ${GPU_BENCHMARK_SRCS} - ${GPU_BENCHMARK_HDRS} - ) - set_target_properties(${fq_build_target_name} - PROPERTIES - RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - ) - - 
_get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}") - target_include_directories(${fq_build_target_name} SYSTEM PRIVATE ${LIBC_INCLUDE_DIR}) - target_include_directories(${fq_build_target_name} PRIVATE ${LIBC_SOURCE_DIR}) - _get_hermetic_test_compile_options(compile_options "${GPU_BENCHMARK_COMPILE_OPTIONS}") - target_compile_options(${fq_build_target_name} PRIVATE ${compile_options}) - - set(link_libraries "") - foreach(lib in LISTS GPU_BENCHMARK_LINK_LIBRARIES) - if(TARGET ${lib}.hermetic) - list(APPEND link_libraries ${lib}.hermetic) - else() - list(APPEND link_libraries ${lib}) - endif() - endforeach() - - if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) - target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} - -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto -Wno-multi-gpu - "-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static - "-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}") - elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX) - # We need to use the internal object versions for NVPTX. - set(internal_suffix ".__internal__") - target_link_options(${fq_build_target_name} PRIVATE - ${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu - "-Wl,--suppress-stack-size-warning" - -march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static - "--cuda-path=${LIBC_CUDA_ROOT}") - elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) - target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) - else() - # Older version of gcc does not support `nostdlib++` flag. We use - # `nostdlib` and link against libgcc_s, which cannot be linked statically. - target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib) - list(APPEND link_libraries ${LIBGCC_S_LOCATION}) - endif() - - # link libraries for the BUILD target (i.e. 
to compile the test) - target_link_libraries( - ${fq_build_target_name} - PRIVATE - libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} - ${link_libraries} - # LibcTest.hermetic + add_libc_hermetic_test( + ${benchmark_name} + IS_BENCHMARK + LINK_LIBRARIES LibcGpuBenchmark.hermetic - # LibcHermeticTestSupport.hermetic - LibcHermeticTestSupport.hermetic - # The NVIDIA 'nvlink' linker does not currently support static libraries. - $<$>:${fq_target_name}.__libc__>) - - add_dependencies(${fq_build_target_name} - LibcGpuBenchmark.hermetic - ${fq_deps_list}) - - # Tests on the GPU require an external loader utility to launch the kernel. - if(TARGET libc.utils.gpu.loader) - add_dependencies(${fq_build_target_name} libc.utils.gpu.loader) - get_target_property(gpu_loader_exe libc.utils.gpu.loader "EXECUTABLE") - endif() - - set(test_cmd ${GPU_BENCHMARK_ENV} - $<$:${gpu_loader_exe}> ${CMAKE_CROSSCOMPILING_EMULATOR} ${GPU_BENCHMARK_LOADER_ARGS} - $ ${GPU_BENCHMARK_ARGS}) - add_custom_target( - ${fq_target_name} - COMMAND ${test_cmd} - COMMAND_EXPAND_LISTS - COMMENT "Running GPU benchmark ${fq_target_name}" + ${BENCHMARK_LINK_LIBRARIES} + ${BENCHMARK_UNPARSED_ARGUMENTS} ) - - # Make this benchmark part of its suite - add_dependencies(${GPU_BENCHMARK_SUITE} ${fq_target_name}) - # Remember to make this benchmark part of the umbrella command + get_fq_target_name(${benchmark_name} fq_target_name) add_dependencies(gpu-benchmark ${fq_target_name}) -endfunction(add_gpu_benchmark) +endfunction(add_benchmark) add_unittest_framework_library( LibcGpuBenchmark diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt index ab2f6cdf0c7fd..8d448b8ced955 100644 --- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -1,21 +1,21 @@ add_custom_target(libc-gpu-ctype-benchmarks) -add_gpu_benchmark( - isalnum_benchmark - SUITE - libc-gpu-ctype-benchmarks - SRCS - isalnum_benchmark.cpp - DEPENDS - 
libc.src.ctype.isalnum +add_benchmark( + isalnum_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalnum_benchmark.cpp + DEPENDS + libc.src.ctype.isalnum ) -add_gpu_benchmark( - isalpha_benchmark - SUITE - libc-gpu-ctype-benchmarks - SRCS - isalpha_benchmark.cpp - DEPENDS - libc.src.ctype.isalpha +add_benchmark( + isalpha_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalpha_benchmark.cpp + DEPENDS + libc.src.ctype.isalpha ) diff --git a/libc/benchmarks/gpu/timing/CMakeLists.txt b/libc/benchmarks/gpu/timing/CMakeLists.txt index 0e6a5a6b47968..8bbc7e33f122a 100644 --- a/libc/benchmarks/gpu/timing/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/CMakeLists.txt @@ -1,12 +1,12 @@ foreach(target nvptx) - add_subdirectory(${target}) - list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing) + add_subdirectory(${target}) + list(APPEND target_gpu_timing libc.benchmarks.gpu.timing.${target}.${target}_timing) endforeach() add_header_library( - timing - HDRS - timing.h - DEPENDS - ${target_gpu_timing} + timing + HDRS + timing.h + DEPENDS + ${target_gpu_timing} ) diff --git a/libc/benchmarks/gpu/timing/timing.h b/libc/benchmarks/gpu/timing/timing.h index c47bb0d9ebb55..180ea77954ae5 100644 --- a/libc/benchmarks/gpu/timing/timing.h +++ b/libc/benchmarks/gpu/timing/timing.h @@ -12,7 +12,7 @@ #include "src/__support/macros/properties/architectures.h" #if defined(LIBC_TARGET_ARCH_IS_AMDGPU) -#error "amdgpu not yet supported +#error "amdgpu not yet supported" #elif defined(LIBC_TARGET_ARCH_IS_NVPTX) #include "nvptx/timing.h" #else diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index c8d7c8a2b1c7c..278137774e089 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -550,7 +550,7 @@ function(add_libc_hermetic_test test_name) endif() cmake_parse_arguments( "HERMETIC_TEST" - "" # No optional arguments + "IS_BENCHMARK" # Optional 
arguments "SUITE" # Single value arguments "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments ${ARGN} @@ -651,6 +651,13 @@ function(add_libc_hermetic_test test_name) endif() endforeach() + # Benchmarks requires a separate library with a different `main` function + if(HERMETIC_TEST_IS_BENCHMARK) + list(APPEND link_libraries LibcGpuBenchmark.hermetic) + else() + list(APPEND link_libraries LibcTest.hermetic) + endif() + if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} @@ -678,7 +685,6 @@ function(add_libc_hermetic_test test_name) PRIVATE libc.startup.${LIBC_TARGET_OS}.crt1${internal_suffix} ${link_libraries} - LibcTest.hermetic LibcHermeticTestSupport.hermetic # The NVIDIA 'nvlink' linker does not currently support static libraries. $<$>:${fq_target_name}.__libc__>) From 1129ccc33651c46ec22c6cd3d679abbb1829b3ba Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Thu, 16 May 2024 13:08:37 -0400 Subject: [PATCH 03/18] fix code style --- libc/benchmarks/gpu/CMakeLists.txt | 1 + libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 87 ++++++++------- libc/benchmarks/gpu/LibcGpuBenchmark.h | 105 +++++++++---------- libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 2 +- 4 files changed, 97 insertions(+), 98 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 5dafe66bbd738..db2953f6fcf23 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -37,6 +37,7 @@ add_unittest_framework_library( libc.src.__support.CPP.string libc.src.__support.CPP.string_view libc.src.__support.CPP.type_traits + libc.src.__support.CPP.functional libc.src.__support.fixed_point.fx_rep libc.src.__support.macros.properties.types libc.src.__support.OSUtil.osutil diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index d37f5a0a53a70..087b59689d90b 100644 --- 
a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -3,67 +3,66 @@ namespace LIBC_NAMESPACE { namespace libc_gpu_benchmarks { -Benchmark *Benchmark::Start = nullptr; -Benchmark *Benchmark::End = nullptr; +Benchmark *Benchmark::start = nullptr; +Benchmark *Benchmark::end = nullptr; -void Benchmark::addBenchmark(Benchmark *B) { - if (End == nullptr) { - Start = B; - End = B; +void Benchmark::add_benchmark(Benchmark *benchmark) { + if (end == nullptr) { + start = benchmark; + end = benchmark; return; } - - End->Next = B; - End = B; + end->next = benchmark; + end = benchmark; } -int Benchmark::runBenchmarks() { - for (Benchmark *B = Start; B != nullptr; B = B->Next) { - B->Run(); - } - +int Benchmark::run_benchmarks() { + for (Benchmark *b = start; b != nullptr; b = b->next) + b->run(); return 0; } -BenchmarkResult benchmark(const BenchmarkOptions &Options, - uint64_t (*WrapperFunc)()) { - BenchmarkResult Result; - RuntimeEstimationProgression REP; - size_t TotalIterations = 0; - size_t Iterations = Options.InitialIterations; - if (Iterations < (uint32_t)1) { - Iterations = 1; - } - size_t Samples = 0; - uint64_t BestGuess = 0; - uint64_t TotalCycles = 0; +BenchmarkResult benchmark(const BenchmarkOptions &options, + cpp::function wrapper_func) { + BenchmarkResult result; + RuntimeEstimationProgression rep; + size_t total_iterations = 0; + size_t iterations = options.initial_iterations; + if (iterations < (uint32_t)1) + iterations = 1; + + size_t samples = 0; + uint64_t best_guess = 0; + uint64_t total_cycles = 0; for (;;) { - uint64_t SampleCycles = 0; - for (uint32_t i = 0; i < Iterations; i++) { - auto overhead = LIBC_NAMESPACE::overhead(); - uint64_t result = WrapperFunc() - overhead; - SampleCycles += result; + uint64_t sample_cycles = 0; + uint64_t overhead = LIBC_NAMESPACE::overhead(); + for (uint32_t i = 0; i < iterations; i++) { + uint64_t result = wrapper_func() - overhead; + sample_cycles += result; } - Samples++; - 
TotalCycles += SampleCycles; - TotalIterations += Iterations; - const double ChangeRatio = - REP.ComputeImprovement({Iterations, SampleCycles}); - BestGuess = REP.CurrentEstimation; + samples++; + total_cycles += sample_cycles; + total_iterations += iterations; + const double change_ratio = + rep.compute_improvement({iterations, sample_cycles}); + best_guess = rep.current_estimation; - if (Samples >= Options.MaxSamples || Iterations >= Options.MaxIterations) { + if (samples >= options.max_samples || + iterations >= options.max_iterations) { break; - } else if (Samples >= Options.MinSamples && ChangeRatio < Options.Epsilon) { + } else if (samples >= options.min_samples && + change_ratio < options.epsilon) { break; } - Iterations *= Options.ScalingFactor; + iterations *= options.scaling_factor; } - Result.Cycles = BestGuess; - Result.Samples = Samples; - Result.TotalIterations = TotalIterations; - return Result; + result.cycles = best_guess; + result.samples = samples; + result.total_iterations = total_iterations; + return result; }; } // namespace libc_gpu_benchmarks diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index ccbbe3629dbda..3d762631f2d96 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -1,9 +1,10 @@ #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H -#include "benchmarks/gpu/timing/timing.h" - #include "benchmarks/gpu/BenchmarkLogger.h" +#include "benchmarks/gpu/timing/timing.h" +#include "src/__support/CPP/functional.h" +#include "src/__support/CPP/string_view.h" #include #include @@ -13,104 +14,102 @@ namespace LIBC_NAMESPACE { namespace libc_gpu_benchmarks { struct BenchmarkOptions { - uint32_t InitialIterations = 1; - uint32_t MaxIterations = 10000000; - uint32_t MinSamples = 4; - uint32_t MaxSamples = 1000; - double Epsilon = 0.01; - double ScalingFactor = 1.4; + uint32_t initial_iterations = 1; + uint32_t 
max_iterations = 10000000; + uint32_t min_samples = 4; + uint32_t max_samples = 1000; + double epsilon = 0.01; + double scaling_factor = 1.4; }; struct Measurement { - size_t Iterations = 0; - uint64_t ElapsedCycles = 0; + size_t iterations = 0; + uint64_t elapsed_cycles = 0; }; class RefinableRuntimeEstimation { - uint64_t TotalCycles = 0; - size_t TotalIterations = 0; + uint64_t total_cycles = 0; + size_t total_iterations = 0; public: - uint64_t Update(const Measurement &M) { - TotalCycles += M.ElapsedCycles; - TotalIterations += M.Iterations; - return TotalCycles / TotalIterations; + uint64_t update(const Measurement &M) { + total_cycles += M.elapsed_cycles; + total_iterations += M.iterations; + return total_cycles / total_iterations; } }; // Tracks the progression of the runtime estimation class RuntimeEstimationProgression { - RefinableRuntimeEstimation RRE; + RefinableRuntimeEstimation rre; public: - uint64_t CurrentEstimation = 0; + uint64_t current_estimation = 0; - double ComputeImprovement(const Measurement &M) { - const uint64_t NewEstimation = RRE.Update(M); - double Ratio = ((double)CurrentEstimation / NewEstimation) - 1.0; + double compute_improvement(const Measurement &M) { + const uint64_t new_estimation = rre.update(M); + double ratio = ((double)current_estimation / new_estimation) - 1.0; // Get absolute value - if (Ratio < 0) { - Ratio *= -1; - } + if (ratio < 0) + ratio *= -1; - CurrentEstimation = NewEstimation; - return Ratio; + current_estimation = new_estimation; + return ratio; } }; struct BenchmarkResult { - uint64_t Cycles = 0; - size_t Samples = 0; - size_t TotalIterations = 0; + uint64_t cycles = 0; + size_t samples = 0; + size_t total_iterations = 0; }; -BenchmarkResult benchmark(const BenchmarkOptions &Options, - uint64_t (*WrapperFunc)()); +BenchmarkResult benchmark(const BenchmarkOptions &options, + cpp::function wrapper_func); class Benchmark { - Benchmark *Next = nullptr; + Benchmark *next = nullptr; public: virtual ~Benchmark() {} 
- virtual void SetUp() {} - virtual void TearDown() {} + virtual void set_up() {} + virtual void tear_down() {} - static int runBenchmarks(); + static int run_benchmarks(); protected: - static void addBenchmark(Benchmark *); + static void add_benchmark(Benchmark *); private: - virtual void Run() = 0; - virtual const char *getName() const = 0; + virtual void run() = 0; + virtual const cpp::string_view get_name() const = 0; - static Benchmark *Start; - static Benchmark *End; + static Benchmark *start; + static Benchmark *end; }; class WrapperBenchmark : public Benchmark { - using BenchmarkWrapperFunction = uint64_t (*)(); - BenchmarkWrapperFunction Func; - const char *Name; + const cpp::function func; + const cpp::string_view name; public: - WrapperBenchmark(BenchmarkWrapperFunction Func, char const *Name) - : Func(Func), Name(Name) { - addBenchmark(this); + WrapperBenchmark(cpp::function func, char const *name) + : func(func), name(name) { + add_benchmark(this); } private: - void Run() override { - BenchmarkOptions Options; - auto result = benchmark(Options, Func); + void run() override { + BenchmarkOptions options; + auto result = benchmark(options, func); constexpr auto GREEN = "\033[32m"; constexpr auto RESET = "\033[0m"; - blog << GREEN << "[ RUN ] " << RESET << Name << '\n'; - blog << GREEN << "[ OK ] " << RESET << Name << ": " << result.Cycles - << " cycles, " << result.TotalIterations << " iterations\n"; + blog << GREEN << "[ RUN ] " << RESET << name << '\n'; + blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles + << " cycles, " << result.total_iterations << " iterations\n"; } - const char *getName() const override { return Name; } + const cpp::string_view get_name() const override { return name; } }; } // namespace libc_gpu_benchmarks } // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp index c971b00cc9a1b..510fd13210494 100644 --- 
a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp @@ -1,6 +1,6 @@ #include "LibcGpuBenchmark.h" extern "C" int main(int argc, char **argv, char **envp) { - LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::runBenchmarks(); + LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::run_benchmarks(); return 0; } From 5c46009bbbddd2114a11fabd2e3afbebed7488f7 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Fri, 17 May 2024 16:22:23 -0400 Subject: [PATCH 04/18] measure walltime, standard deviation, min, and max --- libc/benchmarks/gpu/CMakeLists.txt | 7 +++++ libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 31 ++++++++++++++++--- libc/benchmarks/gpu/LibcGpuBenchmark.h | 12 ++++++- .../gpu/src/ctype/isalnum_benchmark.cpp | 13 -------- libc/benchmarks/gpu/timing/nvptx/timing.h | 13 +++----- 5 files changed, 50 insertions(+), 26 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index db2953f6fcf23..b9ca85393cc2e 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -10,6 +10,9 @@ function(add_benchmark benchmark_name) "" # Multi-value arguments ${ARGN} ) + if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) + message(FATAL_ERROR "target does not support clock") + endif() add_libc_hermetic_test( ${benchmark_name} IS_BENCHMARK @@ -38,10 +41,14 @@ add_unittest_framework_library( libc.src.__support.CPP.string_view libc.src.__support.CPP.type_traits libc.src.__support.CPP.functional + libc.src.__support.CPP.limits + libc.src.__support.CPP.algorithm libc.src.__support.fixed_point.fx_rep libc.src.__support.macros.properties.types libc.src.__support.OSUtil.osutil libc.src.__support.uint128 + libc.src.___support.FPUtil.sqrt + libc.src.time.clock libc.benchmarks.gpu.timing.timing ) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 087b59689d90b..3ecff18884b34 100644 --- 
a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -1,4 +1,7 @@ #include "LibcGpuBenchmark.h" +#include "src/__support/CPP/algorithm.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/time/gpu/time_utils.h" namespace LIBC_NAMESPACE { namespace libc_gpu_benchmarks { @@ -32,27 +35,42 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, iterations = 1; size_t samples = 0; + uint64_t total_time = 0; uint64_t best_guess = 0; uint64_t total_cycles = 0; + uint64_t cycles_2 = 0; + uint64_t min = UINT_MAX; + uint64_t max = 0; for (;;) { uint64_t sample_cycles = 0; uint64_t overhead = LIBC_NAMESPACE::overhead(); + const clock_t start = (double)clock(); for (uint32_t i = 0; i < iterations; i++) { - uint64_t result = wrapper_func() - overhead; + auto wrapper_intermediate = wrapper_func(); + uint64_t result = wrapper_intermediate - overhead; + max = cpp::max(max, result); + min = cpp::min(min, result); sample_cycles += result; } - + const clock_t end = clock(); + const clock_t duration_ns = + ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; + total_time += duration_ns; samples++; total_cycles += sample_cycles; + cycles_2 += sample_cycles * sample_cycles; + total_iterations += iterations; const double change_ratio = rep.compute_improvement({iterations, sample_cycles}); best_guess = rep.current_estimation; if (samples >= options.max_samples || - iterations >= options.max_iterations) { + iterations >= options.max_iterations || + total_time >= options.max_duration) { break; - } else if (samples >= options.min_samples && + } else if (total_time >= options.min_duration && + samples >= options.min_samples && change_ratio < options.epsilon) { break; } @@ -60,8 +78,13 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, iterations *= options.scaling_factor; } result.cycles = best_guess; + result.standard_deviation = fputil::sqrt((double)cycles_2 / total_iterations - + (best_guess * best_guess)); + result.min = min; 
+ result.max = max; result.samples = samples; result.total_iterations = total_iterations; + result.total_time = total_time; return result; }; diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 3d762631f2d96..798ae06086b1a 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -4,7 +4,9 @@ #include "benchmarks/gpu/BenchmarkLogger.h" #include "benchmarks/gpu/timing/timing.h" #include "src/__support/CPP/functional.h" +#include "src/__support/CPP/limits.h" #include "src/__support/CPP/string_view.h" +#include "src/time/clock.h" #include #include @@ -18,6 +20,8 @@ struct BenchmarkOptions { uint32_t max_iterations = 10000000; uint32_t min_samples = 4; uint32_t max_samples = 1000; + uint64_t min_duration = 0; // in nanoseconds (ns) + uint64_t max_duration = 1000 * 1000 * 1000; // 1e9 nanoseconds = 1 second double epsilon = 0.01; double scaling_factor = 1.4; }; @@ -61,8 +65,12 @@ class RuntimeEstimationProgression { struct BenchmarkResult { uint64_t cycles = 0; + double standard_deviation = 0; + uint64_t min = UINT_MAX; + uint64_t max = 0; size_t samples = 0; size_t total_iterations = 0; + clock_t total_time = 0; }; BenchmarkResult benchmark(const BenchmarkOptions &options, @@ -107,7 +115,9 @@ class WrapperBenchmark : public Benchmark { constexpr auto RESET = "\033[0m"; blog << GREEN << "[ RUN ] " << RESET << name << '\n'; blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles - << " cycles, " << result.total_iterations << " iterations\n"; + << " cycles, " << result.min << " min, " << result.max << " max, " + << result.total_iterations << " iterations, " << result.total_time + << " ns, " << (long)result.standard_deviation << " stddev\n"; } const cpp::string_view get_name() const override { return name; } }; diff --git a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp index 8d9c958bb7ed4..4050bc0ec77b9 100644 --- 
a/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp +++ b/libc/benchmarks/gpu/src/ctype/isalnum_benchmark.cpp @@ -7,16 +7,3 @@ uint64_t BM_IsAlnum() { return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x); } BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWrapper, BM_IsAlnum); - -[[gnu::noinline]] static uint64_t single_input_function(int x) { - asm volatile("" ::"r"(x)); // prevent the compiler from optimizing out x - return x; -} - -uint64_t BM_IsAlnumWithOverhead() { - char x = 'c'; - return LIBC_NAMESPACE::latency(LIBC_NAMESPACE::isalnum, x) - - LIBC_NAMESPACE::latency(single_input_function, 0); -} -BENCHMARK(LlvmLibcIsAlNumGpuBenchmark, IsAlnumWithOverhead, - BM_IsAlnumWithOverhead); diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index 008432e6aa1d2..001bdd3686062 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -37,12 +37,7 @@ namespace LIBC_NAMESPACE { // Stimulate a simple function and obtain its latency in clock cycles on the // system. This function cannot be inlined or else it will disturb the very -// deliccate balance of hard-coded dependencies. -// -// FIXME: This does not work in general on NVPTX because of further -// optimizations ptxas performs. The only way to get consistent results is to -// pass and extra "SHELL:-Xcuda-ptxas -O0" to CMake's compiler flag. This -// negatively implacts performance but it is at least stable. +// delicate balance of hard-coded dependencies. template [[gnu::noinline]] static LIBC_INLINE uint64_t latency(F f, T t) { // We need to store the input somewhere to guarantee that the compiler will @@ -53,7 +48,7 @@ template // Get the current timestamp from the clock. 
gpu::sync_threads(); - __nvvm_membar_sys(); + gpu::memory_fence(); uint64_t start = gpu::processor_clock(); // This forces the compiler to load the input argument and run the clock cycle @@ -70,7 +65,7 @@ template // Obtain the current timestamp after running the calculation and force // ordering. uint64_t stop = gpu::processor_clock(); - __nvvm_membar_sys(); + gpu::memory_fence(); gpu::sync_threads(); asm volatile("" ::"r"(stop)); volatile T output = result; @@ -88,6 +83,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { asm volatile("" ::"r"(arg), "r"(arg2)); gpu::sync_threads(); + gpu::memory_fence(); uint64_t start = gpu::processor_clock(); asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start)); @@ -97,6 +93,7 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); uint64_t stop = gpu::processor_clock(); + gpu::memory_fence(); gpu::sync_threads(); asm volatile("" ::"r"(stop)); volatile auto output = result; From e50ea99befc4279ea1987c47cf5084a55f2f8a47 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Wed, 22 May 2024 16:08:39 -0400 Subject: [PATCH 05/18] fixed vector for benchmarks --- libc/benchmarks/CMakeLists.txt | 114 ++++++++++++----------- libc/benchmarks/gpu/CMakeLists.txt | 1 + libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 16 +--- libc/benchmarks/gpu/LibcGpuBenchmark.h | 4 +- 4 files changed, 65 insertions(+), 70 deletions(-) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 8b51511e3b5cf..221d4e11d383d 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -1,14 +1,14 @@ if(LIBC_TARGET_OS_IS_GPU) - add_subdirectory(gpu) - return() + add_subdirectory(gpu) + return() endif() find_package(Threads) set(LLVM_LINK_COMPONENTS - Support - TargetParser - ) + Support + TargetParser +) #============================================================================== # Add Unit Testing Support @@ -16,35 +16,37 @@ 
set(LLVM_LINK_COMPONENTS function(add_libc_benchmark_unittest target_name) if(NOT LLVM_INCLUDE_TESTS) - return() + return() endif() - cmake_parse_arguments( - "LIBC_BENCHMARKS_UNITTEST" - "" # No optional arguments - "SUITE" # Single value arguments - "SRCS;DEPENDS" # Multi-value arguments - ${ARGN} + cmake_parse_arguments(if(LIBC_TARGET_OS_IS_GPU) + add_subdirectory(gpu) + return() + "LIBC_BENCHMARKS_UNITTEST" + "" # No optional arguments + "SUITE" # Single value arguments + "SRCS;DEPENDS" # Multi-value arguments + ${ARGN} ) - add_executable(${target_name} - EXCLUDE_FROM_ALL - ${LIBC_BENCHMARKS_UNITTEST_SRCS} - ) - target_link_libraries(${target_name} - PRIVATE - llvm_gtest_main - llvm_gtest - ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} - ) - llvm_update_compile_flags(${target_name}) - - add_custom_command( - TARGET ${target_name} - POST_BUILD - COMMAND $ - ) - add_dependencies(libc-benchmark-util-tests ${target_name}) + add_executable(${target_name} + EXCLUDE_FROM_ALL + ${LIBC_BENCHMARKS_UNITTEST_SRCS} + ) + target_link_libraries(${target_name} + PRIVATE + llvm_gtest_main + llvm_gtest + ${LIBC_BENCHMARKS_UNITTEST_DEPENDS} + ) + llvm_update_compile_flags(${target_name}) + + add_custom_command( + TARGET ${target_name} + POST_BUILD + COMMAND $ + ) + add_dependencies(libc-benchmark-util-tests ${target_name}) endfunction() #============================================================================== @@ -53,32 +55,32 @@ endfunction() include(ExternalProject) ExternalProject_Add(google-benchmark-libc - EXCLUDE_FROM_ALL ON - PREFIX google-benchmark-libc - SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark - INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc - CMAKE_CACHE_ARGS - -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF - -DBENCHMARK_ENABLE_LTO:BOOL=OFF - -DBENCHMARK_ENABLE_TESTING:BOOL=OFF - -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} - -DBENCHMARK_FORCE_WERROR:BOOL=OFF - -DBENCHMARK_USE_LIBCXX:BOOL=OFF - -DCMAKE_BUILD_TYPE:STRING=Release - - 
-DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} - -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} - -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} - - -DBUILD_SHARED_LIBS:BOOL=OFF - -DCMAKE_EXE_LINKER_FLAGS:STRING=-static - - -DCMAKE_CXX_STANDARD:STRING=14 - -DCMAKE_INSTALL_PREFIX:PATH= - ) + EXCLUDE_FROM_ALL ON + PREFIX google-benchmark-libc + SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc + CMAKE_CACHE_ARGS + -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF + -DBENCHMARK_ENABLE_LTO:BOOL=OFF + -DBENCHMARK_ENABLE_TESTING:BOOL=OFF + -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} + -DBENCHMARK_FORCE_WERROR:BOOL=OFF + -DBENCHMARK_USE_LIBCXX:BOOL=OFF + -DCMAKE_BUILD_TYPE:STRING=Release + + -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} + -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} + -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} + + -DBUILD_SHARED_LIBS:BOOL=OFF + -DCMAKE_EXE_LINKER_FLAGS:STRING=-static + + -DCMAKE_CXX_STANDARD:STRING=14 + -DCMAKE_INSTALL_PREFIX:PATH= + ) add_custom_target(libc-benchmark-util-tests) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index b9ca85393cc2e..51fc267df807d 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -48,6 +48,7 @@ add_unittest_framework_library( libc.src.__support.OSUtil.osutil libc.src.__support.uint128 libc.src.___support.FPUtil.sqrt + libc.src.__support.fixedvector libc.src.time.clock libc.benchmarks.gpu.timing.timing ) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 3ecff18884b34..f8021c873242f 100644 --- 
a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -6,22 +6,16 @@ namespace LIBC_NAMESPACE { namespace libc_gpu_benchmarks { -Benchmark *Benchmark::start = nullptr; -Benchmark *Benchmark::end = nullptr; +FixedVector benchmarks_to_run; void Benchmark::add_benchmark(Benchmark *benchmark) { - if (end == nullptr) { - start = benchmark; - end = benchmark; - return; - } - end->next = benchmark; - end = benchmark; + benchmarks_to_run.push_back(benchmark); } int Benchmark::run_benchmarks() { - for (Benchmark *b = start; b != nullptr; b = b->next) - b->run(); + for (auto it = benchmarks_to_run.rbegin(), e = benchmarks_to_run.rend(); + it != e; ++it) + (*it)->run(); return 0; } diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 798ae06086b1a..459e4d9b6ea98 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -6,6 +6,7 @@ #include "src/__support/CPP/functional.h" #include "src/__support/CPP/limits.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/fixedvector.h" #include "src/time/clock.h" #include @@ -92,9 +93,6 @@ class Benchmark { private: virtual void run() = 0; virtual const cpp::string_view get_name() const = 0; - - static Benchmark *start; - static Benchmark *end; }; class WrapperBenchmark : public Benchmark { From a588fc5b2eac6e84dd0dc4f62bebfc428a695845 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Wed, 22 May 2024 16:27:44 -0400 Subject: [PATCH 06/18] refactor cmake files --- libc/benchmarks/CMakeLists.txt | 188 +++++++++---------- libc/benchmarks/gpu/CMakeLists.txt | 3 +- libc/benchmarks/gpu/src/ctype/CMakeLists.txt | 28 +-- libc/cmake/modules/LLVMLibCTestRules.cmake | 27 +-- 4 files changed, 122 insertions(+), 124 deletions(-) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 221d4e11d383d..0234ccb2a7a78 100644 --- a/libc/benchmarks/CMakeLists.txt +++ 
b/libc/benchmarks/CMakeLists.txt @@ -8,26 +8,24 @@ find_package(Threads) set(LLVM_LINK_COMPONENTS Support TargetParser -) + ) #============================================================================== # Add Unit Testing Support #============================================================================== function(add_libc_benchmark_unittest target_name) - if(NOT LLVM_INCLUDE_TESTS) + if(NOT LLVM_INCLUDE_TESTS) return() - endif() + endif() - cmake_parse_arguments(if(LIBC_TARGET_OS_IS_GPU) - add_subdirectory(gpu) - return() + cmake_parse_arguments( "LIBC_BENCHMARKS_UNITTEST" "" # No optional arguments "SUITE" # Single value arguments "SRCS;DEPENDS" # Multi-value arguments ${ARGN} - ) + ) add_executable(${target_name} EXCLUDE_FROM_ALL @@ -55,99 +53,99 @@ endfunction() include(ExternalProject) ExternalProject_Add(google-benchmark-libc - EXCLUDE_FROM_ALL ON - PREFIX google-benchmark-libc - SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark - INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc - CMAKE_CACHE_ARGS - -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF - -DBENCHMARK_ENABLE_LTO:BOOL=OFF - -DBENCHMARK_ENABLE_TESTING:BOOL=OFF - -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} - -DBENCHMARK_FORCE_WERROR:BOOL=OFF - -DBENCHMARK_USE_LIBCXX:BOOL=OFF - -DCMAKE_BUILD_TYPE:STRING=Release - - -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} - -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} - -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} - - -DBUILD_SHARED_LIBS:BOOL=OFF - -DCMAKE_EXE_LINKER_FLAGS:STRING=-static - - -DCMAKE_CXX_STANDARD:STRING=14 - -DCMAKE_INSTALL_PREFIX:PATH= - ) + EXCLUDE_FROM_ALL ON + PREFIX google-benchmark-libc + SOURCE_DIR ${LLVM_THIRD_PARTY_DIR}/benchmark + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/google-benchmark-libc + CMAKE_CACHE_ARGS + -DBENCHMARK_ENABLE_EXCEPTIONS:BOOL=OFF + 
-DBENCHMARK_ENABLE_LTO:BOOL=OFF + -DBENCHMARK_ENABLE_TESTING:BOOL=OFF + -DBENCHMARK_ENABLE_WERROR:BOOL=${LLVM_ENABLE_WERROR} + -DBENCHMARK_FORCE_WERROR:BOOL=OFF + -DBENCHMARK_USE_LIBCXX:BOOL=OFF + -DCMAKE_BUILD_TYPE:STRING=Release + + -DCMAKE_SYSTEM_NAME:STRING=${CMAKE_SYSTEM_NAME} + -DCMAKE_SYSTEM_PROCESSOR:STRING=${CMAKE_SYSTEM_PROCESSOR} + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} + -DCMAKE_FIND_ROOT_PATH:STRING=${CMAKE_FIND_ROOT_PATH} + + -DBUILD_SHARED_LIBS:BOOL=OFF + -DCMAKE_EXE_LINKER_FLAGS:STRING=-static + + -DCMAKE_CXX_STANDARD:STRING=14 + -DCMAKE_INSTALL_PREFIX:PATH= + ) add_custom_target(libc-benchmark-util-tests) # libc-benchmark add_library(libc-benchmark - STATIC - EXCLUDE_FROM_ALL - LibcBenchmark.cpp - LibcBenchmark.h + STATIC + EXCLUDE_FROM_ALL + LibcBenchmark.cpp + LibcBenchmark.h ) target_include_directories(libc-benchmark - PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} + PUBLIC ${LLVM_INCLUDE_DIR} ${LLVM_MAIN_INCLUDE_DIR} ) target_link_libraries(libc-benchmark - PUBLIC - benchmark::benchmark - LLVMSupport - LLVMTargetParser - Threads::Threads + PUBLIC + benchmark::benchmark + LLVMSupport + LLVMTargetParser + Threads::Threads ) add_dependencies(libc-benchmark google-benchmark-libc) llvm_update_compile_flags(libc-benchmark) add_libc_benchmark_unittest(libc-benchmark-test - SRCS LibcBenchmarkTest.cpp - DEPENDS libc-benchmark + SRCS LibcBenchmarkTest.cpp + DEPENDS libc-benchmark ) # libc-memory-benchmark add_library(libc-memory-benchmark - STATIC - EXCLUDE_FROM_ALL - LibcMemoryBenchmark.cpp - LibcMemoryBenchmark.h - LibcFunctionPrototypes.h - MemorySizeDistributions.cpp - MemorySizeDistributions.h + STATIC + EXCLUDE_FROM_ALL + LibcMemoryBenchmark.cpp + LibcMemoryBenchmark.h + LibcFunctionPrototypes.h + MemorySizeDistributions.cpp + MemorySizeDistributions.h ) target_include_directories(libc-memory-benchmark - PUBLIC - 
${CMAKE_CURRENT_SOURCE_DIR} + PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR} ) target_link_libraries(libc-memory-benchmark - PUBLIC - libc-benchmark + PUBLIC + libc-benchmark ) llvm_update_compile_flags(libc-memory-benchmark) add_libc_benchmark_unittest(libc-memory-benchmark-test - SRCS LibcMemoryBenchmarkTest.cpp - DEPENDS libc-memory-benchmark + SRCS LibcMemoryBenchmarkTest.cpp + DEPENDS libc-memory-benchmark ) # json add_library(json - STATIC - EXCLUDE_FROM_ALL - JSON.cpp - JSON.h + STATIC + EXCLUDE_FROM_ALL + JSON.cpp + JSON.h ) target_link_libraries(json PUBLIC libc-memory-benchmark) llvm_update_compile_flags(json) add_libc_benchmark_unittest(json-test - SRCS JSONTest.cpp - DEPENDS json + SRCS JSONTest.cpp + DEPENDS json ) #============================================================================== @@ -156,25 +154,25 @@ add_libc_benchmark_unittest(json-test # Benchmark all implementations that can run on the target CPU. function(add_libc_multi_impl_benchmark name) - get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) - foreach(fq_config_name IN LISTS fq_implementations) - get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) - cpu_supports(can_run "${required_cpu_features}") - if(can_run) - set(benchmark_name ${fq_config_name}_benchmark) - add_executable(${benchmark_name} - EXCLUDE_FROM_ALL - LibcMemoryBenchmarkMain.cpp - ) - get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") - target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) - string(TOUPPER ${name} name_upper) - target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") - llvm_update_compile_flags(${benchmark_name}) - else() - message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") - endif() - endforeach() + 
get_property(fq_implementations GLOBAL PROPERTY ${name}_implementations) + foreach(fq_config_name IN LISTS fq_implementations) + get_target_property(required_cpu_features ${fq_config_name} REQUIRE_CPU_FEATURES) + cpu_supports(can_run "${required_cpu_features}") + if(can_run) + set(benchmark_name ${fq_config_name}_benchmark) + add_executable(${benchmark_name} + EXCLUDE_FROM_ALL + LibcMemoryBenchmarkMain.cpp + ) + get_target_property(entrypoint_object_file ${fq_config_name} "OBJECT_FILE_RAW") + target_link_libraries(${benchmark_name} PUBLIC json ${entrypoint_object_file}) + string(TOUPPER ${name} name_upper) + target_compile_definitions(${benchmark_name} PRIVATE "-DLIBC_BENCHMARK_FUNCTION_${name_upper}=LIBC_NAMESPACE::${name}" "-DLIBC_BENCHMARK_FUNCTION_NAME=\"${fq_config_name}\"") + llvm_update_compile_flags(${benchmark_name}) + else() + message(STATUS "Skipping benchmark for '${fq_config_name}' insufficient host cpu features '${required_cpu_features}'") + endif() + endforeach() endfunction() add_libc_multi_impl_benchmark(bcmp) @@ -192,20 +190,20 @@ add_libc_multi_impl_benchmark(memset) # libc memory functions compiled for the host machine. This is useful to # continuously monitor the performance of the memory functions. 
add_executable(libc.benchmarks.memory_functions.opt_host - EXCLUDE_FROM_ALL - LibcMemoryGoogleBenchmarkMain.cpp - LibcDefaultImplementations.cpp + EXCLUDE_FROM_ALL + LibcMemoryGoogleBenchmarkMain.cpp + LibcDefaultImplementations.cpp ) target_link_libraries(libc.benchmarks.memory_functions.opt_host - PRIVATE - libc-memory-benchmark - libc.src.string.memcmp_opt_host.__internal__ - libc.src.string.bcmp_opt_host.__internal__ - libc.src.string.memcpy_opt_host.__internal__ - libc.src.string.memset_opt_host.__internal__ - libc.src.string.bzero_opt_host.__internal__ - libc.src.string.memmove_opt_host.__internal__ - benchmark_main + PRIVATE + libc-memory-benchmark + libc.src.string.memcmp_opt_host.__internal__ + libc.src.string.bcmp_opt_host.__internal__ + libc.src.string.memcpy_opt_host.__internal__ + libc.src.string.memset_opt_host.__internal__ + libc.src.string.bzero_opt_host.__internal__ + libc.src.string.memmove_opt_host.__internal__ + benchmark_main ) llvm_update_compile_flags(libc.benchmarks.memory_functions.opt_host) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 51fc267df807d..9ed45eedc402e 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -13,9 +13,8 @@ function(add_benchmark benchmark_name) if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) message(FATAL_ERROR "target does not support clock") endif() - add_libc_hermetic_test( + add_libc_hermetic( ${benchmark_name} - IS_BENCHMARK LINK_LIBRARIES LibcGpuBenchmark.hermetic ${BENCHMARK_LINK_LIBRARIES} diff --git a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt index 8d448b8ced955..79f01425770da 100644 --- a/libc/benchmarks/gpu/src/ctype/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/ctype/CMakeLists.txt @@ -1,21 +1,21 @@ add_custom_target(libc-gpu-ctype-benchmarks) add_benchmark( - isalnum_benchmark - SUITE - libc-gpu-ctype-benchmarks - SRCS - isalnum_benchmark.cpp - DEPENDS - 
libc.src.ctype.isalnum + isalnum_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalnum_benchmark.cpp + DEPENDS + libc.src.ctype.isalnum ) add_benchmark( - isalpha_benchmark - SUITE - libc-gpu-ctype-benchmarks - SRCS - isalpha_benchmark.cpp - DEPENDS - libc.src.ctype.isalpha + isalpha_benchmark + SUITE + libc-gpu-ctype-benchmarks + SRCS + isalpha_benchmark.cpp + DEPENDS + libc.src.ctype.isalpha ) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 278137774e089..508694ae9fc01 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -526,12 +526,15 @@ function(add_integration_test test_name) add_dependencies(${INTEGRATION_TEST_SUITE} ${fq_target_name}) endfunction(add_integration_test) -# Rule to add a hermetic test. A hermetic test is one whose executable is fully +# Rule to add a hermetic program. A hermetic program is one whose executable is fully # statically linked and consists of pieces drawn only from LLVM's libc. Nothing, # including the startup objects, come from the system libc. # +# For the GPU, these can be either tests or benchmarks, depending on the value +# of the LINK_LIBRARIES arg. +# # Usage: -# add_libc_hermetic_test( +# add_libc_hermetic( # # SUITE # SRCS [src2.cpp ...] 
@@ -543,14 +546,14 @@ endfunction(add_integration_test) # LINK_LIBRARIES # LOADER_ARGS # ) -function(add_libc_hermetic_test test_name) +function(add_libc_hermetic test_name) if(NOT TARGET libc.startup.${LIBC_TARGET_OS}.crt1) message(VERBOSE "Skipping ${fq_target_name} as it is not available on ${LIBC_TARGET_OS}.") return() endif() cmake_parse_arguments( "HERMETIC_TEST" - "IS_BENCHMARK" # Optional arguments + "" # No optional arguments "SUITE" # Single value arguments "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments ${ARGN} @@ -651,13 +654,6 @@ function(add_libc_hermetic_test test_name) endif() endforeach() - # Benchmarks requires a separate library with a different `main` function - if(HERMETIC_TEST_IS_BENCHMARK) - list(APPEND link_libraries LibcGpuBenchmark.hermetic) - else() - list(APPEND link_libraries LibcTest.hermetic) - endif() - if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) target_link_options(${fq_build_target_name} PRIVATE ${LIBC_COMPILE_OPTIONS_DEFAULT} @@ -721,7 +717,7 @@ function(add_libc_hermetic_test test_name) add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name}) add_dependencies(libc-hermetic-tests ${fq_target_name}) -endfunction(add_libc_hermetic_test) +endfunction(add_libc_hermetic) # A convenience function to add both a unit test as well as a hermetic test. function(add_libc_test test_name) @@ -736,7 +732,12 @@ function(add_libc_test test_name) add_libc_unittest(${test_name}.__unit__ ${LIBC_TEST_UNPARSED_ARGUMENTS}) endif() if(LIBC_ENABLE_HERMETIC_TESTS AND NOT LIBC_TEST_UNIT_TEST_ONLY) - add_libc_hermetic_test(${test_name}.__hermetic__ ${LIBC_TEST_UNPARSED_ARGUMENTS}) + add_libc_hermetic( + ${test_name}.__hermetic__ + LINK_LIBRARIES + LibcTest.hermetic + ${LIBC_TEST_UNPARSED_ARGUMENTS} + ) get_fq_target_name(${test_name} fq_test_name) if(TARGET ${fq_test_name}.__hermetic__ AND TARGET ${fq_test_name}.__unit__) # Tests like the file tests perform file operations on disk file. 
If we From be303da366eb2e7dd42ad12c206268b4259264c3 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Thu, 23 May 2024 12:34:21 -0400 Subject: [PATCH 07/18] rename namespace and refactor casts --- libc/benchmarks/gpu/BenchmarkLogger.cpp | 4 +- libc/benchmarks/gpu/BenchmarkLogger.h | 4 +- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 31 +++++------ libc/benchmarks/gpu/LibcGpuBenchmark.h | 54 ++++++++------------ libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp | 2 +- 5 files changed, 42 insertions(+), 53 deletions(-) diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp index 4f70d23a1e95e..9a36ee5b3046c 100644 --- a/libc/benchmarks/gpu/BenchmarkLogger.cpp +++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp @@ -9,7 +9,7 @@ #include namespace LIBC_NAMESPACE { -namespace libc_gpu_benchmarks { +namespace benchmarks { // cpp::string_view specialization template <> @@ -93,5 +93,5 @@ template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<320>); BenchmarkLogger blog; -} // namespace libc_gpu_benchmarks +} // namespace benchmarks } // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h index ed3cc97e59c6d..98813b28eaa91 100644 --- a/libc/benchmarks/gpu/BenchmarkLogger.h +++ b/libc/benchmarks/gpu/BenchmarkLogger.h @@ -10,7 +10,7 @@ #define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H namespace LIBC_NAMESPACE { -namespace libc_gpu_benchmarks { +namespace benchmarks { // A class to log to standard output in the context of hermetic tests. struct BenchmarkLogger { @@ -21,7 +21,7 @@ struct BenchmarkLogger { // A global TestLogger instance to be used in tests. 
extern BenchmarkLogger blog; -} // namespace libc_gpu_benchmarks +} // namespace benchmarks } // namespace LIBC_NAMESPACE #endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index f8021c873242f..4c49839249d56 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -4,7 +4,7 @@ #include "src/time/gpu/time_utils.h" namespace LIBC_NAMESPACE { -namespace libc_gpu_benchmarks { +namespace benchmarks { FixedVector benchmarks_to_run; @@ -23,22 +23,22 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, cpp::function wrapper_func) { BenchmarkResult result; RuntimeEstimationProgression rep; - size_t total_iterations = 0; - size_t iterations = options.initial_iterations; - if (iterations < (uint32_t)1) + uint32_t total_iterations = 0; + uint32_t iterations = options.initial_iterations; + if (iterations < 1u) iterations = 1; - size_t samples = 0; + uint32_t samples = 0; uint64_t total_time = 0; uint64_t best_guess = 0; uint64_t total_cycles = 0; - uint64_t cycles_2 = 0; - uint64_t min = UINT_MAX; + uint64_t cycles_squared = 0; + uint64_t min = UINT64_MAX; uint64_t max = 0; - for (;;) { + for (uint64_t time_budget = options.max_duration; time_budget >= 0;) { uint64_t sample_cycles = 0; uint64_t overhead = LIBC_NAMESPACE::overhead(); - const clock_t start = (double)clock(); + const clock_t start = static_cast(clock()); for (uint32_t i = 0; i < iterations; i++) { auto wrapper_intermediate = wrapper_func(); uint64_t result = wrapper_intermediate - overhead; @@ -50,9 +50,10 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, const clock_t duration_ns = ((end - start) * 1000 * 1000 * 1000) / CLOCKS_PER_SEC; total_time += duration_ns; + time_budget -= duration_ns; samples++; total_cycles += sample_cycles; - cycles_2 += sample_cycles * sample_cycles; + cycles_squared += sample_cycles * sample_cycles; total_iterations += 
iterations; const double change_ratio = @@ -60,8 +61,7 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, best_guess = rep.current_estimation; if (samples >= options.max_samples || - iterations >= options.max_iterations || - total_time >= options.max_duration) { + iterations >= options.max_iterations) { break; } else if (total_time >= options.min_duration && samples >= options.min_samples && @@ -72,8 +72,9 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, iterations *= options.scaling_factor; } result.cycles = best_guess; - result.standard_deviation = fputil::sqrt((double)cycles_2 / total_iterations - - (best_guess * best_guess)); + result.standard_deviation = + fputil::sqrt(static_cast(cycles_squared) / total_iterations - + (best_guess * best_guess)); result.min = min; result.max = max; result.samples = samples; @@ -82,5 +83,5 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, return result; }; -} // namespace libc_gpu_benchmarks +} // namespace benchmarks } // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 459e4d9b6ea98..20543af66e331 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -9,12 +9,11 @@ #include "src/__support/fixedvector.h" #include "src/time/clock.h" -#include #include namespace LIBC_NAMESPACE { -namespace libc_gpu_benchmarks { +namespace benchmarks { struct BenchmarkOptions { uint32_t initial_iterations = 1; @@ -28,13 +27,13 @@ struct BenchmarkOptions { }; struct Measurement { - size_t iterations = 0; + uint32_t iterations = 0; uint64_t elapsed_cycles = 0; }; class RefinableRuntimeEstimation { uint64_t total_cycles = 0; - size_t total_iterations = 0; + uint32_t total_iterations = 0; public: uint64_t update(const Measurement &M) { @@ -53,7 +52,8 @@ class RuntimeEstimationProgression { double compute_improvement(const Measurement &M) { const uint64_t new_estimation = rre.update(M); - double 
ratio = ((double)current_estimation / new_estimation) - 1.0; + double ratio = + (static_cast(current_estimation) / new_estimation) - 1.0; // Get absolute value if (ratio < 0) @@ -67,10 +67,10 @@ class RuntimeEstimationProgression { struct BenchmarkResult { uint64_t cycles = 0; double standard_deviation = 0; - uint64_t min = UINT_MAX; + uint64_t min = UINT64_MAX; uint64_t max = 0; - size_t samples = 0; - size_t total_iterations = 0; + uint32_t samples = 0; + uint32_t total_iterations = 0; clock_t total_time = 0; }; @@ -78,35 +78,22 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, cpp::function wrapper_func); class Benchmark { - Benchmark *next = nullptr; - -public: - virtual ~Benchmark() {} - virtual void set_up() {} - virtual void tear_down() {} - - static int run_benchmarks(); - -protected: - static void add_benchmark(Benchmark *); - -private: - virtual void run() = 0; - virtual const cpp::string_view get_name() const = 0; -}; - -class WrapperBenchmark : public Benchmark { const cpp::function func; const cpp::string_view name; public: - WrapperBenchmark(cpp::function func, char const *name) + Benchmark(cpp::function func, char const *name) : func(func), name(name) { add_benchmark(this); } + static int run_benchmarks(); + +protected: + static void add_benchmark(Benchmark *benchmark); + private: - void run() override { + void run() { BenchmarkOptions options; auto result = benchmark(options, func); constexpr auto GREEN = "\033[32m"; @@ -115,15 +102,16 @@ class WrapperBenchmark : public Benchmark { blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles << " cycles, " << result.min << " min, " << result.max << " max, " << result.total_iterations << " iterations, " << result.total_time - << " ns, " << (long)result.standard_deviation << " stddev\n"; + << " ns, " << static_cast(result.standard_deviation) + << " stddev\n"; } - const cpp::string_view get_name() const override { return name; } + const cpp::string_view get_name() const { return 
name; } }; -} // namespace libc_gpu_benchmarks +} // namespace benchmarks } // namespace LIBC_NAMESPACE #define BENCHMARK(SuiteName, TestName, Func) \ - LIBC_NAMESPACE::libc_gpu_benchmarks::WrapperBenchmark \ - SuiteName##_##TestName##_Instance(Func, #SuiteName "." #TestName); + LIBC_NAMESPACE::benchmarks::Benchmark SuiteName##_##TestName##_Instance( \ + Func, #SuiteName "." #TestName); #endif diff --git a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp index 510fd13210494..97366e55194a9 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmarkMain.cpp @@ -1,6 +1,6 @@ #include "LibcGpuBenchmark.h" extern "C" int main(int argc, char **argv, char **envp) { - LIBC_NAMESPACE::libc_gpu_benchmarks::Benchmark::run_benchmarks(); + LIBC_NAMESPACE::benchmarks::Benchmark::run_benchmarks(); return 0; } From a41eb326ad5d4d99ba24ab17c5ae55ce022b43af Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Mon, 27 May 2024 11:32:30 -0400 Subject: [PATCH 08/18] repeat overhead measurment outside of loop --- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 4c49839249d56..e91d2b400444a 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -35,9 +35,15 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, uint64_t cycles_squared = 0; uint64_t min = UINT64_MAX; uint64_t max = 0; + + uint64_t total_overhead_cycles = 0; + uint32_t overhead_iterations = 10; + for (int i = 0; i < overhead_iterations; i++) + total_overhead_cycles += LIBC_NAMESPACE::overhead(); + uint64_t overhead = total_overhead_cycles / overhead_iterations; + for (uint64_t time_budget = options.max_duration; time_budget >= 0;) { uint64_t sample_cycles = 0; - uint64_t overhead = LIBC_NAMESPACE::overhead(); const clock_t start = 
static_cast(clock()); for (uint32_t i = 0; i < iterations; i++) { auto wrapper_intermediate = wrapper_func(); From ab6b6cae819bdd2ba7da32292063a7bcd6620e10 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Mon, 27 May 2024 11:37:14 -0400 Subject: [PATCH 09/18] switch to using min measurement for overhead --- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index e91d2b400444a..f0ba3af23a140 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -36,11 +36,10 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, uint64_t min = UINT64_MAX; uint64_t max = 0; - uint64_t total_overhead_cycles = 0; - uint32_t overhead_iterations = 10; + uint64_t overhead = UINT64_MAX; + int overhead_iterations = 10; for (int i = 0; i < overhead_iterations; i++) - total_overhead_cycles += LIBC_NAMESPACE::overhead(); - uint64_t overhead = total_overhead_cycles / overhead_iterations; + overhead = cpp::min(overhead, LIBC_NAMESPACE::overhead()); for (uint64_t time_budget = options.max_duration; time_budget >= 0;) { uint64_t sample_cycles = 0; From c7c8445f76fef4923d0607208a621bcd7a8ef58d Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Tue, 28 May 2024 22:46:04 -0400 Subject: [PATCH 10/18] fix style --- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 19 +++++++------------ libc/benchmarks/gpu/LibcGpuBenchmark.h | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index f0ba3af23a140..e4f839e361dd0 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -6,17 +6,15 @@ namespace LIBC_NAMESPACE { namespace benchmarks { -FixedVector benchmarks_to_run; +FixedVector benchmarks; void Benchmark::add_benchmark(Benchmark *benchmark) { - 
benchmarks_to_run.push_back(benchmark); + benchmarks.push_back(benchmark); } -int Benchmark::run_benchmarks() { - for (auto it = benchmarks_to_run.rbegin(), e = benchmarks_to_run.rend(); - it != e; ++it) +void Benchmark::run_benchmarks() { + for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) (*it)->run(); - return 0; } BenchmarkResult benchmark(const BenchmarkOptions &options, @@ -65,14 +63,11 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, rep.compute_improvement({iterations, sample_cycles}); best_guess = rep.current_estimation; - if (samples >= options.max_samples || - iterations >= options.max_iterations) { + if (samples >= options.max_samples || iterations >= options.max_iterations) break; - } else if (total_time >= options.min_duration && - samples >= options.min_samples && - change_ratio < options.epsilon) { + if (total_time >= options.min_duration && samples >= options.min_samples && + change_ratio < options.epsilon) break; - } iterations *= options.scaling_factor; } diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 20543af66e331..08e99dadc8d07 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -87,7 +87,7 @@ class Benchmark { add_benchmark(this); } - static int run_benchmarks(); + static void run_benchmarks(); protected: static void add_benchmark(Benchmark *benchmark); From c857891c2f30ffba251fcce6be2d647d39a2bf69 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Tue, 28 May 2024 23:07:26 -0400 Subject: [PATCH 11/18] unconditionally add benchmarks in gpu build --- libc/CMakeLists.txt | 4 +--- libc/benchmarks/CMakeLists.txt | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index f35471a06a53e..4ffcd55ba9500 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -401,9 +401,7 @@ if(LLVM_INCLUDE_TESTS) add_subdirectory(fuzzing) endif() 
-if(LIBC_INCLUDE_BENCHMARKS) - add_subdirectory(benchmarks) -endif() +add_subdirectory(benchmarks) if (LIBC_INCLUDE_DOCS) add_subdirectory(docs) diff --git a/libc/benchmarks/CMakeLists.txt b/libc/benchmarks/CMakeLists.txt index 0234ccb2a7a78..0cff6eb12c247 100644 --- a/libc/benchmarks/CMakeLists.txt +++ b/libc/benchmarks/CMakeLists.txt @@ -3,6 +3,11 @@ if(LIBC_TARGET_OS_IS_GPU) return() endif() +# The CPU build depends on Google benchmark. +if(NOT LIBC_INCLUDE_BENCHMARKS) + return() +endif() + find_package(Threads) set(LLVM_LINK_COMPONENTS From 6073de7b30620397831ba76b5d587b88e035c14e Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Wed, 29 May 2024 22:30:00 -0400 Subject: [PATCH 12/18] add forward iterator --- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 4 +- libc/src/__support/CPP/array.h | 26 +++++---- libc/src/__support/CPP/iterator.h | 68 ++++++++++++++++++++++++ libc/src/__support/fixedvector.h | 4 ++ 4 files changed, 90 insertions(+), 12 deletions(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index e4f839e361dd0..a7a02cacc3305 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -13,8 +13,8 @@ void Benchmark::add_benchmark(Benchmark *benchmark) { } void Benchmark::run_benchmarks() { - for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) - (*it)->run(); + for (Benchmark *benchmark : benchmarks) + benchmark->run(); } BenchmarkResult benchmark(const BenchmarkOptions &options, diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h index 4e69ba003e800..7e4cf29847daf 100644 --- a/libc/src/__support/CPP/array.h +++ b/libc/src/__support/CPP/array.h @@ -22,10 +22,12 @@ template struct array { T Data[N]; using value_type = T; - using iterator = T *; - using const_iterator = const T *; - using reverse_iterator = cpp::reverse_iterator; - using const_reverse_iterator = cpp::reverse_iterator; + using pointer_type = T *; + using 
iterator = cpp::iterator; + using const_pointer_type = const T *; + using const_iterator = cpp::iterator; + using reverse_iterator = cpp::reverse_iterator; + using const_reverse_iterator = cpp::reverse_iterator; LIBC_INLINE constexpr T *data() { return Data; } LIBC_INLINE constexpr const T *data() const { return Data; } @@ -46,12 +48,16 @@ template struct array { LIBC_INLINE constexpr bool empty() const { return N == 0; } - LIBC_INLINE constexpr iterator begin() { return Data; } - LIBC_INLINE constexpr const_iterator begin() const { return Data; } + LIBC_INLINE constexpr iterator begin() { return iterator{Data}; } + LIBC_INLINE constexpr const_iterator begin() const { + return const_iterator{Data}; + } LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); } - LIBC_INLINE constexpr iterator end() { return Data + N; } - LIBC_INLINE constexpr const_iterator end() const { return Data + N; } + LIBC_INLINE constexpr iterator end() { return iterator{Data + N}; } + LIBC_INLINE constexpr const_iterator end() const { + return const_iterator{Data + N}; + } LIBC_INLINE constexpr const_iterator cend() const { return end(); } LIBC_INLINE constexpr reverse_iterator rbegin() { @@ -65,10 +71,10 @@ template struct array { } LIBC_INLINE constexpr reverse_iterator rend() { - return reverse_iterator{begin()}; + return reverse_iterator{Data}; } LIBC_INLINE constexpr const_reverse_iterator rend() const { - return const_reverse_iterator{begin()}; + return const_reverse_iterator{Data}; } LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); } }; diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h index b0fd5c9f22ae0..37d631b01582e 100644 --- a/libc/src/__support/CPP/iterator.h +++ b/libc/src/__support/CPP/iterator.h @@ -92,6 +92,74 @@ template class reverse_iterator { } }; +template class iterator { + Iter current; + +public: + using reference = typename iterator_traits::reference; + using value_type = typename 
iterator_traits::value_type; + using iterator_type = Iter; + + LIBC_INLINE iterator() : current() {} + LIBC_INLINE constexpr explicit iterator(Iter it) : current(it) {} + + template && + cpp::is_convertible_v, + int> = 0> + LIBC_INLINE constexpr explicit iterator(const Other &it) : current(it) {} + + LIBC_INLINE friend constexpr bool operator==(const iterator &lhs, + const iterator &rhs) { + return lhs.base() == rhs.base(); + } + + LIBC_INLINE friend constexpr bool operator!=(const iterator &lhs, + const iterator &rhs) { + return lhs.base() != rhs.base(); + } + + LIBC_INLINE friend constexpr bool operator<(const iterator &lhs, + const iterator &rhs) { + return lhs.base() < rhs.base(); + } + + LIBC_INLINE friend constexpr bool operator<=(const iterator &lhs, + const iterator &rhs) { + return lhs.base() <= rhs.base(); + } + + LIBC_INLINE friend constexpr bool operator>(const iterator &lhs, + const iterator &rhs) { + return lhs.base() > rhs.base(); + } + + LIBC_INLINE friend constexpr bool operator>=(const iterator &lhs, + const iterator &rhs) { + return lhs.base() >= rhs.base(); + } + + LIBC_INLINE constexpr iterator_type base() const { return current; } + + LIBC_INLINE constexpr reference operator*() const { + Iter tmp = current; + return *tmp; + } + LIBC_INLINE constexpr iterator operator--() { + --current; + return *this; + } + LIBC_INLINE constexpr iterator &operator++() { + ++current; + return *this; + } + LIBC_INLINE constexpr iterator operator++(int) { + iterator tmp(*this); + ++current; + return tmp; + } +}; + } // namespace cpp } // namespace LIBC_NAMESPACE diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h index 403b1620d20df..ef00fede07366 100644 --- a/libc/src/__support/fixedvector.h +++ b/libc/src/__support/fixedvector.h @@ -82,6 +82,10 @@ template class FixedVector { // can easily swap one data structure for the other. 
static void destroy(FixedVector *store) { store->reset(); } + using iterator = typename cpp::array::iterator; + LIBC_INLINE constexpr iterator begin() { return iterator{&store[0]}; } + LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; } + using reverse_iterator = typename cpp::array::reverse_iterator; LIBC_INLINE constexpr reverse_iterator rbegin() { return reverse_iterator{&store[item_count]}; From 9f23d216e4b98aedf4f6bc2a56b70f05edb9b6a4 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Wed, 29 May 2024 22:40:02 -0400 Subject: [PATCH 13/18] rename logger --- libc/benchmarks/gpu/BenchmarkLogger.cpp | 2 +- libc/benchmarks/gpu/BenchmarkLogger.h | 2 +- libc/benchmarks/gpu/LibcGpuBenchmark.h | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp index 9a36ee5b3046c..2e7e8e7600fdb 100644 --- a/libc/benchmarks/gpu/BenchmarkLogger.cpp +++ b/libc/benchmarks/gpu/BenchmarkLogger.cpp @@ -91,7 +91,7 @@ template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<320>); // TODO: Add floating point formatting once it's supported by StringStream. -BenchmarkLogger blog; +BenchmarkLogger log; } // namespace benchmarks } // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h index 98813b28eaa91..332ff1439e6f5 100644 --- a/libc/benchmarks/gpu/BenchmarkLogger.h +++ b/libc/benchmarks/gpu/BenchmarkLogger.h @@ -19,7 +19,7 @@ struct BenchmarkLogger { }; // A global TestLogger instance to be used in tests. 
-extern BenchmarkLogger blog; +extern BenchmarkLogger log; } // namespace benchmarks } // namespace LIBC_NAMESPACE diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 08e99dadc8d07..2a6fcd5ea2556 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -98,12 +98,12 @@ class Benchmark { auto result = benchmark(options, func); constexpr auto GREEN = "\033[32m"; constexpr auto RESET = "\033[0m"; - blog << GREEN << "[ RUN ] " << RESET << name << '\n'; - blog << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles - << " cycles, " << result.min << " min, " << result.max << " max, " - << result.total_iterations << " iterations, " << result.total_time - << " ns, " << static_cast(result.standard_deviation) - << " stddev\n"; + log << GREEN << "[ RUN ] " << RESET << name << '\n'; + log << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles + << " cycles, " << result.min << " min, " << result.max << " max, " + << result.total_iterations << " iterations, " << result.total_time + << " ns, " << static_cast(result.standard_deviation) + << " stddev\n"; } const cpp::string_view get_name() const { return name; } }; From 46b5e25304e48e896297790ce17c2ac93db5a4b2 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Wed, 29 May 2024 23:15:48 -0400 Subject: [PATCH 14/18] Revert "add forward iterator" This reverts commit a5ebf57f198cd79be132854b036f904c3983341d. 
--- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 4 +- libc/src/__support/CPP/array.h | 26 ++++----- libc/src/__support/CPP/iterator.h | 68 ------------------------ libc/src/__support/fixedvector.h | 4 -- 4 files changed, 12 insertions(+), 90 deletions(-) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index a7a02cacc3305..e4f839e361dd0 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -13,8 +13,8 @@ void Benchmark::add_benchmark(Benchmark *benchmark) { } void Benchmark::run_benchmarks() { - for (Benchmark *benchmark : benchmarks) - benchmark->run(); + for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) + (*it)->run(); } BenchmarkResult benchmark(const BenchmarkOptions &options, diff --git a/libc/src/__support/CPP/array.h b/libc/src/__support/CPP/array.h index 7e4cf29847daf..4e69ba003e800 100644 --- a/libc/src/__support/CPP/array.h +++ b/libc/src/__support/CPP/array.h @@ -22,12 +22,10 @@ template struct array { T Data[N]; using value_type = T; - using pointer_type = T *; - using iterator = cpp::iterator; - using const_pointer_type = const T *; - using const_iterator = cpp::iterator; - using reverse_iterator = cpp::reverse_iterator; - using const_reverse_iterator = cpp::reverse_iterator; + using iterator = T *; + using const_iterator = const T *; + using reverse_iterator = cpp::reverse_iterator; + using const_reverse_iterator = cpp::reverse_iterator; LIBC_INLINE constexpr T *data() { return Data; } LIBC_INLINE constexpr const T *data() const { return Data; } @@ -48,16 +46,12 @@ template struct array { LIBC_INLINE constexpr bool empty() const { return N == 0; } - LIBC_INLINE constexpr iterator begin() { return iterator{Data}; } - LIBC_INLINE constexpr const_iterator begin() const { - return const_iterator{Data}; - } + LIBC_INLINE constexpr iterator begin() { return Data; } + LIBC_INLINE constexpr const_iterator begin() const { return Data; } 
LIBC_INLINE constexpr const_iterator cbegin() const { return begin(); } - LIBC_INLINE constexpr iterator end() { return iterator{Data + N}; } - LIBC_INLINE constexpr const_iterator end() const { - return const_iterator{Data + N}; - } + LIBC_INLINE constexpr iterator end() { return Data + N; } + LIBC_INLINE constexpr const_iterator end() const { return Data + N; } LIBC_INLINE constexpr const_iterator cend() const { return end(); } LIBC_INLINE constexpr reverse_iterator rbegin() { @@ -71,10 +65,10 @@ template struct array { } LIBC_INLINE constexpr reverse_iterator rend() { - return reverse_iterator{Data}; + return reverse_iterator{begin()}; } LIBC_INLINE constexpr const_reverse_iterator rend() const { - return const_reverse_iterator{Data}; + return const_reverse_iterator{begin()}; } LIBC_INLINE constexpr const_reverse_iterator crend() const { return rend(); } }; diff --git a/libc/src/__support/CPP/iterator.h b/libc/src/__support/CPP/iterator.h index 37d631b01582e..b0fd5c9f22ae0 100644 --- a/libc/src/__support/CPP/iterator.h +++ b/libc/src/__support/CPP/iterator.h @@ -92,74 +92,6 @@ template class reverse_iterator { } }; -template class iterator { - Iter current; - -public: - using reference = typename iterator_traits::reference; - using value_type = typename iterator_traits::value_type; - using iterator_type = Iter; - - LIBC_INLINE iterator() : current() {} - LIBC_INLINE constexpr explicit iterator(Iter it) : current(it) {} - - template && - cpp::is_convertible_v, - int> = 0> - LIBC_INLINE constexpr explicit iterator(const Other &it) : current(it) {} - - LIBC_INLINE friend constexpr bool operator==(const iterator &lhs, - const iterator &rhs) { - return lhs.base() == rhs.base(); - } - - LIBC_INLINE friend constexpr bool operator!=(const iterator &lhs, - const iterator &rhs) { - return lhs.base() != rhs.base(); - } - - LIBC_INLINE friend constexpr bool operator<(const iterator &lhs, - const iterator &rhs) { - return lhs.base() < rhs.base(); - } - - LIBC_INLINE friend 
constexpr bool operator<=(const iterator &lhs, - const iterator &rhs) { - return lhs.base() <= rhs.base(); - } - - LIBC_INLINE friend constexpr bool operator>(const iterator &lhs, - const iterator &rhs) { - return lhs.base() > rhs.base(); - } - - LIBC_INLINE friend constexpr bool operator>=(const iterator &lhs, - const iterator &rhs) { - return lhs.base() >= rhs.base(); - } - - LIBC_INLINE constexpr iterator_type base() const { return current; } - - LIBC_INLINE constexpr reference operator*() const { - Iter tmp = current; - return *tmp; - } - LIBC_INLINE constexpr iterator operator--() { - --current; - return *this; - } - LIBC_INLINE constexpr iterator &operator++() { - ++current; - return *this; - } - LIBC_INLINE constexpr iterator operator++(int) { - iterator tmp(*this); - ++current; - return tmp; - } -}; - } // namespace cpp } // namespace LIBC_NAMESPACE diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h index ef00fede07366..403b1620d20df 100644 --- a/libc/src/__support/fixedvector.h +++ b/libc/src/__support/fixedvector.h @@ -82,10 +82,6 @@ template class FixedVector { // can easily swap one data structure for the other. 
static void destroy(FixedVector *store) { store->reset(); } - using iterator = typename cpp::array::iterator; - LIBC_INLINE constexpr iterator begin() { return iterator{&store[0]}; } - LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; } - using reverse_iterator = typename cpp::array::reverse_iterator; LIBC_INLINE constexpr reverse_iterator rbegin() { return reverse_iterator{&store[item_count]}; From 945090f8cc726be9560411d219421f2d7e5da775 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Sat, 15 Jun 2024 12:15:31 -0400 Subject: [PATCH 15/18] support multithreaded benchmarks --- libc/benchmarks/gpu/CMakeLists.txt | 1 + libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 60 +++++++++++++++++++++- libc/benchmarks/gpu/LibcGpuBenchmark.h | 13 +---- libc/benchmarks/gpu/timing/nvptx/timing.h | 6 --- libc/cmake/modules/LLVMLibCTestRules.cmake | 8 ++- 5 files changed, 67 insertions(+), 21 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 9ed45eedc402e..2814434ccd26c 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -15,6 +15,7 @@ function(add_benchmark benchmark_name) endif() add_libc_hermetic( ${benchmark_name} + IS_BENCHMARK LINK_LIBRARIES LibcGpuBenchmark.hermetic ${BENCHMARK_LINK_LIBRARIES} diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index e4f839e361dd0..0776ebf950ddf 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -1,20 +1,76 @@ #include "LibcGpuBenchmark.h" #include "src/__support/CPP/algorithm.h" +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/string.h" #include "src/__support/FPUtil/sqrt.h" +#include "src/__support/GPU/utils.h" +#include "src/__support/fixedvector.h" #include "src/time/gpu/time_utils.h" namespace LIBC_NAMESPACE { namespace benchmarks { FixedVector benchmarks; +cpp::array results; void 
Benchmark::add_benchmark(Benchmark *benchmark) { benchmarks.push_back(benchmark); } +BenchmarkResult reduce_results(cpp::array &results) { + BenchmarkResult result; + uint64_t cycles_sum = 0; + double standard_deviation_sum = 0; + uint64_t min = UINT64_MAX; + uint64_t max = 0; + uint32_t samples_sum = 0; + uint32_t iterations_sum = 0; + clock_t time_sum = 0; + uint64_t num_threads = gpu::get_num_threads(); + for (uint64_t i = 0; i < num_threads; i++) { + BenchmarkResult current_result = results[i]; + cycles_sum += current_result.cycles; + standard_deviation_sum += current_result.standard_deviation; + min = cpp::min(min, current_result.min); + max = cpp::max(max, current_result.max); + samples_sum += current_result.samples; + iterations_sum += current_result.total_iterations; + time_sum += current_result.total_time; + } + result.cycles = cycles_sum / num_threads; + result.standard_deviation = standard_deviation_sum / num_threads; + result.min = min; + result.max = max; + result.samples = samples_sum / num_threads; + result.total_iterations = iterations_sum / num_threads; + result.total_time = time_sum / num_threads; + return result; +} + void Benchmark::run_benchmarks() { - for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) - (*it)->run(); + uint64_t id = gpu::get_thread_id(); + gpu::sync_threads(); + + for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) { + Benchmark *benchmark = *it; + results[id] = benchmark->run(); + } + gpu::sync_threads(); + if (id == 0) { + for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) { + Benchmark *benchmark = *it; + BenchmarkResult all_results = reduce_results(results); + constexpr auto GREEN = "\033[32m"; + constexpr auto RESET = "\033[0m"; + log << GREEN << "[ RUN ] " << RESET << benchmark->get_name() << '\n'; + log << GREEN << "[ OK ] " << RESET << benchmark->get_name() << ": " + << all_results.cycles << " cycles, " << all_results.min << " min, " + << 
all_results.max << " max, " << all_results.total_iterations + << " iterations, " << all_results.total_time << " ns, " + << static_cast(all_results.standard_deviation) << " stddev\n"; + } + } + gpu::sync_threads(); } BenchmarkResult benchmark(const BenchmarkOptions &options, diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 2a6fcd5ea2556..59dd589462080 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -6,7 +6,6 @@ #include "src/__support/CPP/functional.h" #include "src/__support/CPP/limits.h" #include "src/__support/CPP/string_view.h" -#include "src/__support/fixedvector.h" #include "src/time/clock.h" #include @@ -93,17 +92,9 @@ class Benchmark { static void add_benchmark(Benchmark *benchmark); private: - void run() { + BenchmarkResult run() { BenchmarkOptions options; - auto result = benchmark(options, func); - constexpr auto GREEN = "\033[32m"; - constexpr auto RESET = "\033[0m"; - log << GREEN << "[ RUN ] " << RESET << name << '\n'; - log << GREEN << "[ OK ] " << RESET << name << ": " << result.cycles - << " cycles, " << result.min << " min, " << result.max << " max, " - << result.total_iterations << " iterations, " << result.total_time - << " ns, " << static_cast(result.standard_deviation) - << " stddev\n"; + return benchmark(options, func); } const cpp::string_view get_name() const { return name; } }; diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index 001bdd3686062..5c45425706f11 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -24,13 +24,11 @@ namespace LIBC_NAMESPACE { [[gnu::noinline]] static uint64_t overhead() { volatile uint32_t x = 1; uint32_t y = x; - gpu::sync_threads(); uint64_t start = gpu::processor_clock(); asm volatile("" ::"r"(y), "llr"(start)); uint32_t result = y; asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); 
uint64_t stop = gpu::processor_clock(); - gpu::sync_threads(); volatile auto storage = result; return stop - start; } @@ -47,7 +45,6 @@ template asm volatile("" ::"r"(arg)); // Get the current timestamp from the clock. - gpu::sync_threads(); gpu::memory_fence(); uint64_t start = gpu::processor_clock(); @@ -66,7 +63,6 @@ template // ordering. uint64_t stop = gpu::processor_clock(); gpu::memory_fence(); - gpu::sync_threads(); asm volatile("" ::"r"(stop)); volatile T output = result; @@ -82,7 +78,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { T2 arg2 = storage2; asm volatile("" ::"r"(arg), "r"(arg2)); - gpu::sync_threads(); gpu::memory_fence(); uint64_t start = gpu::processor_clock(); @@ -94,7 +89,6 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { uint64_t stop = gpu::processor_clock(); gpu::memory_fence(); - gpu::sync_threads(); asm volatile("" ::"r"(stop)); volatile auto output = result; diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 508694ae9fc01..fbeec32883b63 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -553,7 +553,7 @@ function(add_libc_hermetic test_name) endif() cmake_parse_arguments( "HERMETIC_TEST" - "" # No optional arguments + "IS_BENCHMARK" # Optional arguments "SUITE" # Single value arguments "SRCS;HDRS;DEPENDS;ARGS;ENV;COMPILE_OPTIONS;LINK_LIBRARIES;LOADER_ARGS" # Multi-value arguments ${ARGN} @@ -716,7 +716,11 @@ function(add_libc_hermetic test_name) ) add_dependencies(${HERMETIC_TEST_SUITE} ${fq_target_name}) - add_dependencies(libc-hermetic-tests ${fq_target_name}) + if(NOT ${HERMETIC_TEST_IS_BENCHMARK}) + # If it is a benchmark, it will already have been added to the + # gpu-benchmark target + add_dependencies(libc-hermetic-tests ${fq_target_name}) + endif() endfunction(add_libc_hermetic) # A convenience function to add both a unit test as well as a hermetic test. 
From 4aa5e8bc05d814e67a332f578ed2893230e90dd7 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Sat, 15 Jun 2024 14:26:17 -0400 Subject: [PATCH 16/18] use for each syntax --- libc/benchmarks/gpu/CMakeLists.txt | 2 +- libc/benchmarks/gpu/LibcGpuBenchmark.cpp | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 2814434ccd26c..4d2a3a4ac66d3 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -47,7 +47,7 @@ add_unittest_framework_library( libc.src.__support.macros.properties.types libc.src.__support.OSUtil.osutil libc.src.__support.uint128 - libc.src.___support.FPUtil.sqrt + libc.src.__support.FPUtil.sqrt libc.src.__support.fixedvector libc.src.time.clock libc.benchmarks.gpu.timing.timing diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index 0776ebf950ddf..69adb0c95ba76 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -51,14 +51,11 @@ void Benchmark::run_benchmarks() { uint64_t id = gpu::get_thread_id(); gpu::sync_threads(); - for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) { - Benchmark *benchmark = *it; + for (Benchmark *benchmark : benchmarks) results[id] = benchmark->run(); - } gpu::sync_threads(); if (id == 0) { - for (auto it = benchmarks.rbegin(), e = benchmarks.rend(); it != e; ++it) { - Benchmark *benchmark = *it; + for (Benchmark *benchmark : benchmarks) { BenchmarkResult all_results = reduce_results(results); constexpr auto GREEN = "\033[32m"; constexpr auto RESET = "\033[0m"; @@ -128,9 +125,9 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, iterations *= options.scaling_factor; } result.cycles = best_guess; - result.standard_deviation = - fputil::sqrt(static_cast(cycles_squared) / total_iterations - - (best_guess * best_guess)); + result.standard_deviation = fputil::sqrt( + 
static_cast(cycles_squared) / total_iterations - + static_cast(best_guess * best_guess)); result.min = min; result.max = max; result.samples = samples; From b93318e2aaec5fc4ed6da0e047e59e88c3251894 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Sat, 15 Jun 2024 17:53:22 -0400 Subject: [PATCH 17/18] switch LINK_LIBRARIES argument from optional to multi-value to fix build issue --- libc/benchmarks/gpu/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index 4d2a3a4ac66d3..d167abcaf2db1 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -5,9 +5,9 @@ add_custom_target(gpu-benchmark) function(add_benchmark benchmark_name) cmake_parse_arguments( "BENCHMARK" - "LINK_LIBRARIES" # Optional arguments + "" # Optional arguments "" # Single value arguments - "" # Multi-value arguments + "LINK_LIBRARIES" # Multi-value arguments ${ARGN} ) if(NOT libc.src.time.clock IN_LIST TARGET_LLVMLIBC_ENTRYPOINTS) From cb3b05c5d6f9d25d5dbdabc70053039f00165511 Mon Sep 17 00:00:00 2001 From: jameshu15869 Date: Thu, 20 Jun 2024 22:48:11 -0400 Subject: [PATCH 18/18] clean up --- libc/benchmarks/gpu/src/CMakeLists.txt | 1 - libc/benchmarks/gpu/src/math/CMakeLists.txt | 0 libc/benchmarks/gpu/timing/nvptx/timing.h | 20 ++++++++++---------- 3 files changed, 10 insertions(+), 11 deletions(-) delete mode 100644 libc/benchmarks/gpu/src/math/CMakeLists.txt diff --git a/libc/benchmarks/gpu/src/CMakeLists.txt b/libc/benchmarks/gpu/src/CMakeLists.txt index f15d082e4dd2b..42eb4f7b5909a 100644 --- a/libc/benchmarks/gpu/src/CMakeLists.txt +++ b/libc/benchmarks/gpu/src/CMakeLists.txt @@ -1,2 +1 @@ add_subdirectory(ctype) -add_subdirectory(math) diff --git a/libc/benchmarks/gpu/src/math/CMakeLists.txt b/libc/benchmarks/gpu/src/math/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h 
b/libc/benchmarks/gpu/timing/nvptx/timing.h index 5c45425706f11..d3851a764c43d 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -25,9 +25,9 @@ namespace LIBC_NAMESPACE { volatile uint32_t x = 1; uint32_t y = x; uint64_t start = gpu::processor_clock(); - asm volatile("" ::"r"(y), "llr"(start)); + asm("" ::"r"(y), "llr"(start)); uint32_t result = y; - asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); uint64_t stop = gpu::processor_clock(); volatile auto storage = result; return stop - start; @@ -42,7 +42,7 @@ template // not constant propagate it and remove the profiling region. volatile T storage = t; T arg = storage; - asm volatile("" ::"r"(arg)); + asm("" ::"r"(arg)); // Get the current timestamp from the clock. gpu::memory_fence(); @@ -50,20 +50,20 @@ template // This forces the compiler to load the input argument and run the clock cycle // counter before the profiling region. - asm volatile("" ::"r"(arg), "llr"(start)); + asm("" ::"r"(arg), "llr"(start)); // Run the function under test and return its value. auto result = f(arg); // This inline assembly performs a no-op which forces the result to both be // used and prevents us from exiting this region before it's complete. - asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); // Obtain the current timestamp after running the calculation and force // ordering. uint64_t stop = gpu::processor_clock(); gpu::memory_fence(); - asm volatile("" ::"r"(stop)); + asm("" ::"r"(stop)); volatile T output = result; // Return the time elapsed. 
@@ -76,20 +76,20 @@ static LIBC_INLINE uint64_t latency(F f, T1 t1, T2 t2) { volatile T2 storage2 = t2; T1 arg = storage; T2 arg2 = storage2; - asm volatile("" ::"r"(arg), "r"(arg2)); + asm("" ::"r"(arg), "r"(arg2)); gpu::memory_fence(); uint64_t start = gpu::processor_clock(); - asm volatile("" ::"r"(arg), "r"(arg2), "llr"(start)); + asm("" ::"r"(arg), "r"(arg2), "llr"(start)); auto result = f(arg, arg2); - asm volatile("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); + asm("or.b32 %[v_reg], %[v_reg], 0;" ::[v_reg] "r"(result) :); uint64_t stop = gpu::processor_clock(); gpu::memory_fence(); - asm volatile("" ::"r"(stop)); + asm("" ::"r"(stop)); volatile auto output = result; return stop - start;