From 72f1d3e25425ef3364dd7a2a98086db88a2f925a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 27 Nov 2024 15:23:18 +0200 Subject: [PATCH] Add dav1d as potential external project To add the external source, run "git clone -b 1.5.0 https://code.videolan.org/videolan/dav1d.git" to clone the dav1d sources, e.g. in the llvm-test-suite/test-suite-externals directory (which may not exist but can be created manually), or configure with TEST_SUITE_DAV1D_ROOT pointing at a directory containing this source. This builds two targets; the dav1d command line executable (which isn't executed as a test) and the dav1d checkasm test executable (which runs compiler generated functions and compares them with handwritten assembly versions of them). The checkasm executable can also be run manually to microbenchmark functions, e.g. "External/dav1d/dav1d_checkasm --bench --test=mc_8bpc --function=warp*". It is not very meaningful to benchmark the execution of the whole checkasm executable, as it runs a different number of functions depending on the number of SIMD extensions available on the target CPU. (Benchmarking on aarch64 currently requires direct access to the pmccntr_el0 register. To currently use a different timer register, edit dav1d/tests/checkasm/checkasm.h and change pmccntr_el0 into cntvct_el0.) This uses a static configuration of the project (when building the upstream project with their own build system, there's a number of build options that can be configured). Assembly is hooked up and enabled on i386, x86_64, arm and aarch64. For architectures other than those, the checkasm test won't have any reference for detecting e.g. miscompilations of functions, but building it can still be meaningful (for testing compilation, or benchmarking the execution of the C version of functions). 
--- External/CMakeLists.txt | 1 + External/dav1d/CMakeLists.txt | 371 ++++++++++++++++++++++++++++++++++ External/dav1d/README.md | 255 +++++++++++++++++++++++ External/dav1d/cli_config.h | 3 + External/dav1d/config.asm | 11 + External/dav1d/config.h | 169 ++++++++++++++++ External/dav1d/vcs_version.h | 1 + 7 files changed, 811 insertions(+) create mode 100644 External/dav1d/CMakeLists.txt create mode 100644 External/dav1d/README.md create mode 100644 External/dav1d/cli_config.h create mode 100644 External/dav1d/config.asm create mode 100644 External/dav1d/config.h create mode 100644 External/dav1d/vcs_version.h diff --git a/External/CMakeLists.txt b/External/CMakeLists.txt index 785d22fb07..920dfa6ac1 100644 --- a/External/CMakeLists.txt +++ b/External/CMakeLists.txt @@ -5,6 +5,7 @@ add_subdirectory(HeCBench) add_subdirectory(Nurbs) add_subdirectory(Povray) add_subdirectory(SPEC) +add_subdirectory(dav1d) add_subdirectory(skidmarks10) add_subdirectory(sollve_vv) add_subdirectory(smoke) diff --git a/External/dav1d/CMakeLists.txt b/External/dav1d/CMakeLists.txt new file mode 100644 index 0000000000..9968187ed0 --- /dev/null +++ b/External/dav1d/CMakeLists.txt @@ -0,0 +1,371 @@ +include(External) + +# git clone -b 1.5.0 https://code.videolan.org/videolan/dav1d.git +# in llvm-test-suite/test-suite-externals. + +llvm_externals_find(TEST_SUITE_DAV1D_ROOT "dav1d" "dav1d 1.5.0") + +if (NOT TEST_SUITE_DAV1D_ROOT) + return() +endif() + +include(CheckCCompilerFlag) +include(CheckFunctionExists) +include(CheckLanguage) +include(CheckLibraryExists) +include(CheckLinkerFlag) + +set(CMAKE_C_STANDARD 17) + +include_directories(.) 
+include_directories(${TEST_SUITE_DAV1D_ROOT}/include) +include_directories(${TEST_SUITE_DAV1D_ROOT}/include/dav1d) +include_directories(${TEST_SUITE_DAV1D_ROOT}) +include_directories(${TEST_SUITE_DAV1D_ROOT}/src) + +if (WIN32) + include_directories(${TEST_SUITE_DAV1D_ROOT}/include/compat) +endif() + +# Convenience helper for appending an option to CMAKE_C_FLAGS if it is supported, +# automatically setting up a suitable SUPPORTS_<OPTION> cache variable for the test. +function(check_enable_option option) + if ("${option}" MATCHES "^-Wno") + # GCC silently accepts any unknown warning class in options like -Wno-foo, + # but such unrecognized options can produce other distracting notices + # if there are actual warnings to print. Therefore, for options like -Wno-foo, + # test whether -Wfoo is supported instead, and if it is, add -Wno-foo. + string(REGEX REPLACE "^-Wno-" "-W" test_option ${option}) + else() + set(test_option ${option}) + endif() + # Transform the option name into a suitable cmake cache variable name, to + # avoid requiring the caller to uniquely set one for each case. + string(REGEX REPLACE "^--*" "" varname ${test_option}) + string(TOUPPER ${varname} varname) + string(REGEX REPLACE "[-=]" "_" varname ${varname}) + set(varname "SUPPORTS_${varname}") + check_c_compiler_flag(${test_option} ${varname}) + if (${varname}) + # If supported, enable the original form of the option that was requested. 
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${option}" PARENT_SCOPE) + endif() +endfunction() + +if (NOT MSVC) + # clang-cl supports -Wall, but it corresponds to -Weverything + check_enable_option(-Wall) +endif() + +check_enable_option(-Wundef) +check_enable_option(-Werror=vla) +check_enable_option(-Wno-maybe-uninitialized) +check_enable_option(-Wno-missing-field-initializers) +check_enable_option(-Wno-unused-parameter) +check_enable_option(-Wstrict-prototypes) +check_enable_option(-Werror=missing-prototypes) +check_enable_option(-Wshorten-64-to-32) + +check_function_exists(sin HAVE_DEFAULT_MATH) +if (NOT HAVE_DEFAULT_MATH) + check_library_exists(m sin "" HAVE_LIBM) + if (HAVE_LIBM) + link_libraries(m) + endif() +endif() +check_library_exists(atomic __atomic_load_8 "" HAVE_LIBATOMIC) +if (HAVE_LIBATOMIC) + link_libraries(atomic) +endif() +if (NOT WIN32) + find_package(Threads) + if (Threads_FOUND) + link_libraries(${CMAKE_THREAD_LIBS_INIT}) + endif() +endif() + +if (WIN32) + add_compile_definitions(WIN32_LEAN_AND_MEAN) + if (MSVC) + add_compile_definitions(_CRT_SECURE_NO_WARNINGS) + add_compile_definitions(_CRT_NONSTDC_NO_DEPRECATE) + endif() +endif() + +if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR (CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64" AND NOT CMAKE_C_COMPILER_ID STREQUAL "MSVC")) # native Windows hosts report "ARM64"; MSVC's cl is excluded as it is not a GAS-style assembler driver + set(ARCH_AARCH64 1) + enable_language(ASM) + message(STATUS "dav1d: Enabling aarch64 assembly") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set(ARCH_ARM 1) + enable_language(ASM) + message(STATUS "dav1d: Enabling arm assembly") +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^[Xx]86$") + set(ARCH_I386 1) + check_language(ASM_NASM) + if (CMAKE_ASM_NASM_COMPILER) + enable_language(ASM_NASM) + message(STATUS "dav1d: Enabling i386 nasm assembly") + else() + add_compile_definitions(NO_X86ASM) + message(STATUS "dav1d: Not enabling i386 nasm assembly") + endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} 
-DPREFIX") + endif() +elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "AMD64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64") + set(ARCH_X86_64 1) + add_compile_definitions(PIC) + check_language(ASM_NASM) + if (CMAKE_ASM_NASM_COMPILER) + enable_language(ASM_NASM) + message(STATUS "dav1d: Enabling x86_64 nasm assembly") + else() + add_compile_definitions(NO_X86ASM) + message(STATUS "dav1d: Not enabling x86_64 nasm assembly") + endif() + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DARCH_X86_64=1") + if (CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DPREFIX") + endif() +else() + message(STATUS "dav1d: Not enabling any assembly optimizations for ${CMAKE_SYSTEM_PROCESSOR}") +endif() + +# src + +set(dav1d_src + cdf.c + cpu.c + ctx.c + data.c + decode.c + dequant_tables.c + getbits.c + intra_edge.c + itx_1d.c + lf_mask.c + lib.c + log.c + mem.c + msac.c + obu.c + pal.c + picture.c + qm.c + ref.c + refmvs.c + scan.c + tables.c + thread_task.c + warpmv.c + wedge.c) + +if (WIN32) + list(APPEND dav1d_src + win32/thread.c) +endif() + +set(dav1d_tmpl_src + cdef_apply_tmpl.c + cdef_tmpl.c + fg_apply_tmpl.c + filmgrain_tmpl.c + ipred_prepare_tmpl.c + ipred_tmpl.c + itx_tmpl.c + lf_apply_tmpl.c + loopfilter_tmpl.c + looprestoration_tmpl.c + lr_apply_tmpl.c + mc_tmpl.c + recon_tmpl.c) + +if (ARCH_AARCH64) + list(APPEND dav1d_src + arm/cpu.c + arm/64/itx.S + arm/64/looprestoration_common.S + arm/64/msac.S + arm/64/refmvs.S + arm/64/cdef.S + arm/64/filmgrain.S + arm/64/ipred.S + arm/64/loopfilter.S + arm/64/looprestoration.S + arm/64/mc.S + arm/64/mc_dotprod.S + arm/64/cdef16.S + arm/64/filmgrain16.S + arm/64/ipred16.S + arm/64/itx16.S + arm/64/loopfilter16.S + arm/64/looprestoration16.S + arm/64/mc16.S + arm/64/mc16_sve.S) +elseif (ARCH_ARM) + list(APPEND dav1d_src + arm/cpu.c + arm/32/itx.S + arm/32/looprestoration_common.S + arm/32/msac.S + arm/32/refmvs.S + arm/32/cdef.S + arm/32/filmgrain.S + 
arm/32/ipred.S + arm/32/loopfilter.S + arm/32/looprestoration.S + arm/32/mc.S + arm/32/cdef16.S + arm/32/filmgrain16.S + arm/32/ipred16.S + arm/32/itx16.S + arm/32/loopfilter16.S + arm/32/looprestoration16.S + arm/32/mc16.S) +elseif (ARCH_I386 OR ARCH_X86_64) + list(APPEND dav1d_src + x86/cpu.c) + if (CMAKE_ASM_NASM_COMPILER) + set(x86_nasm_sources + x86/cpuid.asm + x86/msac.asm + x86/pal.asm + x86/refmvs.asm + x86/itx_avx512.asm + x86/cdef_avx2.asm + x86/itx_avx2.asm + x86/cdef_sse.asm + x86/itx_sse.asm + x86/cdef_avx512.asm + x86/filmgrain_avx512.asm + x86/ipred_avx512.asm + x86/loopfilter_avx512.asm + x86/looprestoration_avx512.asm + x86/mc_avx512.asm + x86/filmgrain_avx2.asm + x86/ipred_avx2.asm + x86/loopfilter_avx2.asm + x86/looprestoration_avx2.asm + x86/mc_avx2.asm + x86/filmgrain_sse.asm + x86/ipred_sse.asm + x86/loopfilter_sse.asm + x86/looprestoration_sse.asm + x86/mc_sse.asm + x86/cdef16_avx512.asm + x86/filmgrain16_avx512.asm + x86/ipred16_avx512.asm + x86/itx16_avx512.asm + x86/loopfilter16_avx512.asm + x86/looprestoration16_avx512.asm + x86/mc16_avx512.asm + x86/cdef16_avx2.asm + x86/filmgrain16_avx2.asm + x86/ipred16_avx2.asm + x86/itx16_avx2.asm + x86/loopfilter16_avx2.asm + x86/looprestoration16_avx2.asm + x86/mc16_avx2.asm + x86/cdef16_sse.asm + x86/filmgrain16_sse.asm + x86/ipred16_sse.asm + x86/itx16_sse.asm + x86/loopfilter16_sse.asm + x86/looprestoration16_sse.asm + x86/mc16_sse.asm) + list(APPEND dav1d_src + ${x86_nasm_sources}) + list(TRANSFORM x86_nasm_sources PREPEND ${TEST_SUITE_DAV1D_ROOT}/src/) + set_source_files_properties(${x86_nasm_sources} PROPERTIES LANGUAGE ASM_NASM) + endif() +endif() + +list(TRANSFORM dav1d_tmpl_src PREPEND ${TEST_SUITE_DAV1D_ROOT}/src/) +list(TRANSFORM dav1d_src PREPEND ${TEST_SUITE_DAV1D_ROOT}/src/) + +foreach(bitdepth 8 16) + llvm_test_library(dav1d_bitdepth_${bitdepth} OBJECT ${dav1d_tmpl_src}) + target_compile_definitions(dav1d_bitdepth_${bitdepth} PRIVATE BITDEPTH=${bitdepth}) + list(APPEND 
bitdepth_libraries dav1d_bitdepth_${bitdepth}) +endforeach() + +llvm_test_library(dav1d_lib ${dav1d_src}) +target_link_libraries(dav1d_lib PRIVATE ${bitdepth_libraries}) + + +# tools + +set(dav1d_cli_src + dav1d.c + dav1d_cli_parse.c + input/input.c + input/annexb.c + input/ivf.c + input/section5.c + output/md5.c + output/null.c + output/output.c + output/y4m2.c + output/yuv.c) + +if (WIN32) + list(APPEND dav1d_cli_src + compat/getopt.c) +endif() + +list(TRANSFORM dav1d_cli_src PREPEND ${TEST_SUITE_DAV1D_ROOT}/tools/) + +llvm_test_executable_no_test(dav1d ${dav1d_cli_src}) + +target_include_directories(dav1d PRIVATE ${TEST_SUITE_DAV1D_ROOT}/tools) +target_link_libraries(dav1d PRIVATE dav1d_lib) + + +# checkasm + +set(checkasm_src + checkasm.c + msac.c + pal.c + refmvs.c) + +set(checkasm_tmpl_src + cdef.c + filmgrain.c + ipred.c + itx.c + loopfilter.c + looprestoration.c + mc.c) + +if (ARCH_AARCH64) + list(APPEND checkasm_src + arm/checkasm_64.S) +elseif (ARCH_ARM) + list(APPEND checkasm_src + arm/checkasm_32.S) +elseif (ARCH_I386 OR ARCH_X86_64) + if (CMAKE_ASM_NASM_COMPILER) + set(x86_nasm_sources + x86/checkasm.asm) + list(APPEND checkasm_src + ${x86_nasm_sources}) + list(TRANSFORM x86_nasm_sources PREPEND ${TEST_SUITE_DAV1D_ROOT}/tests/checkasm/) + set_source_files_properties(${x86_nasm_sources} PROPERTIES LANGUAGE ASM_NASM) + endif() +endif() + +list(TRANSFORM checkasm_tmpl_src PREPEND ${TEST_SUITE_DAV1D_ROOT}/tests/checkasm/) +list(TRANSFORM checkasm_src PREPEND ${TEST_SUITE_DAV1D_ROOT}/tests/checkasm/) + +foreach(bitdepth 8 16) + llvm_test_library(checkasm_bitdepth_${bitdepth} OBJECT ${checkasm_tmpl_src}) + target_compile_definitions(checkasm_bitdepth_${bitdepth} PRIVATE BITDEPTH=${bitdepth}) + list(APPEND bitdepth_libraries checkasm_bitdepth_${bitdepth}) # NOTE: bitdepth_libraries still contains the dav1d_bitdepth_* libraries appended above +endforeach() + +llvm_test_run() +llvm_test_executable(dav1d_checkasm ${checkasm_src}) +target_link_libraries(dav1d_checkasm PRIVATE ${bitdepth_libraries}) +target_link_libraries(dav1d_checkasm 
PRIVATE dav1d_lib) diff --git a/External/dav1d/README.md b/External/dav1d/README.md new file mode 100644 index 0000000000..7db5c3816c --- /dev/null +++ b/External/dav1d/README.md @@ -0,0 +1,255 @@ +dav1d +===== + +dav1d is a highly optimized video decoding library for the AV1 video format. + + +Setup +----- + +This integration of dav1d into llvm-test-suite works with dav1d 1.5.0. + +To include the dav1d library in llvm-test-suite, run +`git clone -b 1.5.0 https://code.videolan.org/videolan/dav1d.git` +within the `llvm-test-suite/test-suite-externals` directory, or +set `TEST_SUITE_DAV1D_ROOT` to point to a similar checkout, in the +CMake configuration. + +For x86 targets, the `nasm` tool is used for building assembly, if +the tool is found at configure time. If not found, the assembly is +omitted. The project also contains assembly for ARM and AArch64, but +that doesn't require any separate tool for building, it is built by +the regular GAS style assembler (via the compiler driver). + +The upstream project also contains some amount of assembly for other +architectures, but that is not currently hooked up in the integration +into llvm-test-suite. + + +Build targets +------------- + +The integration of dav1d into llvm-test-suite builds two targets; +the `dav1d` command line executable (which can decode AV1 video from +`.ivf` files), and `dav1d_checkasm`, a testing tool. The latter is +executed as part of running the llvm-test-suite tests. + + +checkasm +-------- + +The checkasm tool is originally intended for developing handwritten +SIMD optimized versions of functions - both for testing their +correctness and for benchmarking them. + +The correctness tests work by comparing the outputs of a reference C +implementation of each function with the outputs of handwritten SIMD +optimized versions. The same comparison also works in reverse; if the +reference C code gets miscompiled, the correctness test should point out +a discrepancy. 
By just running this executable without any arguments, +it tests all variants of all enabled functions. + +If there is only one implementation of a function (i.e. only the +reference C implementation), there is nothing to compare against, so +such miscompilations wouldn't be caught. + +However, miscompilations that show up as failed asserts within LLVM +when generating code are caught even if there is no assembly +available. + + +Benchmarking with checkasm +-------------------------- + +If benchmarking on AArch64 on Linux, see the section below for +gotchas regarding that. + +While the checkasm tool primarily is intended for benchmarking and +developing handwritten SIMD implementations, it can also be used +for benchmarking and evaluating the performance of the compiler +generated code for the reference C implementations. + +The most high-level benchmark would be to record the runtime of +one full run of the `dav1d_checkasm` binary, and compare that between +different builds - however this is far from ideal; it only runs each +function a couple of times (as it only runs a correctness test), and +the total runtime depends on the number of SIMD implementations and +which of those implementations are supported by the current CPU. + +The ideal use of the checkasm tool is for microbenchmarking +individual functions. + +As an initial entry level case, one can benchmark all included functions +by running `External/dav1d/dav1d_checkasm --bench 0`. As each benchmarked +function is run a large number of times, this can take a long time +(a couple of minutes). To reduce the runtime of it, one can edit +`dav1d/tests/checkasm/checkasm.h` and change +`#define BENCH_RUNS (1 << 12)` into e.g. `#define BENCH_RUNS (1 << 10)` +to reduce the number of iterations. + +The last argument, `0`, sets the random seed for the execution. 
All +tests run with random input data; in many tests, the actual values of +the input data don't affect the runtime, but some tests can be +affected; therefore, it's good practice to run all benchmarks in a +comparison with the same seed. + +An example of parts of the output of such a benchmark looks like this: + +``` +mc_8tap_regular_w4_hv_8bpc_c: 15.3 ( 1.00x) +mc_8tap_regular_w4_hv_8bpc_neon: 1.8 ( 8.44x) +mc_8tap_regular_w4_hv_8bpc_dotprod: 1.4 (11.22x) +[...] +mc_8tap_regular_w128_h_8bpc_c: 394.5 ( 1.00x) +mc_8tap_regular_w128_h_8bpc_neon: 121.4 ( 3.25x) +mc_8tap_regular_w128_h_8bpc_dotprod: 68.2 ( 5.78x) +mc_8tap_regular_w128_hv_8bpc_c: 702.3 ( 1.00x) +mc_8tap_regular_w128_hv_8bpc_neon: 289.2 ( 2.43x) +mc_8tap_regular_w128_hv_8bpc_dotprod: 183.1 ( 3.84x) +``` + +This is a case where the same function, `mc_8tap_regular`, has been +executed with a number of different cases that are relevant for +use in the video decoder; `w4` means that it was run on a block +of width 4 pixels, and the suffixes `h` or `hv` indicate different +parameters that usually pick different codepaths within the +function. (To be precise, in this case it indicates whether the +function does horizontal filter, vertical, both, or no filtering at +all.) Each function may have different specialized cases that are +benchmarked separately. + +The numbers indicate that e.g. the reference C version of +`mc_8tap_regular_w128_hv` executed in 702 timer units, while +the handwritten NEON and DotProd versions took 289 and 183 timer +units each, respectively. The handwritten versions usually exploit +a lot of extra knowledge about the functions and their uses, that the +reference C implementation and the compiler lack. However they +indicate a potential best case target for what the compiler could +do, in ideal circumstances. + +The various functions are grouped into different areas; one can +choose to run only one or some groups, by adding a parameter like +`--test=mc_8bpc` or `--test=mc_*`. 
+While benchmarking, one can also limit the benchmarking to a smaller +set of functions, by adding a parameter like +`--function=mc_8tap_regular_w*_hv_*`. + + +Benchmarking on AArch64 +----------------------- + +The upstream checkasm tool is meant for benchmarking and finetuning +assembly implementations. Therefore, it uses the `pmccntr_el0` register +for high precision timing on Linux and Windows. Unfortunately, this register +is normally not accessible from userspace in Linux. One can enable access +from userspace by building and loading a kernel module, e.g. +https://code.videolan.org/janne/arm64-cycle-cnt. + +Alternatively, the `dav1d/tests/checkasm/checkasm.h` source file can be +edited, changing references to `pmccntr_el0` into `cntvct_el0`. That +timer is usually accessible from userspace, but it has much lower +precision - making it less suitable for finetuning assembly functions, +but it is still good enough for coarse performance comparisons. + +On macOS, a coarse timer that always is accessible, is used by default. + +On Windows, `pmccntr_el0` is used; this register should always be +accessible from userspace on Windows. + + +Evaluating vectorization effectiveness +-------------------------------------- + +For evaluating e.g. the effectiveness of compiler autovectorization, +do two separate builds of `dav1d_checkasm`, e.g. one set up with +`-DCMAKE_C_FLAGS_RELEASE="-O3"` and one with +`-DCMAKE_C_FLAGS_RELEASE="-O3 -fno-vectorize -fno-slp-vectorize"`. +Then run benchmarks for relevant parts, and compare the measured +runtimes for the `_c` suffixed versions. If the vectorized version is +faster (lower benchmark numbers) than the non-vectorized, the compiler +handled the function well. If the vectorized version is slower than +the non-vectorized version, we have found a case that probably should be +investigated, and where compiler autovectorization is hurting the +performance of dav1d. 
+As a concrete example, running +`./External/dav1d/dav1d_checkasm --bench --test=mc_8bpc --function=mct_8tap_regular_w128_0_8bpc 0` in both a vectorized and non-vectorized build, +we'd get the following numbers: + +Vectorization disabled: +``` +mct_8tap_regular_w128_0_8bpc_c: 180.9 ( 1.00x) +mct_8tap_regular_w128_0_8bpc_neon: 10.8 (16.69x) +mct_8tap_regular_w128_0_8bpc_dotprod: 10.8 (16.74x) +``` + +Vectorization enabled: +``` +mct_8tap_regular_w128_0_8bpc_c: 18.1 ( 1.00x) +mct_8tap_regular_w128_0_8bpc_neon: 10.8 ( 1.68x) +mct_8tap_regular_w128_0_8bpc_dotprod: 10.8 ( 1.67x) +``` + +Here, the compiler vectorized version was almost 10x as fast as the +non-vectorized version, reaching close to the performance of the +handwritten implementation. + + +A different example of the effect of vectorization can be found +by benchmarking with `./External/dav1d/dav1d_checkasm --bench --test=cdef_8bpc 0`. +There we can get the following numbers: + +Vectorization disabled: +``` +cdef_filter_4x8_10_8bpc_c: 7.4 ( 1.00x) +cdef_filter_4x8_10_8bpc_neon: 1.6 ( 4.51x) +``` + +Vectorization enabled: +``` +cdef_filter_4x8_10_8bpc_c: 11.3 ( 1.00x) +cdef_filter_4x8_10_8bpc_neon: 1.7 ( 6.84x) +``` + +Here, the code generated by vectorization is not beneficial, and +ends up slowing down this particular testcase. + + +Locating the source and generated code for tests +------------------------------------------------ + +Large parts of the dav1d decoder are templated C code, which is +compiled twice, with varying data type definitions - once for +`8bpc` (8 bit per component) and once for `16bpc`. Code in files +named `*_tmpl.c` is compiled in such a way. + +To investigate the behaviour behind one individual benchmark result, +the mapping from benchmark case names to actual source code isn't +always trivial. It may be easiest to start out with the definition +of the test itself, within e.g. `dav1d/tests/checkasm/*.c`, looking +for which function it actually calls. 
+ +As an example, one function observed above, +`mct_8tap_regular_w128_0_8bpc`, gets tested in `dav1d/tests/checkasm/mc.c`, +in the `check_mct` function. The individual test variant gets set up +in this function call: + +``` + if (check_func(c->mct[filter], "mct_%s_w%d_%s_%dbpc", + filter_names[filter], w, mxy_names[mxy], BITDEPTH)) +``` + +This means that the tested function is `c->mct[filter]`. In this case, +the function pointer gets set by `bitfn(dav1d_mc_dsp_init)(&c)`, which +is implemented in `dav1d/src/mc_tmpl.c`. For the case of +`mct_8tap_regular_w128_0_8bpc`, this maps to the function +`prep_8tap_regular_c` (which is defined via macro expansion, so it's not +easily greppable), which calls the function `prep_8tap_c`. Within the +function `prep_8tap_c`, there are four different cases, switched between +based on whether the input parameters `mx` and `my` are zero or nonzero. +In the case of the `_0_` variant, both `mx` and `my` would be zero, and +the called code is in the function `prep_c`. + +The generated code for e.g. those functions can be found in the object file +`External/dav1d/CMakeFiles/dav1d_bitdepth_8.dir/__/__/test-suite-externals/dav1d/src/mc_tmpl.c.o`. 
diff --git a/External/dav1d/cli_config.h b/External/dav1d/cli_config.h new file mode 100644 index 0000000000..0d5925946a --- /dev/null +++ b/External/dav1d/cli_config.h @@ -0,0 +1,3 @@ +#pragma once + +#define HAVE_XXHASH_H 0 diff --git a/External/dav1d/config.asm b/External/dav1d/config.asm new file mode 100644 index 0000000000..83ada0557b --- /dev/null +++ b/External/dav1d/config.asm @@ -0,0 +1,11 @@ +%ifdef ARCH_X86_64 +%define ARCH_X86_32 0 +%define STACK_ALIGNMENT 16 +%else +%define ARCH_X86_32 1 +%define ARCH_X86_64 0 +%define STACK_ALIGNMENT 4 +%endif +%define FORCE_VEX_ENCODING 0 +%define PIC 1 +%define private_prefix dav1d diff --git a/External/dav1d/config.h b/External/dav1d/config.h new file mode 100644 index 0000000000..c0e6080f7f --- /dev/null +++ b/External/dav1d/config.h @@ -0,0 +1,169 @@ +#pragma once + +#ifdef __aarch64__ +#define ARCH_AARCH64 1 +#define AS_ARCH_LEVEL armv8.6-a+crc +#define HAVE_DOTPROD 1 +#define HAVE_I8MM 1 +#define HAVE_SVE 1 +#define HAVE_SVE2 1 +#elif defined(__arm__) +#define ARCH_ARM 1 +#elif defined(__i386__) && !defined(NO_X86ASM) +#define ARCH_X86 1 +#define ARCH_X86_32 1 +#elif defined(__x86_64__) && !defined(NO_X86ASM) +#define ARCH_X86 1 +#define ARCH_X86_64 1 +#endif + +#ifndef ARCH_AARCH64 +#define ARCH_AARCH64 0 +#endif +#ifndef ARCH_ARM +#define ARCH_ARM 0 +#endif +#ifndef ARCH_X86 +#define ARCH_X86 0 +#endif +#ifndef ARCH_X86_32 +#define ARCH_X86_32 0 +#endif +#ifndef ARCH_X86_64 +#define ARCH_X86_64 0 +#endif + +#define ARCH_LOONGARCH 0 + +#define ARCH_LOONGARCH32 0 + +#define ARCH_LOONGARCH64 0 + +#define ARCH_PPC64LE 0 + +#define ARCH_RISCV 0 + +#define ARCH_RV32 0 + +#define ARCH_RV64 0 + + +#define CONFIG_16BPC 1 + +#define CONFIG_8BPC 1 + +#define CONFIG_LOG 1 + +#define CONFIG_MACOS_KPERF 0 + +#ifdef __BYTE_ORDER__ +# if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +# define ENDIANNESS_BIG 1 +# else +# define ENDIANNESS_BIG 0 +# endif +#elif defined(_WIN32) +# define ENDIANNESS_BIG 0 +#else +# error Unknown 
endianness. +#endif + +#define HAVE_ALIGNED_ALLOC 0 + +#ifdef NO_X86ASM +#define HAVE_ASM 0 +#else +#define HAVE_ASM 1 +#endif + +#define HAVE_AS_ARCHEXT_DOTPROD_DIRECTIVE 0 + +#define HAVE_AS_ARCHEXT_I8MM_DIRECTIVE 0 + +#define HAVE_AS_ARCHEXT_SVE2_DIRECTIVE 1 + +#define HAVE_AS_ARCHEXT_SVE_DIRECTIVE 1 + +#define HAVE_AS_ARCH_DIRECTIVE 1 + +#define HAVE_AS_FUNC 0 + +#define HAVE_C11_GENERIC 1 + +#ifdef __linux__ +#define HAVE_CLOCK_GETTIME 1 +#else +#define HAVE_CLOCK_GETTIME 0 +#endif + +#define HAVE_DLSYM 0 + +#define HAVE_ELF_AUX_INFO 0 + +#ifdef __linux__ +#define HAVE_GETAUXVAL 1 +#else +#define HAVE_GETAUXVAL 0 +#endif + +#ifdef _WIN32 +#define HAVE_IO_H 1 +#else +#define HAVE_IO_H 0 +#endif + +#define HAVE_MEMALIGN 0 + +#ifdef _WIN32 +#define HAVE_POSIX_MEMALIGN 0 +#else +#define HAVE_POSIX_MEMALIGN 1 +#endif + +#ifdef __linux__ +#define HAVE_PTHREAD_GETAFFINITY_NP 1 +#else +#define HAVE_PTHREAD_GETAFFINITY_NP 0 +#endif + +#define HAVE_PTHREAD_NP_H 0 + +#ifdef __linux__ +#define HAVE_PTHREAD_SETAFFINITY_NP 1 +#else +#define HAVE_PTHREAD_SETAFFINITY_NP 0 +#endif + +#define HAVE_PTHREAD_SETNAME_NP 1 + +#define HAVE_PTHREAD_SET_NAME_NP 0 + +#define HAVE_SYS_TYPES_H 1 + +#ifdef _WIN32 +#define HAVE_UNISTD_H 0 +#else +#define HAVE_UNISTD_H 1 +#endif + +#if !defined(PIC) && (defined(__PIC__) || defined(__pic__)) +#define PIC 3 +#endif + +#if defined(__APPLE__) || (defined(_WIN32) && defined(__i386__)) +#define PREFIX 1 +#endif + +#define TRIM_DSP_FUNCTIONS 0 + +#ifdef _WIN32 +#define UNICODE 1 +#define _CRT_DECLARE_NONSTDC_NAMES 1 +#define _FILE_OFFSET_BITS 64 +#define _UNICODE 1 +#endif + +#ifdef _MSC_VER +#define fseeko _fseeki64 +#define ftello _ftelli64 +#endif diff --git a/External/dav1d/vcs_version.h b/External/dav1d/vcs_version.h new file mode 100644 index 0000000000..655fd5725c --- /dev/null +++ b/External/dav1d/vcs_version.h @@ -0,0 +1 @@ +#define DAV1D_VERSION "1.5.0"