From ac023eaf85b85c6a08b4142e96896aa23c3ffb43 Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Fri, 8 Sep 2023 14:26:04 +0000 Subject: [PATCH 1/2] [mlir][test][gpu] Migrate CUDA tests to the TargetAttr compilation workflow Migrate tests referencing `gpu-to-cubin` to the new compilation workflow using `TargetAttrs`. The `test-lower-to-nvvm` pass pipeline was modified to use the new compilation workflow to simplify the introduction of future tests. The `createLowerGpuOpsToNVVMOpsPass` function was removed, as it didn't allow for passing all options available in the `ConvertGpuOpsToNVVMOp` pass. The LIT configuration was modified to support CUDA tests only when the `ptxas` & `fatbinary` tools are present. --- .../mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h | 12 +-- mlir/include/mlir/Conversion/Passes.td | 1 - .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 11 +-- .../Pipelines/SparseTensorPipelines.cpp | 2 +- .../SparseTensor/GPU/CUDA/dump-ptx.mlir | 3 +- .../GPU/CUDA/sparse-mma-2-4-f16.mlir | 5 +- .../GPU/CUDA/test-reduction-distribute.mlir | 4 +- .../Vector/GPU/CUDA/test-warp-distribute.mlir | 12 +-- .../GPU/CUDA/TensorCore/wmma-matmul-f16.mlir | 4 +- .../TensorCore/wmma-matmul-f32-bare-ptr.mlir | 4 +- .../GPU/CUDA/TensorCore/wmma-matmul-f32.mlir | 4 +- .../Integration/GPU/CUDA/all-reduce-and.mlir | 8 +- .../Integration/GPU/CUDA/all-reduce-max.mlir | 4 +- .../Integration/GPU/CUDA/all-reduce-min.mlir | 4 +- .../Integration/GPU/CUDA/all-reduce-op.mlir | 4 +- .../Integration/GPU/CUDA/all-reduce-or.mlir | 4 +- .../GPU/CUDA/all-reduce-region.mlir | 4 +- .../Integration/GPU/CUDA/all-reduce-xor.mlir | 4 +- mlir/test/Integration/GPU/CUDA/async.mlir | 4 +- .../Integration/GPU/CUDA/gpu-to-cubin.mlir | 5 +- mlir/test/Integration/GPU/CUDA/lit.local.cfg | 2 +- .../GPU/CUDA/multiple-all-reduce.mlir | 4 +- mlir/test/Integration/GPU/CUDA/printf.mlir | 3 +- mlir/test/Integration/GPU/CUDA/shuffle.mlir | 4 +- .../Integration/GPU/CUDA/two-modules.mlir | 4 +- mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp | 76 +++++++++---------- mlir/test/lit.cfg.py | 10 +++ 27 files changed, 73 insertions(+), 133 deletions(-) diff --git a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h index 46f29c6dd8b92..e0f4c71051e50 100644 --- a/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h +++ b/mlir/include/mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h @@ -16,9 +16,7 @@ namespace mlir { class LLVMTypeConverter; class ConversionTarget; class RewritePatternSet; - -template -class OperationPass; +class Pass; namespace gpu { class GPUModuleOp; @@ -45,14 +43,6 @@ void populateGpuSubgroupReduceOpLoweringPattern(LLVMTypeConverter &converter, /// Collect a set of patterns to convert WMMA ops from GPU dialect to NVVM. void populateGpuWMMAToNVVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns); - -/// Creates a pass that lowers GPU dialect operations to NVVM counterparts. The -/// index bitwidth used for the lowering of the device side index computations -/// is configurable. 
-std::unique_ptr> createLowerGpuOpsToNVVMOpsPass( - unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout, - bool hasRedux = false); - } // namespace mlir #endif // MLIR_CONVERSION_GPUTONVVM_GPUTONVVMPASS_H_ diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index ed37abf85275b..3218760931b8c 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -486,7 +486,6 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> { def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> { let summary = "Generate NVVM operations for gpu operations"; - let constructor = "mlir::createLowerGpuOpsToNVVMOpsPass()"; let dependentDialects = [ "cf::ControlFlowDialect", "memref::MemRefDialect", diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 06469dc82b3fc..764b6a779b98c 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -210,11 +210,7 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern { /// code. struct LowerGpuOpsToNVVMOpsPass : public impl::ConvertGpuOpsToNVVMOpsBase { - LowerGpuOpsToNVVMOpsPass() = default; - LowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux = false) { - this->indexBitwidth = indexBitwidth; - this->hasRedux = hasRedux; - } + using Base::Base; void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); @@ -378,8 +374,3 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, "__nv_tanh"); populateOpPatterns(converter, patterns, "__nv_tanf", "__nv_tan"); } - -std::unique_ptr> -mlir::createLowerGpuOpsToNVVMOpsPass(unsigned indexBitwidth, bool hasRedux) { - return std::make_unique(indexBitwidth, hasRedux); -} diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index a7fd5a25e6831..24c4c4c43a93d 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -52,7 +52,7 @@ void mlir::sparse_tensor::buildSparseCompiler( pm.addPass(createSparseGPUCodegenPass()); pm.addNestedPass(createStripDebugInfoPass()); pm.addNestedPass(createConvertSCFToCFPass()); - pm.addNestedPass(createLowerGpuOpsToNVVMOpsPass()); + pm.addNestedPass(createConvertGpuOpsToNVVMOps()); } // TODO(springerm): Add sparse support to the BufferDeallocation pass and add diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir index 2c1ae3ee840d0..0cb06b7bf1d20 100644 --- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/dump-ptx.mlir @@ -1,6 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{dump-ptx}))' \ +// RUN: | mlir-opt -test-lower-to-nvvm -debug-only=serialize-to-isa \ // RUN: 2>&1 | FileCheck %s // CHECK: Generated by LLVM NVPTX Back-End diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir index 8eb90fd3ca994..80972f244ec02 100644 --- 
a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir @@ -2,10 +2,9 @@ // NOTE: this test requires gpu-sm80 // // RUN: mlir-opt \ -// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" \ +// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \ // RUN: %s \ -// RUN: | mlir-opt --convert-vector-to-scf --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \ +// RUN: | mlir-opt --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir index 8571c5ca5f3dc..8c991493a2b01 100644 --- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir +++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir @@ -1,9 +1,7 @@ // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\ // RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\ // RUN: mlir-opt -lower-affine -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm \ -// RUN: -convert-arith-to-llvm -gpu-kernel-outlining |\ -// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\ -// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\ +// RUN: -convert-arith-to-llvm -test-lower-to-nvvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_cuda_runtime \ // RUN: -shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir index c671c1843862f..f26c18c4ae3dd 100644 --- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir +++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir @@ -2,9 +2,7 @@ // everything on the same thread. 
// RUN: mlir-opt %s -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \ // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ -// RUN: -gpu-kernel-outlining |\ -// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\ -// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\ +// RUN: -test-lower-to-nvvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_cuda_runtime \ // RUN: -shared-libs=%mlir_c_runner_utils \ @@ -15,9 +13,7 @@ // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write" \ // RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \ // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ -// RUN: -gpu-kernel-outlining |\ -// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\ -// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\ +// RUN: -test-lower-to-nvvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_cuda_runtime \ // RUN: -shared-libs=%mlir_c_runner_utils \ @@ -27,9 +23,7 @@ // RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" \ // RUN: -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if -canonicalize | \ // RUN: mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ -// RUN: -gpu-kernel-outlining |\ -// RUN: mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin))' |\ -// RUN: mlir-opt -gpu-to-llvm -reconcile-unrealized-casts |\ +// RUN: -test-lower-to-nvvm | \ // RUN: mlir-cpu-runner -e main -entry-point-result=void \ // RUN: -shared-libs=%mlir_cuda_runtime \ // RUN: -shared-libs=%mlir_c_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir index 535ba52d66f00..591bf1b4fd182 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f16.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \ -// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir index c4ca46521eeb4..51bd23f817b33 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32-bare-ptr.mlir @@ -3,9 +3,7 @@ // Similar to the wmma-matmul-f32 but but with the memref bare pointer lowering convention. // This test also uses gpu.memcpy operations (instead of gpu.host_register). 
// RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin{chip=sm_70}))' \ -// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm="use-bare-pointers-for-host=1 use-bare-pointers-for-kernels=1" \ +// RUN: | mlir-opt -test-lower-to-nvvm="host-bare-ptr-calling-convention=1 kernel-bare-ptr-calling-convention=1 cubin-chip=sm_70" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --entry-point-result=void \ diff --git a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir index ae410dce281b1..0307b3d504be9 100644 --- a/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir +++ b/mlir/test/Integration/GPU/CUDA/TensorCore/wmma-matmul-f32.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin{chip=sm_70}))' \ -// RUN: | mlir-opt --convert-scf-to-cf -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_70" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir index f4324a14a36b6..b131b8682ddee 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-and.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ @@ -10,9 +8,7 @@ // Same as above but with the memref bare pointer lowering convention. 
// RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm{use-bare-ptr-memref-call-conv=1},gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm="use-bare-pointers-for-kernels=1" \ +// RUN: | mlir-opt -test-lower-to-nvvm="kernel-bare-ptr-calling-convention=1" \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir index 0a8d38f145279..155423db7e050 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-max.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir index bcd785d35291c..e5047b6efa3bf 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-min.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir index aa4b0e8820479..163e9fdba60c1 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-op.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir index 2e7d046c39214..381db2639c371 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-or.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir index 32cfa27c8988a..23c6c117e67f3 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-region.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt 
-pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir index 30767b9495b6f..3c5a100b5b90d 100644 --- a/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir +++ b/mlir/test/Integration/GPU/CUDA/all-reduce-xor.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/async.mlir b/mlir/test/Integration/GPU/CUDA/async.mlir index e6dd91ace9743..d2a5127a34c3b 100644 --- a/mlir/test/Integration/GPU/CUDA/async.mlir +++ b/mlir/test/Integration/GPU/CUDA/async.mlir @@ -1,7 +1,7 @@ // RUN: mlir-opt %s \ // RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \ +// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \ +// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm -gpu-module-to-binary \ // RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \ // RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \ // RUN: | mlir-cpu-runner \ diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir index afcb674858c86..a5d04f7322b49 100644 --- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir +++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir @@ -1,8 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -convert-vector-to-scf -convert-scf-to-cf -convert-cf-to-llvm -convert-vector-to-llvm -convert-arith-to-llvm \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm -reconcile-unrealized-casts \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg index acb8dd43f50b4..11d3e363d16f9 100644 --- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg +++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg @@ -1,2 +1,2 @@ -if not config.enable_cuda_runner: +if not config.enable_cuda_runner_tests: config.unsupported = True diff --git a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir index 444e2877c822c..7657bf4732d32 100644 --- a/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir +++ b/mlir/test/Integration/GPU/CUDA/multiple-all-reduce.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // 
RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/printf.mlir b/mlir/test/Integration/GPU/CUDA/printf.mlir index fce773974d5ba..1a35d1e78b094 100644 --- a/mlir/test/Integration/GPU/CUDA/printf.mlir +++ b/mlir/test/Integration/GPU/CUDA/printf.mlir @@ -1,6 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/shuffle.mlir b/mlir/test/Integration/GPU/CUDA/shuffle.mlir index 6a784ca32f9ef..40fcea857d5b4 100644 --- a/mlir/test/Integration/GPU/CUDA/shuffle.mlir +++ b/mlir/test/Integration/GPU/CUDA/shuffle.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/Integration/GPU/CUDA/two-modules.mlir b/mlir/test/Integration/GPU/CUDA/two-modules.mlir index 5f6e5d75aff5b..5a9acdf3d8da6 100644 --- a/mlir/test/Integration/GPU/CUDA/two-modules.mlir +++ b/mlir/test/Integration/GPU/CUDA/two-modules.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s \ -// RUN: | mlir-opt -gpu-kernel-outlining \ -// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \ -// RUN: | mlir-opt -gpu-to-llvm \ +// RUN: | mlir-opt -test-lower-to-nvvm \ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%mlir_cuda_runtime \ // RUN: --shared-libs=%mlir_runner_utils \ diff --git a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp index 5db6f56fb4b38..99e19dae0d72b 100644 --- a/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp +++ b/mlir/test/lib/Dialect/GPU/TestLowerToNVVM.cpp @@ -65,7 +65,7 @@ struct TestLowerToNVVMOptions llvm::cl::init("nvptx64-nvidia-cuda")}; PassOptions::Option cubinChip{ *this, "cubin-chip", llvm::cl::desc("Chip to use to serialize to cubin."), - llvm::cl::init("sm_80")}; + llvm::cl::init("sm_50")}; PassOptions::Option cubinFeatures{ *this, "cubin-features", llvm::cl::desc("Features to use to serialize to cubin."), @@ -126,13 +126,14 @@ void buildGpuPassPipeline(OpPassManager &pm, // TODO: C++20 designated initializers. // The following pass is inconsistent. - // ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions; - // convertGpuOpsToNVVMOpsOptions.indexBitwidth = - // options.kernelIndexBitWidth; + // TODO: fix inconsistence. + ConvertGpuOpsToNVVMOpsOptions convertGpuOpsToNVVMOpsOptions; + convertGpuOpsToNVVMOpsOptions.useBarePtrCallConv = + options.kernelUseBarePtrCallConv; + convertGpuOpsToNVVMOpsOptions.indexBitwidth = options.kernelIndexBitWidth; + convertGpuOpsToNVVMOpsOptions.useOpaquePointers = true; pm.addNestedPass( - // TODO: fix inconsistence. - createLowerGpuOpsToNVVMOpsPass(/*indexBitWidth=*/ - options.kernelIndexBitWidth)); + createConvertGpuOpsToNVVMOps(convertGpuOpsToNVVMOpsOptions)); // TODO: C++20 designated initializers. 
ConvertNVGPUToNVVMPassOptions convertNVGPUToNVVMPassOptions; @@ -141,22 +142,6 @@ void buildGpuPassPipeline(OpPassManager &pm, createConvertNVGPUToNVVMPass(convertNVGPUToNVVMPassOptions)); pm.addNestedPass(createConvertSCFToCFPass()); - // TODO: C++20 designated initializers. - GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions; - // Note: hostBarePtrCallConv must be false for now otherwise - // gpu::HostRegister is ill-defined: it wants unranked memrefs but can't - // lower the to bare ptr. - gpuToLLVMConversionOptions.hostBarePtrCallConv = - options.hostUseBarePtrCallConv; - gpuToLLVMConversionOptions.kernelBarePtrCallConv = - options.kernelUseBarePtrCallConv; - gpuToLLVMConversionOptions.useOpaquePointers = true; - - // TODO: something useful here. - // gpuToLLVMConversionOptions.gpuBinaryAnnotation = ""; - pm.addNestedPass( - createGpuToLLVMConversionPass(gpuToLLVMConversionOptions)); - // Convert vector to LLVM (always needed). // TODO: C++20 designated initializers. ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions; @@ -170,11 +155,6 @@ void buildGpuPassPipeline(OpPassManager &pm, // Finally we can reconcile unrealized casts. pm.addNestedPass(createReconcileUnrealizedCastsPass()); - -#if MLIR_GPU_TO_CUBIN_PASS_ENABLE - pm.addNestedPass(createGpuSerializeToCubinPass( - options.cubinTriple, options.cubinChip, options.cubinFeatures)); -#endif // MLIR_GPU_TO_CUBIN_PASS_ENABLE } void buildLowerToNVVMPassPipeline(OpPassManager &pm, @@ -251,22 +231,16 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm, //===----------------------------------------------------------------------===// // Host post-GPUModule-specific stuff. //===----------------------------------------------------------------------===// - // Convert vector to LLVM (always needed). + // Attach an NVVM target to all the GPU modules with the provided target + // options. // TODO: C++20 designated initializers. - ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions; - convertVectorToLLVMPassOptions.reassociateFPReductions = true; - pm.addNestedPass( - createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions)); + GpuNVVMAttachTargetOptions nvvmTargetOptions; + nvvmTargetOptions.triple = options.cubinTriple; + nvvmTargetOptions.chip = options.cubinChip; + nvvmTargetOptions.features = options.cubinFeatures; + pm.addPass(createGpuNVVMAttachTarget(nvvmTargetOptions)); - ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3; - // Must be 64b on the host, things don't compose properly around - // gpu::LaunchOp and gpu::HostRegisterOp. - // TODO: fix GPU layering. - convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth; - pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3)); - - // This must happen after cubin translation otherwise gpu.launch_func is - // illegal if no cubin annotation is present. + // Convert GPU to LLVM. // TODO: C++20 designated initializers. GpuToLLVMConversionPassOptions gpuToLLVMConversionOptions; // Note: hostBarePtrCallConv must be false for now otherwise @@ -277,10 +251,28 @@ void buildLowerToNVVMPassPipeline(OpPassManager &pm, gpuToLLVMConversionOptions.kernelBarePtrCallConv = options.kernelUseBarePtrCallConv; gpuToLLVMConversionOptions.useOpaquePointers = true; + // TODO: something useful here. // gpuToLLVMConversionOptions.gpuBinaryAnnotation = ""; pm.addPass(createGpuToLLVMConversionPass(gpuToLLVMConversionOptions)); + // Serialize all GPU modules to binaries. 
+ pm.addPass(createGpuModuleToBinaryPass()); + + // Convert vector to LLVM (always needed). + // TODO: C++20 designated initializers. + ConvertVectorToLLVMPassOptions convertVectorToLLVMPassOptions; + convertVectorToLLVMPassOptions.reassociateFPReductions = true; + pm.addNestedPass( + createConvertVectorToLLVMPass(convertVectorToLLVMPassOptions)); + + ConvertIndexToLLVMPassOptions convertIndexToLLVMPassOpt3; + // Must be 64b on the host, things don't compose properly around + // gpu::LaunchOp and gpu::HostRegisterOp. + // TODO: fix GPU layering. + convertIndexToLLVMPassOpt3.indexBitwidth = options.hostIndexBitWidth; + pm.addPass(createConvertIndexToLLVMPass(convertIndexToLLVMPassOpt3)); + // Convert Func to LLVM (always needed). // TODO: C++20 designated initializers. ConvertFuncToLLVMPassOptions convertFuncToLLVMPassOptions2; diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index f265ac794c6f6..46196c7a1f0ee 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -212,8 +212,18 @@ def have_host_jit_feature_support(feature_name): if have_host_jit_feature_support("jit"): config.available_features.add("host-supports-jit") +# Find if the host has the required CUDA toolkit tools to run the integration tests. +# This tools are required for compiling GPU modules into fatbins. +def have_host_required_cuda_tools(): + return lit.util.which("ptxas") != None and lit.util.which("fatbinary") != None + if config.run_cuda_tests: config.available_features.add("host-supports-nvptx") + if have_host_required_cuda_tools(): + config.available_features.add("host-supports-cuda-runner") + config.enable_cuda_runner_tests = True + else: + config.enable_cuda_runner_tests = False if config.run_rocm_tests: config.available_features.add("host-supports-amdgpu") From 49db3f1430e4a92dd35bbc196b18ef961935e890 Mon Sep 17 00:00:00 2001 From: Fabian Mora Date: Fri, 8 Sep 2023 20:02:24 +0000 Subject: [PATCH 2/2] Remove disabling the tests if CUDA tools cannot be found --- mlir/test/Integration/GPU/CUDA/lit.local.cfg | 2 +- mlir/test/lit.cfg.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg index 11d3e363d16f9..acb8dd43f50b4 100644 --- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg +++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg @@ -1,2 +1,2 @@ -if not config.enable_cuda_runner_tests: +if not config.enable_cuda_runner: config.unsupported = True diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index 46196c7a1f0ee..f265ac794c6f6 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -212,18 +212,8 @@ def have_host_jit_feature_support(feature_name): if have_host_jit_feature_support("jit"): config.available_features.add("host-supports-jit") -# Find if the host has the required CUDA toolkit tools to run the integration tests. -# This tools are required for compiling GPU modules into fatbins. -def have_host_required_cuda_tools(): - return lit.util.which("ptxas") != None and lit.util.which("fatbinary") != None - if config.run_cuda_tests: config.available_features.add("host-supports-nvptx") - if have_host_required_cuda_tools(): - config.available_features.add("host-supports-cuda-runner") - config.enable_cuda_runner_tests = True - else: - config.enable_cuda_runner_tests = False if config.run_rocm_tests: config.available_features.add("host-supports-amdgpu")
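
For reviewers, a rough before/after sketch of a typical test RUN pipeline under this change. This is only an illustration assembled from the hunks above; the chip and PTX feature values are examples taken from the sm_80 sparse test, not requirements of the workflow.

Old workflow, previously spelled out in the RUN lines:

  mlir-opt in.mlir \
    | mlir-opt -gpu-kernel-outlining \
    | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
    | mlir-opt -gpu-to-llvm

New TargetAttr-based workflow, either through the test pipeline:

  mlir-opt in.mlir \
    | mlir-opt -test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71"

or written out explicitly with the new passes (cf. async.mlir above, minus its async-specific passes):

  mlir-opt in.mlir \
    | mlir-opt -gpu-kernel-outlining \
    | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm),nvvm-attach-target)' \
    | mlir-opt -gpu-to-llvm -gpu-module-to-binary

Here nvvm-attach-target attaches the NVVM TargetAttr (triple/chip/features) to the GPU modules, gpu-to-llvm lowers the host-side GPU ops, and gpu-module-to-binary serializes the annotated modules, replacing the per-module gpu-to-cubin serialization pass.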