From 8473ab3a33be6b1de82bf12ad5eb563af15d80c7 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Mon, 3 Nov 2025 16:46:07 -0800 Subject: [PATCH 1/4] [flang][cuda] Add interfaces and lowering for atomicaddvector --- .../flang/Optimizer/Builder/IntrinsicCall.h | 2 + flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 50 +++++++++++++++++++ flang/module/cudadevice.f90 | 16 ++++++ flang/test/Lower/CUDA/cuda-atomicadd.cuf | 19 +++++++ 4 files changed, 87 insertions(+) create mode 100644 flang/test/Lower/CUDA/cuda-atomicadd.cuf diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 9f15ce68eb3d5..bbdef481a2085 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -190,6 +190,8 @@ struct IntrinsicLibrary { mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genAtomicAddR2(mlir::Type, llvm::ArrayRef); + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef); mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef); fir::ExtendedValue genAtomicCas(mlir::Type, llvm::ArrayRef); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 6ebd52dcd42ea..d329bd9f14cc5 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -290,6 +290,14 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, + {"atomicadd_r2x2", + &I::genAtomicAddVector, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x2", + &I::genAtomicAddVector, + {{{"a", asAddr}, {"v", asAddr}}}, + false}, {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, @@ -3168,6 +3176,48 @@ 
IntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, mlir::ArrayRef{0}); } +fir::ExtendedValue +IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, + llvm::ArrayRef args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({2}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto eleTy = fir::unwrapSequenceType(resultType); + auto loc = builder.getUnknownLoc(); + auto i32Ty = builder.getI32Type(); + auto vecTy = mlir::VectorType::get({2}, eleTy); + mlir::Type idxTy = builder.getIndexType(); + auto refTy = fir::ReferenceType::get(eleTy); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto add = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto r1 = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0)); + auto r2 = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1)); + auto c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); + auto c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); + fir::StoreOp::create(builder, loc, r1, c1); + fir::StoreOp::create(builder, 
loc, r2, c2); + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2); + return fir::ArrayBoxValue(res, {ext}); +} + mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, llvm::ArrayRef args) { assert(args.size() == 2); diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 7a764b589dc56..b1aef95cba8c9 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1178,6 +1178,22 @@ attributes(device) pure integer(4) function atomicaddr2(address, val) end function end interface + interface atomicaddvector + attributes(device) pure function atomicadd_r2x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + real(2), dimension(2) :: z + end function + + attributes(device) pure function atomicadd_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + interface atomicsub attributes(device) pure integer function atomicsubi(address, val) !dir$ ignore_tkr (d) address, (d) val diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf new file mode 100644 index 0000000000000..1669674e8d4ce --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -0,0 +1,19 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran atomicadd functions available in the cudadevice module + +attributes(global) subroutine atomicaddvector_r2() + real(2), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc} +! 
CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> + +attributes(global) subroutine atomicaddvector_r4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector From c5df064ad1d84a947ac06cb9db0877fcfa016870 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Mon, 3 Nov 2025 16:56:49 -0800 Subject: [PATCH 2/4] cleanup --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index d329bd9f14cc5..de3adfa0e8d40 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3186,12 +3186,10 @@ IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, if (mlir::isa(a.getType())) { a = fir::BoxAddrOp::create(builder, loc, a); } - auto eleTy = fir::unwrapSequenceType(resultType); - auto loc = builder.getUnknownLoc(); auto i32Ty = builder.getI32Type(); - auto vecTy = mlir::VectorType::get({2}, eleTy); + auto vecTy = mlir::VectorType::get({2}, resultType); mlir::Type idxTy = builder.getIndexType(); - auto refTy = fir::ReferenceType::get(eleTy); + auto refTy = fir::ReferenceType::get(resultType); auto zero = builder.createIntegerConstant(loc, idxTy, 0); auto one = builder.createIntegerConstant(loc, idxTy, 1); auto v1Coord = fir::CoordinateOp::create(builder, loc, refTy, From 5487e9e342427e33ab0670b9d979c3d9ce586d95 Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Mon, 3 Nov 2025 17:02:15 -0800 Subject: [PATCH 3/4] More cleanup --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 31 ++++++++++--------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp 
b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index de3adfa0e8d40..18217310e2707 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -3186,30 +3186,31 @@ IntrinsicLibrary::genAtomicAddVector(mlir::Type resultType, if (mlir::isa(a.getType())) { a = fir::BoxAddrOp::create(builder, loc, a); } - auto i32Ty = builder.getI32Type(); auto vecTy = mlir::VectorType::get({2}, resultType); - mlir::Type idxTy = builder.getIndexType(); auto refTy = fir::ReferenceType::get(resultType); - auto zero = builder.createIntegerConstant(loc, idxTy, 0); - auto one = builder.createIntegerConstant(loc, idxTy, 1); - auto v1Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), zero); - auto v2Coord = fir::CoordinateOp::create(builder, loc, refTy, - fir::getBase(args[1]), one); - auto v1 = fir::LoadOp::create(builder, loc, v1Coord); - auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); + mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + mlir::Value v1Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), zero); + mlir::Value v2Coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), one); + mlir::Value v1 = fir::LoadOp::create(builder, loc, v1Coord); + mlir::Value v2 = fir::LoadOp::create(builder, loc, v2Coord); mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); - auto add = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); - auto r1 = mlir::LLVM::ExtractElementOp::create( + mlir::Value add = + 
genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + mlir::Value r1 = mlir::LLVM::ExtractElementOp::create( builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 0)); - auto r2 = mlir::LLVM::ExtractElementOp::create( + mlir::Value r2 = mlir::LLVM::ExtractElementOp::create( builder, loc, add, builder.createIntegerConstant(loc, i32Ty, 1)); - auto c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); - auto c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); + mlir::Value c1 = fir::CoordinateOp::create(builder, loc, refTy, res, zero); + mlir::Value c2 = fir::CoordinateOp::create(builder, loc, refTy, res, one); fir::StoreOp::create(builder, loc, r1, c1); fir::StoreOp::create(builder, loc, r2, c2); mlir::Value ext = builder.createIntegerConstant(loc, idxTy, 2); From a63a96102c43d15acb593b3fdfab57759769104d Mon Sep 17 00:00:00 2001 From: Valentin Clement Date: Mon, 3 Nov 2025 18:27:51 -0800 Subject: [PATCH 4/4] Fix test --- flang/test/Lower/CUDA/cuda-atomicadd.cuf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf index 1669674e8d4ce..baa6cdb3d5869 100644 --- a/flang/test/Lower/CUDA/cuda-atomicadd.cuf +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -15,5 +15,5 @@ attributes(global) subroutine atomicaddvector_r4() tmp1 = atomicAddVector(a, tmp2) end subroutine -! CHECK-LABEL: func.func @_QPatomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc} -! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector +! CHECK-LABEL: func.func @_QPatomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32>