diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index e8fa70ebc39d8..bdc2d9cd9c6c4 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -287,7 +287,7 @@ mlir::ChangeResult LatticePoint::join(const AbstractDenseLattice &lattice) { void LatticePoint::print(llvm::raw_ostream &os) const { for (const auto &[value, state] : stateMap) { - os << value << ": "; + os << "\n * " << value << ": "; ::print(os, state); } } @@ -361,6 +361,13 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op, } else if (mlir::isa(op)) { assert(op->getNumOperands() == 1 && "fir.freemem has one operand"); mlir::Value operand = op->getOperand(0); + + // Note: StackArrays is scheduled in the pass pipeline after lowering hlfir + // to fir. Therefore, we only need to handle `fir::DeclareOp`s. + if (auto declareOp = + llvm::dyn_cast_if_present(operand.getDefiningOp())) + operand = declareOp.getMemref(); + std::optional operandState = before.get(operand); if (operandState && *operandState == AllocationState::Allocated) { // don't tag things not allocated in this function as freed, so that we @@ -452,6 +459,9 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) { }; func->walk([&](mlir::func::ReturnOp child) { joinOperationLattice(child); }); func->walk([&](fir::UnreachableOp child) { joinOperationLattice(child); }); + func->walk( + [&](mlir::omp::TerminatorOp child) { joinOperationLattice(child); }); + llvm::DenseSet freedValues; point.appendFreedValues(freedValues); @@ -518,9 +528,18 @@ AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem, // remove freemem operations llvm::SmallVector erases; - for (mlir::Operation *user : allocmem.getOperation()->getUsers()) + for (mlir::Operation *user : allocmem.getOperation()->getUsers()) { + if (auto declareOp = mlir::dyn_cast_if_present(user)) { + for (mlir::Operation *user : declareOp->getUsers()) { + if (mlir::isa(user)) + erases.push_back(user); + } + } + if (mlir::isa(user)) erases.push_back(user); + } + // now we are done iterating the users, it is safe to mutate them for (mlir::Operation *erase : erases) rewriter.eraseOp(erase); @@ -633,9 +652,19 @@ AllocMemConversion::findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc) { // find freemem ops llvm::SmallVector freeOps; - for (mlir::Operation *user : oldAllocOp->getUsers()) + + for (mlir::Operation *user : oldAllocOp->getUsers()) { + if (auto declareOp = mlir::dyn_cast_if_present(user)) { + for (mlir::Operation *user : declareOp->getUsers()) { + if (mlir::isa(user)) + freeOps.push_back(user); + } + } + if (mlir::isa(user)) freeOps.push_back(user); + } + assert(freeOps.size() && "DFA should only return freed memory"); // Don't attempt to reason about a stacksave/stackrestore between different @@ -717,12 +746,23 @@ void AllocMemConversion::insertStackSaveRestore( mlir::SymbolRefAttr stackRestoreSym = builder.getSymbolRefAttr(stackRestoreFn.getName()); + auto createStackRestoreCall = [&](mlir::Operation *user) { + builder.setInsertionPoint(user); + builder.create(user->getLoc(), + stackRestoreFn.getFunctionType().getResults(), + stackRestoreSym, mlir::ValueRange{sp}); + }; + for (mlir::Operation *user : oldAlloc->getUsers()) { + if (auto declareOp = mlir::dyn_cast_if_present(user)) { + for (mlir::Operation *user : declareOp->getUsers()) { + if (mlir::isa(user)) + createStackRestoreCall(user); + } + } + if (mlir::isa(user)) { - builder.setInsertionPoint(user); - builder.create(user->getLoc(), - stackRestoreFn.getFunctionType().getResults(), - stackRestoreSym, mlir::ValueRange{sp}); + createStackRestoreCall(user); } } diff --git a/flang/test/Transforms/stack-arrays-hlfir.f90 b/flang/test/Transforms/stack-arrays-hlfir.f90 new file mode 100644 index 0000000000000..50261b3078466 --- /dev/null +++ b/flang/test/Transforms/stack-arrays-hlfir.f90 @@ -0,0 +1,55 @@ +! Similar to stack-arrays.f90; i.e. both test the stack-arrays pass for different +! kinds of supported inputs. This one differs in that it takes the hlfir lowering +! path in flag rather than the fir one. For example, temp arrays are lowered +! differently in hlfir vs. fir and the IR that reaches the stack arrays pass looks +! quite different. + + +! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - \ +! RUN: | fir-opt --lower-hlfir-ordered-assignments \ +! RUN: --bufferize-hlfir \ +! RUN: --convert-hlfir-to-fir \ +! RUN: --array-value-copy \ +! RUN: --stack-arrays \ +! RUN: | FileCheck %s + +subroutine temp_array + implicit none + integer (8) :: lV + integer (8), dimension (2) :: iaVS + + lV = 202 + + iaVS = [lV, lV] +end subroutine temp_array +! CHECK-LABEL: func.func @_QPtemp_array{{.*}} { +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array<2xi64> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: return +! CHECK-NEXT: } + +subroutine omp_temp_array + implicit none + integer (8) :: lV + integer (8), dimension (2) :: iaVS + + lV = 202 + + !$omp target + iaVS = [lV, lV] + !$omp end target +end subroutine omp_temp_array +! CHECK-LABEL: func.func @_QPomp_temp_array{{.*}} { +! CHECK: omp.target {{.*}} { +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: fir.alloca !fir.array<2xi64> +! CHECK-NOT: fir.allocmem +! CHECK-NOT: fir.freemem +! CHECK: omp.terminator +! CHECK-NEXT: } +! CHECK: return +! CHECK-NEXT: } diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir index a2ffe555091eb..45c22c15f7995 100644 --- a/flang/test/Transforms/stack-arrays.fir +++ b/flang/test/Transforms/stack-arrays.fir @@ -339,13 +339,10 @@ func.func @omp_placement1() { return } // CHECK: func.func @omp_placement1() { +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array<42xi32> +// CHECK-NEXT: %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref>) -> !fir.heap> // CHECK-NEXT: omp.sections { // CHECK-NEXT: omp.section { -// CHECK-NEXT: %[[MEM:.*]] = fir.allocmem !fir.array<42xi32> -// TODO: this allocation should be moved to the stack. Unfortunately, the data -// flow analysis fails to propogate the lattice out of the omp region to the -// return satement. -// CHECK-NEXT: fir.freemem %[[MEM]] : !fir.heap> // CHECK-NEXT: omp.terminator // CHECK-NEXT: } // CHECK-NEXT: omp.terminator @@ -369,3 +366,37 @@ func.func @stop_terminator() { // CHECK-NEXT: %[[NONE:.*]] = fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> none // CHECK-NEXT: fir.unreachable // CHECK-NEXT: } + + +// check that stack allocations that use fir.declare which must be placed in loops +// use stacksave +func.func @placement_loop_declare() { + %c1 = arith.constant 1 : index + %c1_i32 = fir.convert %c1 : (index) -> i32 + %c2 = arith.constant 2 : index + %c10 = arith.constant 10 : index + %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) { + %3 = arith.addi %c1, %c2 : index + // operand is now available + %4 = fir.allocmem !fir.array, %3 + %5 = fir.declare %4 {uniq_name = "temp"} : (!fir.heap>) -> !fir.heap> + // ... + fir.freemem %5 : !fir.heap> + fir.result %3, %c1_i32 : index, i32 + } + return +} +// CHECK: func.func @placement_loop_declare() { +// CHECK-NEXT: %[[C1:.*]] = arith.constant 1 : index +// CHECK-NEXT: %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32 +// CHECK-NEXT: %[[C2:.*]] = arith.constant 2 : index +// CHECK-NEXT: %[[C10:.*]] = arith.constant 10 : index +// CHECK-NEXT: fir.do_loop +// CHECK-NEXT: %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index +// CHECK-NEXT: %[[SP:.*]] = fir.call @llvm.stacksave.p0() : () -> !fir.ref +// CHECK-NEXT: %[[MEM:.*]] = fir.alloca !fir.array, %[[SUM]] +// CHECK: fir.call @llvm.stackrestore.p0(%[[SP]]) +// CHECK-NEXT: fir.result +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: }