diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp
index e8fa70ebc39d8..bdc2d9cd9c6c4 100644
--- a/flang/lib/Optimizer/Transforms/StackArrays.cpp
+++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -287,7 +287,7 @@ mlir::ChangeResult LatticePoint::join(const AbstractDenseLattice &lattice) {
 
 void LatticePoint::print(llvm::raw_ostream &os) const {
   for (const auto &[value, state] : stateMap) {
-    os << value << ": ";
+    os << "\n * " << value << ": ";
     ::print(os, state);
   }
 }
@@ -361,6 +361,13 @@ void AllocationAnalysis::visitOperation(mlir::Operation *op,
   } else if (mlir::isa<fir::FreeMemOp>(op)) {
     assert(op->getNumOperands() == 1 && "fir.freemem has one operand");
     mlir::Value operand = op->getOperand(0);
+
+    // Note: StackArrays is scheduled in the pass pipeline after lowering hlfir
+    // to fir. Therefore, we only need to handle `fir::DeclareOp`s.
+    if (auto declareOp =
+            llvm::dyn_cast_if_present<fir::DeclareOp>(operand.getDefiningOp()))
+      operand = declareOp.getMemref();
+
     std::optional<AllocationState> operandState = before.get(operand);
     if (operandState && *operandState == AllocationState::Allocated) {
       // don't tag things not allocated in this function as freed, so that we
@@ -452,6 +459,9 @@ StackArraysAnalysisWrapper::analyseFunction(mlir::Operation *func) {
   };
   func->walk([&](mlir::func::ReturnOp child) { joinOperationLattice(child); });
   func->walk([&](fir::UnreachableOp child) { joinOperationLattice(child); });
+  func->walk(
+      [&](mlir::omp::TerminatorOp child) { joinOperationLattice(child); });
+
   llvm::DenseSet<mlir::Value> freedValues;
   point.appendFreedValues(freedValues);
 
@@ -518,9 +528,18 @@ AllocMemConversion::matchAndRewrite(fir::AllocMemOp allocmem,
 
   // remove freemem operations
   llvm::SmallVector<mlir::Operation *> erases;
-  for (mlir::Operation *user : allocmem.getOperation()->getUsers())
+  for (mlir::Operation *user : allocmem.getOperation()->getUsers()) {
+    if (auto declareOp = mlir::dyn_cast_if_present<fir::DeclareOp>(user)) {
+      for (mlir::Operation *user : declareOp->getUsers()) {
+        if (mlir::isa<fir::FreeMemOp>(user))
+          erases.push_back(user);
+      }
+    }
+
     if (mlir::isa<fir::FreeMemOp>(user))
       erases.push_back(user);
+  }
+
   // now we are done iterating the users, it is safe to mutate them
   for (mlir::Operation *erase : erases)
     rewriter.eraseOp(erase);
@@ -633,9 +652,19 @@ AllocMemConversion::findAllocaLoopInsertionPoint(fir::AllocMemOp &oldAlloc) {
 
   // find freemem ops
   llvm::SmallVector<mlir::Operation *, 1> freeOps;
-  for (mlir::Operation *user : oldAllocOp->getUsers())
+
+  for (mlir::Operation *user : oldAllocOp->getUsers()) {
+    if (auto declareOp = mlir::dyn_cast_if_present<fir::DeclareOp>(user)) {
+      for (mlir::Operation *user : declareOp->getUsers()) {
+        if (mlir::isa<fir::FreeMemOp>(user))
+          freeOps.push_back(user);
+      }
+    }
+
     if (mlir::isa<fir::FreeMemOp>(user))
       freeOps.push_back(user);
+  }
+
   assert(freeOps.size() && "DFA should only return freed memory");
 
   // Don't attempt to reason about a stacksave/stackrestore between different
@@ -717,12 +746,23 @@ void AllocMemConversion::insertStackSaveRestore(
   mlir::SymbolRefAttr stackRestoreSym =
       builder.getSymbolRefAttr(stackRestoreFn.getName());
 
+  auto createStackRestoreCall = [&](mlir::Operation *user) {
+    builder.setInsertionPoint(user);
+    builder.create<fir::CallOp>(user->getLoc(),
+                                stackRestoreFn.getFunctionType().getResults(),
+                                stackRestoreSym, mlir::ValueRange{sp});
+  };
+
   for (mlir::Operation *user : oldAlloc->getUsers()) {
+    if (auto declareOp = mlir::dyn_cast_if_present<fir::DeclareOp>(user)) {
+      for (mlir::Operation *user : declareOp->getUsers()) {
+        if (mlir::isa<fir::FreeMemOp>(user))
+          createStackRestoreCall(user);
+      }
+    }
+
     if (mlir::isa<fir::FreeMemOp>(user)) {
-      builder.setInsertionPoint(user);
-      builder.create<fir::CallOp>(user->getLoc(),
-                                  stackRestoreFn.getFunctionType().getResults(),
-                                  stackRestoreSym, mlir::ValueRange{sp});
+      createStackRestoreCall(user);
     }
   }
 
diff --git a/flang/test/Transforms/stack-arrays-hlfir.f90 b/flang/test/Transforms/stack-arrays-hlfir.f90
new file mode 100644
index 0000000000000..50261b3078466
--- /dev/null
+++ b/flang/test/Transforms/stack-arrays-hlfir.f90
@@ -0,0 +1,55 @@
+! Similar to stack-arrays.f90; i.e. both test the stack-arrays pass for different
+! kinds of supported inputs. This one differs in that it takes the hlfir lowering
+! path in flag rather than the fir one. For example, temp arrays are lowered
+! differently in hlfir vs. fir and the IR that reaches the stack arrays pass looks
+! quite different.
+
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - \
+! RUN: | fir-opt --lower-hlfir-ordered-assignments \
+! RUN:           --bufferize-hlfir \
+! RUN:           --convert-hlfir-to-fir \
+! RUN:           --array-value-copy \
+! RUN:           --stack-arrays \
+! RUN: | FileCheck %s
+
+subroutine temp_array
+  implicit none
+  integer (8) :: lV
+  integer (8), dimension (2) :: iaVS
+
+  lV = 202
+
+  iaVS = [lV, lV]
+end subroutine temp_array
+! CHECK-LABEL: func.func @_QPtemp_array{{.*}} {
+! CHECK-NOT:     fir.allocmem
+! CHECK-NOT:     fir.freemem
+! CHECK:         fir.alloca !fir.array<2xi64>
+! CHECK-NOT:     fir.allocmem
+! CHECK-NOT:     fir.freemem
+! CHECK:         return
+! CHECK-NEXT:  }
+
+subroutine omp_temp_array
+  implicit none
+  integer (8) :: lV
+  integer (8), dimension (2) :: iaVS
+
+  lV = 202
+
+  !$omp target
+    iaVS = [lV, lV]
+  !$omp end target
+end subroutine omp_temp_array
+! CHECK-LABEL: func.func @_QPomp_temp_array{{.*}} {
+! CHECK:         omp.target {{.*}} {
+! CHECK-NOT:       fir.allocmem
+! CHECK-NOT:       fir.freemem
+! CHECK:           fir.alloca !fir.array<2xi64>
+! CHECK-NOT:       fir.allocmem
+! CHECK-NOT:       fir.freemem
+! CHECK:           omp.terminator
+! CHECK-NEXT:    }
+! CHECK:         return
+! CHECK-NEXT:  }
diff --git a/flang/test/Transforms/stack-arrays.fir b/flang/test/Transforms/stack-arrays.fir
index a2ffe555091eb..45c22c15f7995 100644
--- a/flang/test/Transforms/stack-arrays.fir
+++ b/flang/test/Transforms/stack-arrays.fir
@@ -339,13 +339,10 @@ func.func @omp_placement1() {
   return
 }
 // CHECK:      func.func @omp_placement1() {
+// CHECK-NEXT:   %[[MEM:.*]] = fir.alloca !fir.array<42xi32>
+// CHECK-NEXT:   %[[MEM_CONV:.*]] = fir.convert %[[MEM]] : (!fir.ref<!fir.array<42xi32>>) -> !fir.heap<!fir.array<42xi32>>
 // CHECK-NEXT:   omp.sections {
 // CHECK-NEXT:     omp.section {
-// CHECK-NEXT:       %[[MEM:.*]] = fir.allocmem !fir.array<42xi32>
-// TODO: this allocation should be moved to the stack. Unfortunately, the data
-// flow analysis fails to propogate the lattice out of the omp region to the
-// return satement.
-// CHECK-NEXT:       fir.freemem %[[MEM]] : !fir.heap<!fir.array<42xi32>>
 // CHECK-NEXT:       omp.terminator
 // CHECK-NEXT:     }
 // CHECK-NEXT:     omp.terminator
@@ -369,3 +366,37 @@ func.func @stop_terminator() {
 // CHECK-NEXT:  %[[NONE:.*]] = fir.call @_FortranAStopStatement(%[[ZERO]], %[[FALSE]], %[[FALSE]]) : (i32, i1, i1) -> none
 // CHECK-NEXT:  fir.unreachable
 // CHECK-NEXT: }
+
+
+// check that stack allocations that use fir.declare which must be placed in loops
+// use stacksave
+func.func @placement_loop_declare() {
+  %c1 = arith.constant 1 : index
+  %c1_i32 = fir.convert %c1 : (index) -> i32
+  %c2 = arith.constant 2 : index
+  %c10 = arith.constant 10 : index
+  %0:2 = fir.do_loop %arg0 = %c1 to %c10 step %c1 iter_args(%arg1 = %c1_i32) -> (index, i32) {
+    %3 = arith.addi %c1, %c2 : index
+    // operand is now available
+    %4 = fir.allocmem !fir.array<?xi32>, %3
+    %5 = fir.declare %4 {uniq_name = "temp"} : (!fir.heap<!fir.array<?xi32>>) -> !fir.heap<!fir.array<?xi32>>
+    // ...
+    fir.freemem %5 : !fir.heap<!fir.array<?xi32>>
+    fir.result %3, %c1_i32 : index, i32
+  }
+  return
+}
+// CHECK:      func.func @placement_loop_declare() {
+// CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : index
+// CHECK-NEXT:   %[[C1_I32:.*]] = fir.convert %[[C1]] : (index) -> i32
+// CHECK-NEXT:   %[[C2:.*]] = arith.constant 2 : index
+// CHECK-NEXT:   %[[C10:.*]] = arith.constant 10 : index
+// CHECK-NEXT:   fir.do_loop
+// CHECK-NEXT:     %[[SUM:.*]] = arith.addi %[[C1]], %[[C2]] : index
+// CHECK-NEXT:     %[[SP:.*]] = fir.call @llvm.stacksave.p0() : () -> !fir.ref<i8>
+// CHECK-NEXT:     %[[MEM:.*]] = fir.alloca !fir.array<?xi32>, %[[SUM]]
+// CHECK:          fir.call @llvm.stackrestore.p0(%[[SP]])
+// CHECK-NEXT:     fir.result
+// CHECK-NEXT:   }
+// CHECK-NEXT:   return
+// CHECK-NEXT: }