From bb5826b3c3468081b9a4aa39a5e91436648d2d1f Mon Sep 17 00:00:00 2001
From: Sergio Afonso <safonsof@amd.com>
Date: Thu, 21 Sep 2023 13:36:48 +0100
Subject: [PATCH] [Flang][OpenMP] Create MLIR optimization pass to push index
 allocations into loop body and remove them if redundant

This patch adds a Flang-only MLIR optimization pass that aims to remove
redundant allocations of loop index variables related to OpenMP loops and
improve LLVM IR code generation. The loop operations for which this is
implemented and tested are `omp.wsloop` and `omp.simdloop`, and the main ways
in which this is approached are to move allocations inside of the loop body
(later this avoids having to pass these variables as arguments to an outlined
function in LLVM IR) and to use block arguments representing loop indices on
the loop region directly instead, if possible.

This is done in two stages:
  1. Push allocations (`fir.alloca` and `hlfir.declare`) inside of the loop
operation's region. This is only done for allocations that are used to store
loop index variables and only used inside of a single loop region. The result
of this is that, during MLIR to LLVM IR translation, when the loop operation is
lowered by creating a function the allocation does not need to be passed as an
additional argument.
  2. Remove allocations and related load and store operations, and access the
index through the corresponding block argument. If the previous step is
successful, this can also be done if all uses of the allocation are `fir.load`
or `fir.store`, meaning that it's not passed as a reference to another
function/subprocedure.

The pass has been implemented to work with and without HLFIR support enabled,
and multiple unit tests have been updated due to this pass running by default.
---
 .../flang/Optimizer/Transforms/Passes.h       |   1 +
 .../flang/Optimizer/Transforms/Passes.td      |   9 +
 flang/include/flang/Tools/CLOptions.inc       |   1 +
 flang/lib/Optimizer/Transforms/CMakeLists.txt |   1 +
 .../Transforms/OMPLoopIndexMemToReg.cpp       | 250 ++++++++++++++++++
 flang/test/Lower/OpenMP/FIR/copyin.f90        |  53 ++--
 .../OpenMP/FIR/lastprivate-commonblock.f90    |   2 -
 .../FIR/parallel-private-clause-fixes.f90     |  39 ++-
 .../OpenMP/FIR/parallel-private-clause.f90    |  28 +-
 .../OpenMP/FIR/parallel-wsloop-firstpriv.f90  |   8 +-
 .../test/Lower/OpenMP/FIR/parallel-wsloop.f90 |  24 +-
 flang/test/Lower/OpenMP/FIR/simd.f90          |  32 +--
 .../Lower/OpenMP/FIR/stop-stmt-in-region.f90  |   2 -
 flang/test/Lower/OpenMP/FIR/target.f90        |  23 +-
 flang/test/Lower/OpenMP/FIR/unstructured.f90  |  12 +-
 flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90 |  12 +-
 .../test/Lower/OpenMP/FIR/wsloop-collapse.f90 |  12 +-
 .../Lower/OpenMP/FIR/wsloop-monotonic.f90     |   5 +-
 .../Lower/OpenMP/FIR/wsloop-nonmonotonic.f90  |   5 +-
 .../Lower/OpenMP/FIR/wsloop-reduction-add.f90 |  56 +---
 .../FIR/wsloop-reduction-logical-and.f90      |  21 +-
 .../FIR/wsloop-reduction-logical-eqv.f90      |  21 +-
 .../FIR/wsloop-reduction-logical-neqv.f90     |  21 +-
 .../FIR/wsloop-reduction-logical-or.f90       |  21 +-
 .../Lower/OpenMP/FIR/wsloop-reduction-mul.f90 |  56 +---
 flang/test/Lower/OpenMP/FIR/wsloop-simd.f90   |   4 +-
 .../test/Lower/OpenMP/FIR/wsloop-variable.f90 |   4 +-
 flang/test/Lower/OpenMP/FIR/wsloop.f90        |  15 +-
 .../Todo/omp-default-clause-inner-loop.f90    |   5 +-
 flang/test/Lower/OpenMP/hlfir-wsloop.f90      |  18 +-
 .../OpenMP/wsloop-reduction-add-hlfir.f90     |   6 +-
 flang/test/Transforms/omp-wsloop-index.mlir   | 247 +++++++++++++++++
 32 files changed, 663 insertions(+), 351 deletions(-)
 create mode 100644 flang/lib/Optimizer/Transforms/OMPLoopIndexMemToReg.cpp
 create mode 100644 flang/test/Transforms/omp-wsloop-index.mlir

diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h
index 8aeb3e373298e..0a9a3ca5bd030 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.h
+++ b/flang/include/flang/Optimizer/Transforms/Passes.h
@@ -79,6 +79,7 @@ createOMPEarlyOutliningPass();
 std::unique_ptr<mlir::Pass> createOMPFunctionFilteringPass();
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 createOMPMarkDeclareTargetPass();
+std::unique_ptr<mlir::Pass> createOMPLoopIndexMemToRegPass();
 
 // declarative passes
 #define GEN_PASS_REGISTRATION
diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index 9474edf13ce46..8304b882d525c 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -326,4 +326,13 @@ def OMPFunctionFiltering : Pass<"omp-function-filtering"> {
   ];
 }
 
+def OMPLoopIndexMemToReg : Pass<"omp-loop-index-mem2reg", "mlir::func::FuncOp"> {
+  let summary = "Pushes allocations for index variables of OpenMP loops into "
+                "the loop region and, if they are never passed by reference, "
+                "they are replaced by the corresponding entry block arguments, "
+                "removing all redundant allocations in the process.";
+  let constructor = "::fir::createOMPLoopIndexMemToRegPass()";
+  let dependentDialects = ["fir::FIROpsDialect", "mlir::omp::OpenMPDialect"];
+}
+
 #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc
index 616d9ddc066a7..0b5e8a0656804 100644
--- a/flang/include/flang/Tools/CLOptions.inc
+++ b/flang/include/flang/Tools/CLOptions.inc
@@ -270,6 +270,7 @@ inline void createOpenMPFIRPassPipeline(
     pm.addPass(fir::createOMPEarlyOutliningPass());
     pm.addPass(fir::createOMPFunctionFilteringPass());
   }
+  pm.addPass(fir::createOMPLoopIndexMemToRegPass());
 }
 
 #if !defined(FLANG_EXCLUDE_CODEGEN)
diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 3d2b7e5eaeade..306551b03ced1 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -19,6 +19,7 @@ add_flang_library(FIRTransforms
   OMPEarlyOutlining.cpp
   OMPFunctionFiltering.cpp
   OMPMarkDeclareTarget.cpp
+  OMPLoopIndexMemToReg.cpp
 
   DEPENDS
   FIRDialect
diff --git a/flang/lib/Optimizer/Transforms/OMPLoopIndexMemToReg.cpp b/flang/lib/Optimizer/Transforms/OMPLoopIndexMemToReg.cpp
new file mode 100644
index 0000000000000..af117d625154b
--- /dev/null
+++ b/flang/lib/Optimizer/Transforms/OMPLoopIndexMemToReg.cpp
@@ -0,0 +1,250 @@
+//===- OMPWsLoopIndexMem2Reg.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements transforms to push allocations into an OpenMP loop
+// operation region when they are used to store loop indices. Then, they are
+// removed together with any associated load or store operations if their
+// address is not needed, in which case uses of their values are replaced for
+// the block argument from which they were originally initialized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/HLFIR/HLFIROps.h"
+#include "flang/Optimizer/Transforms/Passes.h"
+
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/FIROps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h"
+#include "mlir/IR/BuiltinOps.h"
+#include <llvm/ADT/MapVector.h>
+#include <llvm/ADT/SmallSet.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/Casting.h>
+#include <mlir/Dialect/LLVMIR/LLVMDialect.h>
+#include <mlir/IR/Builders.h>
+#include <mlir/IR/Value.h>
+#include <mlir/IR/ValueRange.h>
+#include <mlir/Support/LLVM.h>
+
+namespace fir {
+#define GEN_PASS_DEF_OMPLOOPINDEXMEMTOREG
+#include "flang/Optimizer/Transforms/Passes.h.inc"
+} // namespace fir
+
+using namespace mlir;
+
+template <typename LoopOpTy>
+class LoopProcessorHelper {
+  LoopOpTy loop;
+
+  bool allUsesInLoop(ValueRange stores) {
+    for (Value store : stores) {
+      for (OpOperand &use : store.getUses()) {
+        Operation *owner = use.getOwner();
+        if (owner->getParentOfType<LoopOpTy>() != loop.getOperation())
+          return false;
+      }
+    }
+    return true;
+  }
+
+  /// Check whether a given hlfir.declare known to only be used inside of the
+  /// loop and initialized by a fir.alloca operation also only used inside of
+  /// the loop can be removed and replaced by the block argument representing
+  /// the corresponding loop index.
+  static bool isDeclareRemovable(hlfir::DeclareOp declareOp) {
+    fir::AllocaOp allocaOp = llvm::dyn_cast_if_present<fir::AllocaOp>(
+        declareOp.getMemref().getDefiningOp());
+
+    // Check that the hlfir.declare is initialized by a fir.alloca that is only
+    // used as argument to that operation.
+    if (!allocaOp || !allocaOp.getResult().hasOneUse())
+      return false;
+
+    // Check that uses of the pointers can be replaced by the block argument.
+    for (OpOperand &use : declareOp.getOriginalBase().getUses()) {
+      Operation *owner = use.getOwner();
+      if (!isa<fir::StoreOp>(owner))
+        return false;
+    }
+    for (OpOperand &use : declareOp.getBase().getUses()) {
+      Operation *owner = use.getOwner();
+      if (!isa<fir::LoadOp>(owner))
+        return false;
+    }
+
+    return true;
+  }
+
+  /// Check whether a given fir.alloca known to only be used inside of the loop
+  /// can be removed and replaced by the block argument representing the
+  /// corresponding loop index.
+  static bool isAllocaRemovable(fir::AllocaOp allocaOp) {
+    // Check that uses of the pointer are all fir.load and fir.store.
+    for (OpOperand &use : allocaOp.getResult().getUses()) {
+      Operation *owner = use.getOwner();
+      if (!isa<fir::LoadOp>(owner) && !isa<fir::StoreOp>(owner))
+        return false;
+    }
+
+    return true;
+  }
+
+  /// Try to push an hlfir.declare operation defined outside of the loop inside,
+  /// if all uses of that operation and the corresponding fir.alloca are
+  /// contained inside of the loop.
+  LogicalResult pushDeclareIntoLoop(hlfir::DeclareOp declareOp) {
+    // Check that all uses are inside of the loop.
+    if (!allUsesInLoop(declareOp->getResults()))
+      return failure();
+
+    // Push hlfir.declare into the beginning of the loop region.
+    Block &b = loop.getRegion().getBlocks().front();
+    declareOp->moveBefore(&b, b.begin());
+
+    // Find associated fir.alloca and push into the beginning of the loop
+    // region.
+    fir::AllocaOp allocaOp =
+        cast<fir::AllocaOp>(declareOp.getMemref().getDefiningOp());
+    Value allocaVal = allocaOp.getResult();
+
+    if (!allUsesInLoop(allocaVal))
+      return failure();
+
+    allocaOp->moveBefore(&b, b.begin());
+    return success();
+  }
+
+  /// Try to push a fir.alloca operation defined outside of the loop inside,
+  /// if all uses of that operation are contained inside of the loop.
+  LogicalResult pushAllocaIntoLoop(fir::AllocaOp allocaOp) {
+    Value store = allocaOp.getResult();
+
+    // Check that all uses are inside of the loop.
+    if (!allUsesInLoop(store))
+      return failure();
+
+    // Push fir.alloca into the beginning of the loop region.
+    Block &b = loop.getRegion().getBlocks().front();
+    allocaOp->moveBefore(&b, b.begin());
+    return success();
+  }
+
+  void processLoopArg(BlockArgument arg, llvm::ArrayRef<Value> argStores,
+                      SmallPtrSetImpl<Operation *> &opsToDelete) {
+    llvm::SmallPtrSet<Operation *, 16> toDelete;
+    for (Value store : argStores) {
+      Operation *op = store.getDefiningOp();
+
+      // Skip argument if storage not defined by an operation.
+      if (!op)
+        return;
+
+      // Support HLFIR flow as well as regular FIR flow.
+      if (auto declareOp = dyn_cast<hlfir::DeclareOp>(op)) {
+        if (succeeded(pushDeclareIntoLoop(declareOp)) &&
+            isDeclareRemovable(declareOp)) {
+          // Mark hlfir.declare, fir.alloca and related uses for deletion.
+          for (OpOperand &use : declareOp.getOriginalBase().getUses())
+            toDelete.insert(use.getOwner());
+
+          for (OpOperand &use : declareOp.getBase().getUses())
+            toDelete.insert(use.getOwner());
+
+          Operation *allocaOp = declareOp.getMemref().getDefiningOp();
+          toDelete.insert(declareOp);
+          toDelete.insert(allocaOp);
+        }
+      } else if (auto allocaOp = dyn_cast<fir::AllocaOp>(op)) {
+        if (succeeded(pushAllocaIntoLoop(allocaOp)) &&
+            isAllocaRemovable(allocaOp)) {
+          // Do not make any further modifications if an address to the index
+          // is necessary. Otherwise, the values can be used directly from the
+          // loop region first block's arguments.
+
+          // Mark fir.alloca and related uses for deletion.
+          for (OpOperand &use : allocaOp.getResult().getUses())
+            toDelete.insert(use.getOwner());
+
+          // Delete now-unused fir.alloca.
+          toDelete.insert(allocaOp);
+        }
+      } else {
+        return;
+      }
+    }
+
+    // Only consider marked operations if all load, store and allocation
+    // operations associated with the given loop index can be removed.
+    opsToDelete.insert(toDelete.begin(), toDelete.end());
+
+    for (Operation *op : toDelete) {
+      // Replace all fir.load operations with the index as returned by the
+      // OpenMP loop operation.
+      if (isa<fir::LoadOp>(op))
+        op->replaceAllUsesWith(ValueRange(arg));
+      // Drop all uses of fir.alloca and hlfir.declare because their defining
+      // operations will be deleted as well.
+      else if (isa<fir::AllocaOp>(op) || isa<hlfir::DeclareOp>(op))
+        op->dropAllUses();
+    }
+  }
+
+public:
+  explicit LoopProcessorHelper(LoopOpTy loop) : loop(loop) {}
+
+  void process() {
+    llvm::SmallPtrSet<Operation *, 16> opsToDelete;
+    llvm::SmallVector<llvm::SmallVector<Value>> storeAddresses;
+    llvm::ArrayRef<BlockArgument> loopArgs = loop.getRegion().getArguments();
+
+    // Collect arguments of the loop operation.
+    for (BlockArgument arg : loopArgs) {
+      // Find fir.store uses of these indices and gather all addresses where
+      // they are stored.
+      llvm::SmallVector<Value> &argStores = storeAddresses.emplace_back();
+      for (OpOperand &argUse : arg.getUses())
+        if (auto storeOp = dyn_cast<fir::StoreOp>(argUse.getOwner()))
+          argStores.push_back(storeOp.getMemref());
+    }
+
+    // Process all loop indices and mark them for deletion independently of each
+    // other.
+    for (auto it : llvm::zip(loopArgs, storeAddresses))
+      processLoopArg(std::get<0>(it), std::get<1>(it), opsToDelete);
+
+    // Delete marked operations.
+    for (Operation *op : opsToDelete)
+      op->erase();
+  }
+};
+
+namespace {
+class OMPLoopIndexMemToRegPass
+    : public fir::impl::OMPLoopIndexMemToRegBase<OMPLoopIndexMemToRegPass> {
+public:
+  void runOnOperation() override {
+    func::FuncOp func = getOperation();
+
+    func->walk(
+        [&](omp::WsLoopOp loop) { LoopProcessorHelper(loop).process(); });
+
+    func.walk(
+        [&](omp::SimdLoopOp loop) { LoopProcessorHelper(loop).process(); });
+
+    func.walk(
+        [&](omp::TaskLoopOp loop) { LoopProcessorHelper(loop).process(); });
+  }
+};
+} // namespace
+
+std::unique_ptr<Pass> fir::createOMPLoopIndexMemToRegPass() {
+  return std::make_unique<OMPLoopIndexMemToRegPass>();
+}
diff --git a/flang/test/Lower/OpenMP/FIR/copyin.f90 b/flang/test/Lower/OpenMP/FIR/copyin.f90
index ddfa0ea091462..3443b310074f5 100644
--- a/flang/test/Lower/OpenMP/FIR/copyin.f90
+++ b/flang/test/Lower/OpenMP/FIR/copyin.f90
@@ -138,17 +138,15 @@ subroutine copyin_derived_type()
 ! CHECK:         %[[VAL_1:.*]] = fir.address_of(@_QFcombined_parallel_worksharing_loopEx6) : !fir.ref<i32>
 ! CHECK:         %[[VAL_2:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
 ! CHECK:         omp.parallel   {
-! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-! CHECK:           %[[VAL_4:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
-! CHECK:           %[[VAL_5:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-! CHECK:           fir.store %[[VAL_5]] to %[[VAL_4]] : !fir.ref<i32>
+! CHECK:           %[[VAL_3:.*]] = omp.threadprivate %[[VAL_1]] : !fir.ref<i32> -> !fir.ref<i32>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_3]] : !fir.ref<i32>
 ! CHECK:           omp.barrier
-! CHECK:           %[[VAL_6:.*]] = arith.constant 1 : i32
-! CHECK:           %[[VAL_7:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i32
-! CHECK:           omp.wsloop   for  (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
-! CHECK:             fir.store %[[VAL_9]] to %[[VAL_3]] : !fir.ref<i32>
-! CHECK:             fir.call @_QPsub4(%[[VAL_4]]) {{.*}}: (!fir.ref<i32>) -> ()
+! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
+! CHECK:           %[[VAL_6:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:           omp.wsloop   for  (%[[VAL_9:.*]]) : i32 = (%[[VAL_5]]) to (%[[VAL_6]]) inclusive step (%[[VAL_7]]) {
+! CHECK:             fir.call @_QPsub4(%[[VAL_3]]) {{.*}}: (!fir.ref<i32>) -> ()
 ! CHECK:             omp.yield
 ! CHECK:           }
 ! CHECK:           omp.terminator
@@ -269,30 +267,27 @@ subroutine common_1()
 !CHECK: %[[val_7:.*]] = fir.coordinate_of %[[val_6]], %[[val_c4]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
 !CHECK: %[[val_8:.*]] = fir.convert %[[val_7]] : (!fir.ref<i8>) -> !fir.ref<i32>
 !CHECK: omp.parallel {
-!CHECK: %[[val_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK: %[[val_10:.*]] = omp.threadprivate %[[val_1]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
-!CHECK: %[[val_11:.*]] = fir.convert %[[val_10]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
+!CHECK: %[[val_9:.*]] = omp.threadprivate %[[val_1]] : !fir.ref<!fir.array<8xi8>> -> !fir.ref<!fir.array<8xi8>>
+!CHECK: %[[val_10:.*]] = fir.convert %[[val_9]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
 !CHECK: %[[val_c0_0:.*]] = arith.constant 0 : index
-!CHECK: %[[val_12:.*]] = fir.coordinate_of %[[val_11]], %[[val_c0_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_13:.*]] = fir.convert %[[val_12]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_14:.*]] = fir.convert %[[val_10]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
+!CHECK: %[[val_11:.*]] = fir.coordinate_of %[[val_10]], %[[val_c0_0]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
+!CHECK: %[[val_12:.*]] = fir.convert %[[val_11]] : (!fir.ref<i8>) -> !fir.ref<i32>
+!CHECK: %[[val_13:.*]] = fir.convert %[[val_9]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
 !CHECK: %[[val_c4_1:.*]] = arith.constant 4 : index
-!CHECK: %[[val_15:.*]] = fir.coordinate_of %[[val_14]], %[[val_c4_1]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
-!CHECK: %[[val_16:.*]] = fir.convert %[[val_15]] : (!fir.ref<i8>) -> !fir.ref<i32>
-!CHECK: %[[val_17:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_17]] to %[[val_13]] : !fir.ref<i32>
-!CHECK: %[[val_18:.*]] = fir.load %[[val_8]] : !fir.ref<i32>
-!CHECK: fir.store %[[val_18]] to %[[val_16]] : !fir.ref<i32>
+!CHECK: %[[val_14:.*]] = fir.coordinate_of %[[val_13]], %[[val_c4_1]] : (!fir.ref<!fir.array<?xi8>>, index) -> !fir.ref<i8>
+!CHECK: %[[val_15:.*]] = fir.convert %[[val_14]] : (!fir.ref<i8>) -> !fir.ref<i32>
+!CHECK: %[[val_16:.*]] = fir.load %[[val_5]] : !fir.ref<i32>
+!CHECK: fir.store %[[val_16]] to %[[val_12]] : !fir.ref<i32>
+!CHECK: %[[val_17:.*]] = fir.load %[[val_8]] : !fir.ref<i32>
+!CHECK: fir.store %[[val_17]] to %[[val_15]] : !fir.ref<i32>
 !CHECK: omp.barrier
 !CHECK: %[[val_c1_i32:.*]] = arith.constant 1 : i32
-!CHECK: %[[val_19:.*]] = fir.load %[[val_13]] : !fir.ref<i32>
+!CHECK: %[[val_18:.*]] = fir.load %[[val_12]] : !fir.ref<i32>
 !CHECK: %[[val_c1_i32_2:.*]] = arith.constant 1 : i32
-!CHECK: omp.wsloop   for (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_19]]) inclusive step (%[[val_c1_i32_2]]) {
-!CHECK: fir.store %[[arg]] to %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_20:.*]] = fir.load %[[val_16]] : !fir.ref<i32>
-!CHECK: %[[val_21:.*]] = fir.load %[[val_9]] : !fir.ref<i32>
-!CHECK: %[[val_22:.*]] = arith.addi %[[val_20]], %[[val_21]] : i32
-!CHECK: fir.store %[[val_22]] to %[[val_16]] : !fir.ref<i32>
+!CHECK: omp.wsloop   for (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_18]]) inclusive step (%[[val_c1_i32_2]]) {
+!CHECK: %[[val_19:.*]] = fir.load %[[val_15]] : !fir.ref<i32>
+!CHECK: %[[val_20:.*]] = arith.addi %[[val_19]], %[[arg]] : i32
+!CHECK: fir.store %[[val_20]] to %[[val_15]] : !fir.ref<i32>
 !CHECK: omp.yield
 !CHECK: }
 !CHECK: omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90 b/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
index 06f3e1ca82234..bba9dbc4fc4cb 100644
--- a/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
+++ b/flang/test/Lower/OpenMP/FIR/lastprivate-commonblock.f90
@@ -1,7 +1,6 @@
 ! RUN: %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s 
 
 !CHECK: func.func @_QPlastprivate_common() {
-!CHECK: %[[val_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK: %[[val_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFlastprivate_commonEi"}
 !CHECK: %[[val_2:.*]] = fir.address_of(@c_) : !fir.ref<!fir.array<8xi8>>
 !CHECK: %[[val_3:.*]] = fir.convert %[[val_2]] : (!fir.ref<!fir.array<8xi8>>) -> !fir.ref<!fir.array<?xi8>>
@@ -18,7 +17,6 @@
 !CHECK: %[[val_c100_i32:.*]] = arith.constant 100 : i32
 !CHECK: %[[val_c1_i32_0:.*]] = arith.constant 1 : i32
 !CHECK: omp.wsloop   for (%[[arg:.*]]) : i32 = (%[[val_c1_i32]]) to (%[[val_c100_i32]]) inclusive step (%[[val_c1_i32_0]]) {
-!CHECK: fir.store %[[arg]] to %[[val_0]] : !fir.ref<i32>
 !CHECK: %[[val_11:.*]] = arith.cmpi eq, %[[arg]], %[[val_c100_i32]] : i32
 !CHECK: fir.if %[[val_11]] {
 !CHECK: %[[val_12:.*]] = fir.load %[[val_9]] : !fir.ref<f32>
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
index 3152f9c44d0c6..8cf216361bcb6 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-private-clause-fixes.f90
@@ -8,34 +8,31 @@
 ! CHECK:         %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_private_fixEx"}
 ! CHECK:         omp.parallel {
 ! CHECK:           %[[PRIV_J:.*]] = fir.alloca i32 {bindc_name = "j", pinned
-! CHECK:           %[[PRIV_I:.*]] = fir.alloca i32 {adapt.valuebyref, pinned
 ! CHECK:           %[[PRIV_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned
 ! CHECK:           %[[ONE:.*]] = arith.constant 1 : i32
 ! CHECK:           %[[VAL_3:.*]] = fir.load %[[VAL_4:.*]] : !fir.ref<i32>
 ! CHECK:           %[[VAL_5:.*]] = arith.constant 1 : i32
-! CHECK:           omp.wsloop for (%[[VAL_6:.*]]) : i32 = (%[[ONE]]) to (%[[VAL_3]]) inclusive step (%[[VAL_5]]) {
-! CHECK:             fir.store %[[VAL_6]] to %[[PRIV_I]] : !fir.ref<i32>
-! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
-! CHECK:             %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index
-! CHECK:             %[[VAL_9:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
-! CHECK:             %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index
-! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : index
-! CHECK:             %[[LB:.*]] = fir.convert %[[VAL_8]] : (index) -> i32
-! CHECK:             %[[VAL_12:.*]]:2 = fir.do_loop %[[VAL_13:[^ ]*]] =
-! CHECK-SAME:            %[[VAL_8]] to %[[VAL_10]] step %[[VAL_11]]
-! CHECK-SAME:            iter_args(%[[IV:.*]] = %[[LB]]) -> (index, i32) {
-! CHECK:               fir.store %[[IV]] to %[[PRIV_J]] : !fir.ref<i32>
-! CHECK:               %[[LOAD:.*]] = fir.load %[[PRIV_I]] : !fir.ref<i32>
-! CHECK:               %[[VAL_15:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
-! CHECK:               %[[VAL_16:.*]] = arith.addi %[[LOAD]], %[[VAL_15]] : i32
-! CHECK:               fir.store %[[VAL_16]] to %[[PRIV_X]] : !fir.ref<i32>
-! CHECK:               %[[VAL_17:.*]] = arith.addi %[[VAL_13]], %[[VAL_11]] : index
-! CHECK:               %[[STEPCAST:.*]] = fir.convert %[[VAL_11]] : (index) -> i32
+! CHECK:           omp.wsloop for (%[[IV_I:.*]]) : i32 = (%[[ONE]]) to (%[[VAL_3]]) inclusive step (%[[VAL_5]]) {
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> index
+! CHECK:             %[[VAL_8:.*]] = fir.load %[[VAL_4]] : !fir.ref<i32>
+! CHECK:             %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> index
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : index
+! CHECK:             %[[LB:.*]] = fir.convert %[[VAL_7]] : (index) -> i32
+! CHECK:             %[[VAL_11:.*]]:2 = fir.do_loop %[[VAL_12:[^ ]*]] =
+! CHECK-SAME:            %[[VAL_7]] to %[[VAL_9]] step %[[VAL_10]]
+! CHECK-SAME:            iter_args(%[[IV_J:.*]] = %[[LB]]) -> (index, i32) {
+! CHECK:               fir.store %[[IV_J]] to %[[PRIV_J]] : !fir.ref<i32>
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = arith.addi %[[IV_I]], %[[VAL_13]] : i32
+! CHECK:               fir.store %[[VAL_14]] to %[[PRIV_X]] : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_12]], %[[VAL_10]] : index
+! CHECK:               %[[STEPCAST:.*]] = fir.convert %[[VAL_10]] : (index) -> i32
 ! CHECK:               %[[IVLOAD:.*]] = fir.load %[[PRIV_J]] : !fir.ref<i32>
 ! CHECK:               %[[IVINC:.*]] = arith.addi %[[IVLOAD]], %[[STEPCAST]]
-! CHECK:               fir.result %[[VAL_17]], %[[IVINC]] : index, i32
+! CHECK:               fir.result %[[VAL_15]], %[[IVINC]] : index, i32
 ! CHECK:             }
-! CHECK:             fir.store %[[VAL_12]]#1 to %[[PRIV_J]] : !fir.ref<i32>
+! CHECK:             fir.store %[[VAL_11]]#1 to %[[PRIV_J]] : !fir.ref<i32>
 ! CHECK:             omp.yield
 ! CHECK:           }
 ! CHECK:           omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
index e9d9218702cc5..f341d0ccda423 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-private-clause.f90
@@ -249,8 +249,6 @@ subroutine simple_loop_1
   real, allocatable :: r;
   ! FIRDialect:  omp.parallel
   !$OMP PARALLEL PRIVATE(r)
-  ! FIRDialect:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
   ! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
   ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
@@ -262,9 +260,7 @@ subroutine simple_loop_1
   ! FIRDialect:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP DO
   do i=1, 9
-  ! FIRDialect:     fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! FIRDialect:     %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! FIRDialect:     omp.yield
@@ -285,8 +281,6 @@ subroutine simple_loop_2
   real, allocatable :: r;
   ! FIRDialect:  omp.parallel
   !$OMP PARALLEL
-  ! FIRDialect:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
   ! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
   ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
@@ -298,9 +292,7 @@ subroutine simple_loop_2
   ! FIRDialect:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP DO PRIVATE(r)
   do i=1, 9
-  ! FIRDialect:     fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! FIRDialect:     %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! FIRDialect:     omp.yield
@@ -320,8 +312,6 @@ subroutine simple_loop_3
   integer :: i
   real, allocatable :: r;
   ! FIRDialect:  omp.parallel
-  ! FIRDialect:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-
   ! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
   ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
   ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
@@ -333,9 +323,7 @@ subroutine simple_loop_3
   ! FIRDialect:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP PARALLEL DO PRIVATE(r)
   do i=1, 9
-  ! FIRDialect:     fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! FIRDialect:     %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! FIRDialect:     omp.yield
@@ -353,9 +341,9 @@ subroutine simple_loop_3
 subroutine simd_loop_1
   integer :: i
   real, allocatable :: r;
-  ! IRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
-  ! IRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
-  ! IRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     [[R:%.*]] = fir.alloca !fir.box<!fir.heap<f32>> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
+  ! FIRDialect:     fir.store {{%.*}} to [[R]] : !fir.ref<!fir.box<!fir.heap<f32>>>
 
   ! FIRDialect:     %[[LB:.*]] = arith.constant 1 : i32
   ! FIRDialect:     %[[UB:.*]] = arith.constant 9 : i32
@@ -364,9 +352,7 @@ subroutine simd_loop_1
   ! FIRDialect: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   !$OMP SIMD PRIVATE(r)
   do i=1, 9
-  ! FIRDialect:     fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-  ! FIRDialect:     %[[LOAD_IV:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! FIRDialect:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90 b/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
index ac63c45677ffe..bcc08e59560ab 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-wsloop-firstpriv.f90
@@ -10,7 +10,6 @@ subroutine omp_do_firstprivate(a)
   n = a+1
   !$omp parallel do firstprivate(a)
   ! CHECK:  omp.parallel {
-  ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
   ! CHECK-NEXT: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "a", pinned
   ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
   ! CHECK-NEXT: fir.store %[[LD]] to %[[CLONE]] : !fir.ref<i32>
@@ -18,6 +17,7 @@ subroutine omp_do_firstprivate(a)
   ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
   ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: omp.wsloop   for  (%[[ARG1:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
+  ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
   ! CHECK-NEXT: fir.store %[[ARG1]] to %[[REF]] : !fir.ref<i32>
   ! CHECK-NEXT: fir.call @_QPfoo(%[[REF]], %[[CLONE]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
   ! CHECK-NEXT: omp.yield
@@ -36,19 +36,19 @@ subroutine omp_do_firstprivate2(a, n)
   n = a+1
   !$omp parallel do firstprivate(a, n)
   ! CHECK:  omp.parallel {
-  ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
   ! CHECK-NEXT: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "a", pinned
   ! CHECK-NEXT: %[[LD:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
   ! CHECK-NEXT: fir.store %[[LD]] to %[[CLONE]] : !fir.ref<i32>
   ! CHECK-NEXT: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "n", pinned
   ! CHECK-NEXT: %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
   ! CHECK-NEXT: fir.store %[[LD1]] to %[[CLONE1]] : !fir.ref<i32>
-
-
+  
+  
   ! CHECK: %[[LB:.*]] = fir.load %[[CLONE]] : !fir.ref<i32>
   ! CHECK-NEXT: %[[UB:.*]] = fir.load %[[CLONE1]] : !fir.ref<i32>
   ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: omp.wsloop   for  (%[[ARG2:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]])
+  ! CHECK-NEXT: %[[REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
   ! CHECK-NEXT: fir.store %[[ARG2]] to %[[REF]] : !fir.ref<i32>
   ! CHECK-NEXT: fir.call @_QPfoo(%[[REF]], %[[CLONE]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
   ! CHECK-NEXT: omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
index c302b91be8e67..bc44a12c53f4d 100644
--- a/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
+++ b/flang/test/Lower/OpenMP/FIR/parallel-wsloop.f90
@@ -12,9 +12,7 @@ subroutine simple_parallel_do
   ! CHECK:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP PARALLEL DO
   do i=1, 9
-  ! CHECK:    fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:    %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! CHECK:       omp.yield
@@ -38,9 +36,7 @@ subroutine parallel_do_with_parallel_clauses(cond, nt)
   ! CHECK:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP PARALLEL DO IF(cond) NUM_THREADS(nt) PROC_BIND(close)
   do i=1, 9
-  ! CHECK:    fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:    %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! CHECK:       omp.yield
@@ -61,9 +57,7 @@ subroutine parallel_do_with_clauses(nt)
   ! CHECK:     omp.wsloop schedule(dynamic) for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP PARALLEL DO NUM_THREADS(nt) SCHEDULE(dynamic)
   do i=1, 9
-  ! CHECK:    fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:    %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! CHECK:       omp.yield
@@ -93,9 +87,7 @@ subroutine parallel_do_with_privatisation_clauses(cond,nt)
   ! CHECK:    omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP PARALLEL DO PRIVATE(cond) FIRSTPRIVATE(nt)
   do i=1, 9
-  ! CHECK:    fir.store %[[I]] to %[[IV_ADDR:.*]] : !fir.ref<i32>
-  ! CHECK:    %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]] : !fir.ref<i32>
-  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:      fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
   ! CHECK:      %[[PRIVATE_COND_VAL:.*]] = fir.load %[[PRIVATE_COND_REF]] : !fir.ref<!fir.logical<4>>
   ! CHECK:      %[[PRIVATE_COND_VAL_CVT:.*]] = fir.convert %[[PRIVATE_COND_VAL]] : (!fir.logical<4>) -> i1
   ! CHECK:      fir.call @_FortranAioOutputLogical({{.*}}, %[[PRIVATE_COND_VAL_CVT]]) {{.*}}: (!fir.ref<i8>, i1) -> i1
@@ -132,7 +124,6 @@ end subroutine parallel_private_do
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
 ! CHECK:           %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_private_doEi"}
 ! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_private_doEcond"}
 ! CHECK:             %[[NT_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_private_doEnt"}
 ! CHECK:             %[[NT:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
@@ -141,6 +132,7 @@ end subroutine parallel_private_do
 ! CHECK:             %[[VAL_8:.*]] = arith.constant 9 : i32
 ! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
 ! CHECK:             omp.wsloop   for  (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               %[[I_PRIV:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:               fir.store %[[I]] to %[[I_PRIV]] : !fir.ref<i32>
 ! CHECK:               fir.call @_QPfoo(%[[I_PRIV]], %[[COND_ADDR]], %[[NT_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.logical<4>>, !fir.ref<i32>) -> ()
 ! CHECK:               omp.yield
@@ -172,7 +164,6 @@ end subroutine omp_parallel_multiple_firstprivate_do
 ! CHECK-SAME:                                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
 ! CHECK:           %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"}
 ! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"}
 ! CHECK:             %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
 ! CHECK:             fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
@@ -183,6 +174,7 @@ end subroutine omp_parallel_multiple_firstprivate_do
 ! CHECK:             %[[VAL_9:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
 ! CHECK:             omp.wsloop   for  (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:               fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
 ! CHECK:               fir.call @_QPbar(%[[I_PRIV_ADDR]], %[[A_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
 ! CHECK:               omp.yield
@@ -216,7 +208,6 @@ end subroutine parallel_do_private
 ! CHECK-SAME:                                      %[[VAL_1:.*]]: !fir.ref<i32> {fir.bindc_name = "nt"}) {
 ! CHECK:           %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_do_privateEi"}
 ! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_privateEcond"}
 ! CHECK:             %[[NT_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"}
 ! CHECK:             %[[NT:.*]] = fir.load %[[VAL_1]] : !fir.ref<i32>
@@ -225,6 +216,7 @@ end subroutine parallel_do_private
 ! CHECK:             %[[VAL_8:.*]] = arith.constant 9 : i32
 ! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
 ! CHECK:             omp.wsloop   for  (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:               fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
 ! CHECK:               fir.call @_QPfoo(%[[I_PRIV_ADDR]], %[[COND_ADDR]], %[[NT_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<!fir.logical<4>>, !fir.ref<i32>) -> ()
 ! CHECK:               omp.yield
@@ -256,7 +248,6 @@ end subroutine omp_parallel_do_multiple_firstprivate
 ! CHECK-SAME:                                                        %[[B_ADDR:.*]]: !fir.ref<i32> {fir.bindc_name = "b"}) {
 ! CHECK:           %[[I_ADDR:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"}
 ! CHECK:           omp.parallel   {
-! CHECK:             %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:             %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"}
 ! CHECK:             %[[A:.*]] = fir.load %[[A_ADDR]] : !fir.ref<i32>
 ! CHECK:             fir.store %[[A]] to %[[A_PRIV_ADDR]] : !fir.ref<i32>
@@ -267,6 +258,7 @@ end subroutine omp_parallel_do_multiple_firstprivate
 ! CHECK:             %[[VAL_9:.*]] = arith.constant 10 : i32
 ! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
 ! CHECK:             omp.wsloop   for  (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:               fir.store %[[I]] to %[[I_PRIV_ADDR]] : !fir.ref<i32>
 ! CHECK:               fir.call @_QPbar(%[[I_PRIV_ADDR]], %[[A_PRIV_ADDR]]) {{.*}}: (!fir.ref<i32>, !fir.ref<i32>) -> ()
 ! CHECK:               omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/simd.f90 b/flang/test/Lower/OpenMP/FIR/simd.f90
index 47596cccdbc12..c0c8d7f32838a 100644
--- a/flang/test/Lower/OpenMP/FIR/simd.f90
+++ b/flang/test/Lower/OpenMP/FIR/simd.f90
@@ -11,9 +11,7 @@ subroutine simdloop
   ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK-NEXT: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
   do i=1, 9
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD 
@@ -29,9 +27,7 @@ subroutine simdloop_with_if_clause(n, threshold)
   ! CHECK: %[[COND:.*]] = arith.cmpi sge
   ! CHECK: omp.simdloop if(%[[COND:.*]]) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
@@ -46,9 +42,7 @@ subroutine simdloop_with_simdlen_clause(n, threshold)
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
@@ -64,9 +58,7 @@ subroutine simdloop_with_simdlen_clause_from_param(n, threshold)
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK: omp.simdloop simdlen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
@@ -82,9 +74,7 @@ subroutine simdloop_with_simdlen_clause_from_expr_from_param(n, threshold)
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK: omp.simdloop simdlen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
@@ -99,9 +89,7 @@ subroutine simdloop_with_safelen_clause(n, threshold)
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK: omp.simdloop safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
@@ -117,9 +105,7 @@ subroutine simdloop_with_safelen_clause_from_expr_from_param(n, threshold)
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK: omp.simdloop safelen(6) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
@@ -134,9 +120,7 @@ subroutine simdloop_with_simdlen_safelen_clause(n, threshold)
   ! CHECK: %[[STEP:.*]] = arith.constant 1 : i32
   ! CHECK: omp.simdloop simdlen(1) safelen(2) for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive  step (%[[STEP]]) {
   do i = 1, n
-    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref<i32>
-    ! CHECK: %[[LD:.*]] = fir.load %[[LOCAL]] : !fir.ref<i32>
-    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   !$OMP END SIMD
diff --git a/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90 b/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
index 2f73fb31966ec..ab906e1dfba5b 100644
--- a/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
+++ b/flang/test/Lower/OpenMP/FIR/stop-stmt-in-region.f90
@@ -71,14 +71,12 @@ subroutine test_stop_in_region3()
 end
 
 ! CHECK-LABEL: func.func @_QPtest_stop_in_region4() {
-! CHECK:         %[[VAL_0:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK:         %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFtest_stop_in_region4Ei"}
 ! CHECK:         %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFtest_stop_in_region4Ex"}
 ! CHECK:         %[[VAL_3:.*]] = arith.constant 1 : i32
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 10 : i32
 ! CHECK:         %[[VAL_5:.*]] = arith.constant 1 : i32
 ! CHECK:         omp.wsloop   for  (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) {
-! CHECK:           fir.store %[[VAL_6]] to %[[VAL_0]] : !fir.ref<i32>
 ! CHECK:           cf.br ^bb1
 ! CHECK:         ^bb1:
 ! CHECK:           %[[VAL_7:.*]] = arith.constant 3 : i32
diff --git a/flang/test/Lower/OpenMP/FIR/target.f90 b/flang/test/Lower/OpenMP/FIR/target.f90
index 9b1fb5c15ac1d..90e4e8a058297 100644
--- a/flang/test/Lower/OpenMP/FIR/target.f90
+++ b/flang/test/Lower/OpenMP/FIR/target.f90
@@ -278,19 +278,16 @@ subroutine omp_target_parallel_do
    !CHECK: omp.target   map_entries(%[[MAP]] : !fir.ref<!fir.array<1024xi32>>) {
       !CHECK-NEXT: omp.parallel
       !$omp target parallel do map(tofrom: a)
-         !CHECK: %[[VAL_2:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-         !CHECK: %[[VAL_3:.*]] = arith.constant 1 : i32
-         !CHECK: %[[VAL_4:.*]] = arith.constant 1024 : i32
-         !CHECK: %[[VAL_5:.*]] = arith.constant 1 : i32
-         !CHECK: omp.wsloop   for  (%[[VAL_6:.*]]) : i32 = (%[[VAL_3]]) to (%[[VAL_4]]) inclusive step (%[[VAL_5]]) {
-         !CHECK: fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<i32>
-         !CHECK: %[[VAL_7:.*]] = arith.constant 10 : i32
-         !CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_2]] : !fir.ref<i32>
-         !CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> i64
-         !CHECK: %[[VAL_10:.*]] = arith.constant 1 : i64
-         !CHECK: %[[VAL_11:.*]] = arith.subi %[[VAL_9]], %[[VAL_10]] : i64
-         !CHECK: %[[VAL_12:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_11]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
-         !CHECK: fir.store %[[VAL_7]] to %[[VAL_12]] : !fir.ref<i32>
+         !CHECK: %[[VAL_2:.*]] = arith.constant 1 : i32
+         !CHECK: %[[VAL_3:.*]] = arith.constant 1024 : i32
+         !CHECK: %[[VAL_4:.*]] = arith.constant 1 : i32
+         !CHECK: omp.wsloop   for  (%[[ARG:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) {
+         !CHECK: %[[VAL_5:.*]] = arith.constant 10 : i32
+         !CHECK: %[[VAL_6:.*]] = fir.convert %[[ARG]] : (i32) -> i64
+         !CHECK: %[[VAL_7:.*]] = arith.constant 1 : i64
+         !CHECK: %[[VAL_8:.*]] = arith.subi %[[VAL_6]], %[[VAL_7]] : i64
+         !CHECK: %[[VAL_9:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_8]] : (!fir.ref<!fir.array<1024xi32>>, i64) -> !fir.ref<i32>
+         !CHECK: fir.store %[[VAL_5]] to %[[VAL_9]] : !fir.ref<i32>
          do i = 1, 1024
             a(i) = 10
          end do
diff --git a/flang/test/Lower/OpenMP/FIR/unstructured.f90 b/flang/test/Lower/OpenMP/FIR/unstructured.f90
index e7d48bb269349..a6b572eaafecf 100644
--- a/flang/test/Lower/OpenMP/FIR/unstructured.f90
+++ b/flang/test/Lower/OpenMP/FIR/unstructured.f90
@@ -61,21 +61,16 @@ subroutine ss2(n) ! unstructured OpenMP construct; loop exit inside construct
 ! CHECK-LABEL: func @_QPss3{{.*}} {
 ! CHECK:   omp.parallel {
 ! CHECK:     %[[ALLOCA_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned}
-! CHECK:     %[[ALLOCA_1:.*]] = fir.alloca i32 {{{.*}}, pinned}
-! CHECK:     %[[ALLOCA_2:.*]] = fir.alloca i32 {{{.*}}, pinned}
 ! CHECK:     br ^bb1
 ! CHECK:   ^bb1:  // 2 preds: ^bb0, ^bb3
 ! CHECK:     cond_br %{{[0-9]*}}, ^bb2, ^bb4
 ! CHECK:   ^bb2:  // pred: ^bb1
 ! CHECK:     omp.wsloop for (%[[ARG1:.*]]) : {{.*}} {
-! CHECK:       fir.store %[[ARG1]] to %[[ALLOCA_2]] : !fir.ref<i32>
 ! CHECK:     @_FortranAioBeginExternalListOutput
-! CHECK:       %[[LOAD_1:.*]] = fir.load %[[ALLOCA_2]] : !fir.ref<i32>
-! CHECK:     @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD_1]])
+! CHECK:     @_FortranAioOutputInteger32(%{{.*}}, %[[ARG1]])
 ! CHECK:       omp.yield
 ! CHECK:     }
 ! CHECK:     omp.wsloop for (%[[ARG2:.*]]) : {{.*}} {
-! CHECK:       fir.store %[[ARG2]] to %[[ALLOCA_1]] : !fir.ref<i32>
 ! CHECK:       br ^bb1
 ! CHECK:     ^bb2:  // 2 preds: ^bb1, ^bb5
 ! CHECK:       cond_br %{{[0-9]*}}, ^bb3, ^bb6
@@ -116,15 +111,12 @@ subroutine ss3(n) ! nested unstructured OpenMP constructs
 
 ! CHECK-LABEL: func @_QPss4{{.*}} {
 ! CHECK:       omp.parallel {
-! CHECK:         %[[ALLOCA:.*]] = fir.alloca i32 {{{.*}}, pinned}
 ! CHECK:         omp.wsloop for (%[[ARG:.*]]) : {{.*}} {
-! CHECK:           fir.store %[[ARG]] to %[[ALLOCA]] : !fir.ref<i32>
 ! CHECK:           %[[COND:.*]] = arith.cmpi eq, %{{.*}}, %{{.*}}
 ! CHECK:           %[[COND_XOR:.*]] = arith.xori %[[COND]], %{{.*}}
 ! CHECK:          fir.if %[[COND_XOR]] {
 ! CHECK:           @_FortranAioBeginExternalListOutput
-! CHECK:           %[[LOAD:.*]] = fir.load %[[ALLOCA]] : !fir.ref<i32>
-! CHECK:           @_FortranAioOutputInteger32(%{{.*}}, %[[LOAD]])
+! CHECK:           @_FortranAioOutputInteger32(%{{.*}}, %[[ARG]])
 ! CHECK:          } else {
 ! CHECK:          }
 ! CHECK-NEXT:      omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
index 99b0cf0f1298e..f1bd5459a2b61 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-chunks.f90
@@ -20,9 +20,7 @@ program wsloop
 ! CHECK:         %[[VAL_4:.*]] = arith.constant 1 : i32
 ! CHECK:         %[[VAL_5:.*]] = arith.constant 4 : i32
 ! CHECK:         omp.wsloop   schedule(static = %[[VAL_5]] : i32) nowait for  (%[[ARG0:.*]]) : i32 = (%[[VAL_2]]) to (%[[VAL_3]]) inclusive step (%[[VAL_4]]) {
-! CHECK:           fir.store %[[ARG0]] to %[[STORE_IV:.*]] : !fir.ref<i32>
-! CHECK:           %[[LOAD_IV:.*]] = fir.load %[[STORE_IV]] : !fir.ref<i32>
-! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[ARG0]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
 ! CHECK:           omp.yield
 ! CHECK:         }
 
@@ -38,10 +36,8 @@ program wsloop
 ! CHECK:         %[[VAL_16:.*]] = arith.constant 1 : i32
 ! CHECK:         %[[VAL_17:.*]] = arith.constant 4 : i32
 ! CHECK:         omp.wsloop   schedule(static = %[[VAL_17]] : i32) nowait for  (%[[ARG1:.*]]) : i32 = (%[[VAL_14]]) to (%[[VAL_15]]) inclusive step (%[[VAL_16]]) {
-! CHECK:           fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i32>
 ! CHECK:           %[[VAL_24:.*]] = arith.constant 2 : i32
-! CHECK:           %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i32>
-! CHECK:           %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[LOAD_IV1]] : i32
+! CHECK:           %[[VAL_25:.*]] = arith.muli %[[VAL_24]], %[[ARG1]] : i32
 ! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_25]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
 ! CHECK:           omp.yield
 ! CHECK:         }
@@ -62,10 +58,8 @@ program wsloop
 ! CHECK:         %[[VAL_31:.*]] = arith.constant 1 : i32
 ! CHECK:         %[[VAL_32:.*]] = fir.load %[[VAL_0]] : !fir.ref<i32>
 ! CHECK:         omp.wsloop   schedule(static = %[[VAL_32]] : i32) nowait for  (%[[ARG2:.*]]) : i32 = (%[[VAL_29]]) to (%[[VAL_30]]) inclusive step (%[[VAL_31]]) {
-! CHECK:           fir.store %[[ARG2]] to %[[STORE_IV2:.*]] : !fir.ref<i32>
 ! CHECK:           %[[VAL_39:.*]] = arith.constant 3 : i32
-! CHECK:           %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]] : !fir.ref<i32>
-! CHECK:           %[[VAL_40:.*]] = arith.muli %[[VAL_39]], %[[LOAD_IV2]] : i32
+! CHECK:           %[[VAL_40:.*]] = arith.muli %[[VAL_39]], %[[ARG2]] : i32
 ! CHECK:           {{.*}} = fir.call @_FortranAioOutputInteger32({{.*}}, %[[VAL_40]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
 ! CHECK:           omp.yield
 ! CHECK:         }
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
index a122a41ba8b8f..ba5860faac0e6 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-collapse.f90
@@ -40,16 +40,10 @@ program wsloop_collapse
      do j= 1, b
         do k = 1, c
 ! CHECK:           omp.wsloop for (%[[ARG0:.*]], %[[ARG1:.*]], %[[ARG2:.*]]) : i32 = (%[[VAL_20]], %[[VAL_23]], %[[VAL_26]]) to (%[[VAL_21]], %[[VAL_24]], %[[VAL_27]]) inclusive step (%[[VAL_22]], %[[VAL_25]], %[[VAL_28]]) {
-! CHECK:             fir.store %[[ARG0]] to %[[STORE_IV0:.*]] : !fir.ref<i32>
-! CHECK:             fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i32>
-! CHECK:             fir.store %[[ARG2]] to %[[STORE_IV2:.*]] : !fir.ref<i32>
 ! CHECK:             %[[VAL_12:.*]] = fir.load %[[VAL_6]] : !fir.ref<i32>
-! CHECK:             %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]] : !fir.ref<i32>
-! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[LOAD_IV0]] : i32
-! CHECK:             %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i32>
-! CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[LOAD_IV1]] : i32
-! CHECK:             %[[LOAD_IV2:.*]] = fir.load %[[STORE_IV2]] : !fir.ref<i32>
-! CHECK:             %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[LOAD_IV2]] : i32
+! CHECK:             %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[ARG0]] : i32
+! CHECK:             %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[ARG1]] : i32
+! CHECK:             %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[ARG2]] : i32
 ! CHECK:             fir.store %[[VAL_15]] to %[[VAL_6]] : !fir.ref<i32>
 ! CHECK:             omp.yield
 ! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
index 9509920c6ec1b..ac0021e2edf20 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-monotonic.f90
@@ -11,18 +11,15 @@ program wsloop_dynamic
 !CHECK:  omp.parallel {
 
 !$OMP DO SCHEDULE(monotonic:dynamic)
-!CHECK:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
 !CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
 !CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
 !CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
 !CHECK:     omp.wsloop schedule(dynamic, monotonic) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-!CHECK:       fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
 
   do i=1, 9
     print*, i
 !CHECK:    %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK:    %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
 !CHECK:    fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
   end do
 !CHECK:       omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
index 5e4e66c77b343..39215e8d31c92 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-nonmonotonic.f90
@@ -12,18 +12,15 @@ program wsloop_dynamic
 !CHECK:  omp.parallel {
 
 !$OMP DO SCHEDULE(nonmonotonic:dynamic)
-!CHECK:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
 !CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
 !CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
 !CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
 !CHECK:     omp.wsloop schedule(dynamic, nonmonotonic) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-!CHECK:       fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
 
   do i=1, 9
     print*, i
 !CHECK:    %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK:    %[[LOAD:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
 !CHECK:    fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
   end do
 !CHECK:       omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
index 69d133d50ffa0..069b777d1cdb2 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add.f90
@@ -50,14 +50,11 @@
 !CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
 !CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
@@ -78,14 +75,11 @@ subroutine simple_int_reduction
 !CHECK:  %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
 !CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
@@ -107,14 +101,11 @@ subroutine simple_real_reduction
 !CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
 !CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<i32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
@@ -135,14 +126,11 @@ subroutine simple_int_reduction_switch_order
 !CHECK:  %[[C0_2:.*]] = arith.constant 0.000000e+00 : f32
 !CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
@@ -164,15 +152,10 @@ subroutine simple_real_reduction_switch_order
 !CHECK:  %[[YREF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[YREF]] : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[ZREF]] : !fir.ref<i32>) for  (%[[IVAL]]) : i32
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL1]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL2]], %[[YREF]] : i32, !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL3]], %[[ZREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[YREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[ZREF]] : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
@@ -197,17 +180,12 @@ subroutine multiple_int_reductions_same_type
 !CHECK:  %[[YREF:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[YREF]] : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[ZREF]] : !fir.ref<f32>) for  (%[[IVAL]]) : i32
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1_F32:.*]] = fir.convert %[[I_PVT_VAL1_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL1_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL1_F32]], %[[XREF]] : f32, !fir.ref<f32>
-!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2_F32:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL2_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL2_F32]], %[[YREF]] : f32, !fir.ref<f32>
-!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[ZREF]] : f32, !fir.ref<f32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
@@ -234,19 +212,13 @@ subroutine multiple_real_reductions_same_type
 !CHECK:  %[[YREF:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]]  : !fir.ref<i32>, @[[RED_I64_NAME]] -> %[[YREF]] : !fir.ref<i64>, @[[RED_F32_NAME]] -> %[[ZREF]]  : !fir.ref<f32>, @[[RED_F64_NAME]] -> %[[WREF]]  : !fir.ref<f64>) for  (%[[IVAL:.*]]) : i32
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL1_I32]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2_I64:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> i64
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_I64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      omp.reduction %[[I_PVT_VAL2_I64]], %[[YREF]] : i64, !fir.ref<i64>
-!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[ZREF]] : f32, !fir.ref<f32>
-!CHECK:      %[[I_PVT_VAL4_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL4_F64:.*]] = fir.convert %[[I_PVT_VAL4_I32]] : (i32) -> f64
+!CHECK:      %[[I_PVT_VAL4_F64:.*]] = fir.convert %[[IVAL]] : (i32) -> f64
 !CHECK:      omp.reduction %[[I_PVT_VAL4_F64]], %[[WREF]] : f64, !fir.ref<f64>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-and.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-and.f90
index 425d37398c571..03fbc2819659d 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-and.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-and.f90
@@ -21,14 +21,11 @@
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -54,14 +51,11 @@ subroutine simple_reduction(y)
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -89,28 +83,23 @@ subroutine simple_reduction_switch_order(y)
 !CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[YREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[ZREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_1:.*]] = arith.subi %[[CONVI_64_1]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_1:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_1]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_1]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[XREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_2:.*]] = arith.subi %[[CONVI_64_2]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_2:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_2]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_2]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[YREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[I_PVT_VAL3]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_3:.*]] = arith.subi %[[CONVI_64_3]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_3:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_3]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
index e8cf46f8261c4..eaa627e6afd51 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv.f90
@@ -21,14 +21,11 @@
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -54,14 +51,11 @@ subroutine simple_reduction(y)
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -89,28 +83,23 @@ subroutine simple_reduction_switch_order(y)
 !CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[YREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[ZREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_1:.*]] = arith.subi %[[CONVI_64_1]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_1:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_1]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_1]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[XREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_2:.*]] = arith.subi %[[CONVI_64_2]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_2:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_2]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_2]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[YREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[I_PVT_VAL3]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_3:.*]] = arith.subi %[[CONVI_64_3]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_3:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_3]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
index 6e5d6c34cedc5..617198d01716f 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv.f90
@@ -21,14 +21,11 @@
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -54,14 +51,11 @@ subroutine simple_reduction(y)
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -89,28 +83,23 @@ subroutine simple_reduction_switch_order(y)
 !CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[YREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[ZREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_1:.*]] = arith.subi %[[CONVI_64_1]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_1:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_1]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_1]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[XREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_2:.*]] = arith.subi %[[CONVI_64_2]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_2:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_2]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_2]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[YREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[I_PVT_VAL3]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_3:.*]] = arith.subi %[[CONVI_64_3]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_3:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_3]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-or.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-or.f90
index cdc12500e2c30..e3d691e347ba2 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-or.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-or.f90
@@ -21,14 +21,11 @@
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -54,14 +51,11 @@ subroutine simple_reduction(y)
 !CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
 !CHECK:  %[[XREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[I_PVT_VAL]] : (i32) -> i64
+!CHECK:      %[[CONVI_64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI:.*]] = arith.subi %[[CONVI_64]], %[[C1_64]] : i64
 !CHECK:      %[[Y_PVT_REF:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
@@ -89,28 +83,23 @@ subroutine simple_reduction_switch_order(y)
 !CHECK:  %[[YREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_NAME]] -> %[[XREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[YREF]] : !fir.ref<!fir.logical<4>>, @[[RED_NAME]] -> %[[ZREF]] : !fir.ref<!fir.logical<4>>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]]) {
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[I_PVT_VAL1]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_1:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_1:.*]] = arith.subi %[[CONVI_64_1]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_1:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_1]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_1]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[XREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[I_PVT_VAL2]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_2:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_2:.*]] = arith.subi %[[CONVI_64_2]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_2:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_2]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
 !CHECK:      %[[WVAL:.*]] = fir.load %[[W_PVT_REF_2]] : !fir.ref<!fir.logical<4>>
 !CHECK:      omp.reduction %[[WVAL]], %[[YREF]] : !fir.logical<4>, !fir.ref<!fir.logical<4>>
-!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[I_PVT_VAL3]] : (i32) -> i64
+!CHECK:      %[[CONVI_64_3:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      %[[C1_64:.*]] = arith.constant 1 : i64
 !CHECK:      %[[SUBI_3:.*]] = arith.subi %[[CONVI_64_3]], %[[C1_64]] : i64
 !CHECK:      %[[W_PVT_REF_3:.*]] = fir.coordinate_of %[[ARRAY]], %[[SUBI_3]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-mul.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-mul.f90
index c30cde66b5167..f5d2113d2e57a 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-mul.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-mul.f90
@@ -50,14 +50,11 @@
 !CHECK:  %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:  fir.store %[[C1_2]] to %[[XREF]] : !fir.ref<i32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C10:.*]] = arith.constant 10 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C10]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
@@ -79,14 +76,11 @@ subroutine simple_int_reduction
 !CHECK:  %[[C0_2:.*]] = arith.constant 1.000000e+00 : f32
 !CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 10 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
@@ -108,14 +102,11 @@ subroutine simple_real_reduction
 !CHECK:  %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:  fir.store %[[C1_2]] to %[[XREF]] : !fir.ref<i32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C10:.*]] = arith.constant 10 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C10]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
@@ -136,14 +127,11 @@ subroutine simple_int_reduction_switch_order
 !CHECK:  %[[C0_2:.*]] = arith.constant 1.000000e+00 : f32
 !CHECK:  fir.store %[[C0_2]] to %[[XREF]] : !fir.ref<f32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 10 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_i32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[I_PVT_VAL_i32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL_f32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL_f32]], %[[XREF]] : f32, !fir.ref<f32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
@@ -165,15 +153,10 @@ subroutine simple_real_reduction_switch_order
 !CHECK:  %[[YREF:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XREF]] : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[YREF]] : !fir.ref<i32>, @[[RED_I32_NAME]] -> %[[ZREF]] : !fir.ref<i32>) for  (%[[IVAL]]) : i32
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL1]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL2]], %[[YREF]] : i32, !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL3:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL3]], %[[ZREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[YREF]] : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[ZREF]] : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
@@ -198,17 +181,12 @@ subroutine multiple_int_reductions_same_type
 !CHECK:  %[[YREF:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    omp.wsloop   reduction(@[[RED_F32_NAME]] -> %[[XREF]] : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[YREF]] : !fir.ref<f32>, @[[RED_F32_NAME]] -> %[[ZREF]] : !fir.ref<f32>) for  (%[[IVAL]]) : i32
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1_F32:.*]] = fir.convert %[[I_PVT_VAL1_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL1_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL1_F32]], %[[XREF]] : f32, !fir.ref<f32>
-!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2_F32:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL2_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL2_F32]], %[[YREF]] : f32, !fir.ref<f32>
-!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[ZREF]] : f32, !fir.ref<f32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
@@ -235,19 +213,13 @@ subroutine multiple_real_reductions_same_type
 !CHECK:  %[[YREF:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
 !CHECK:  %[[ZREF:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %2 : !fir.ref<i32>, @[[RED_I64_NAME]] -> %3 : !fir.ref<i64>, @[[RED_F32_NAME]] -> %4 : !fir.ref<f32>, @[[RED_F64_NAME]] -> %1 : !fir.ref<f64>) for  (%[[IVAL:.*]]) : i32
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL1_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL1_I32]], %[[XREF]] : i32, !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL2_I64:.*]] = fir.convert %[[I_PVT_VAL2_I32]] : (i32) -> i64
+!CHECK:      omp.reduction %[[IVAL]], %[[XREF]] : i32, !fir.ref<i32>
+!CHECK:      %[[I_PVT_VAL2_I64:.*]] = fir.convert %[[IVAL]] : (i32) -> i64
 !CHECK:      omp.reduction %[[I_PVT_VAL2_I64]], %[[YREF]] : i64, !fir.ref<i64>
-!CHECK:      %[[I_PVT_VAL3_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[I_PVT_VAL3_I32]] : (i32) -> f32
+!CHECK:      %[[I_PVT_VAL3_F32:.*]] = fir.convert %[[IVAL]] : (i32) -> f32
 !CHECK:      omp.reduction %[[I_PVT_VAL3_F32]], %[[ZREF]] : f32, !fir.ref<f32>
-!CHECK:      %[[I_PVT_VAL4_I32:.*]] = fir.load %[[I_PVT_REF]] : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL4_F64:.*]] = fir.convert %[[I_PVT_VAL4_I32]] : (i32) -> f64
+!CHECK:      %[[I_PVT_VAL4_F64:.*]] = fir.convert %[[IVAL]] : (i32) -> f64
 !CHECK:      omp.reduction %[[I_PVT_VAL4_F64]], %[[WREF]] : f64, !fir.ref<f64>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
index 47f9d572a8653..95248e99b9c63 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-simd.f90
@@ -15,13 +15,11 @@ program wsloop_dynamic
 !CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
 !CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
 !CHECK:     omp.wsloop schedule(runtime, simd) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-!CHECK:       fir.store %[[I]] to %[[STORE:.*]] : !fir.ref<i32>
 
   do i=1, 9
     print*, i
 !CHECK:    %[[RTBEGIN:.*]] = fir.call @_FortranAioBeginExternalListOutput
-!CHECK:    %[[LOAD:.*]] = fir.load %[[STORE]] : !fir.ref<i32>
-!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[LOAD]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+!CHECK:    fir.call @_FortranAioOutputInteger32(%[[RTBEGIN]], %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
 !CHECK:    fir.call @_FortranAioEndIoStatement(%[[RTBEGIN]]) {{.*}}: (!fir.ref<i8>) -> i32
   end do
 !CHECK:       omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
index 466055868f1cc..c515ed10d0d5c 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-variable.f90
@@ -25,11 +25,9 @@ program wsloop_variable
 !CHECK:  omp.wsloop for (%[[ARG0:.*]], %[[ARG1:.*]]) : i64 = (%[[TMP2]], %[[TMP5]]) to (%[[TMP3]], %[[TMP6]]) inclusive step (%[[TMP4]], %[[TMP7]]) {
 !CHECK:    %[[ARG0_I16:.*]] = fir.convert %[[ARG0]] : (i64) -> i16
 !CHECK:    fir.store %[[ARG0_I16]] to %[[STORE_IV0:.*]] : !fir.ref<i16>
-!CHECK:    fir.store %[[ARG1]] to %[[STORE_IV1:.*]] : !fir.ref<i64>
 !CHECK:    %[[LOAD_IV0:.*]] = fir.load %[[STORE_IV0]] : !fir.ref<i16>
 !CHECK:    %[[LOAD_IV0_I64:.*]] = fir.convert %[[LOAD_IV0]] : (i16) -> i64
-!CHECK:    %[[LOAD_IV1:.*]] = fir.load %[[STORE_IV1]] : !fir.ref<i64>
-!CHECK:    %[[TMP10:.*]] = arith.addi %[[LOAD_IV0_I64]], %[[LOAD_IV1]] : i64
+!CHECK:    %[[TMP10:.*]] = arith.addi %[[LOAD_IV0_I64]], %[[ARG1]] : i64
 !CHECK:    %[[TMP11:.*]] = fir.convert %[[TMP10]] : (i64) -> f32
 !CHECK:    fir.store %[[TMP11]] to %{{.*}} : !fir.ref<f32>
 !CHECK:    omp.yield
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop.f90 b/flang/test/Lower/OpenMP/FIR/wsloop.f90
index 2c00d1a9fddae..49b47a6307284 100644
--- a/flang/test/Lower/OpenMP/FIR/wsloop.f90
+++ b/flang/test/Lower/OpenMP/FIR/wsloop.f90
@@ -7,16 +7,13 @@ subroutine simple_loop
   integer :: i
   ! CHECK:  omp.parallel
   !$OMP PARALLEL
-  ! CHECK:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
   ! CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
   ! CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
   ! CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
   ! CHECK:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP DO
   do i=1, 9
-  ! CHECK:             fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref<i32>
-  ! CHECK:             %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! CHECK:       omp.yield
@@ -30,16 +27,13 @@ subroutine simple_loop_with_step
   integer :: i
   ! CHECK:  omp.parallel
   !$OMP PARALLEL
-  ! CHECK:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
   ! CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
   ! CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
   ! CHECK:     %[[WS_STEP:.*]] = arith.constant 2 : i32
   ! CHECK:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
-  ! CHECK:       fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:       %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
   !$OMP DO
   do i=1, 9, 2
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! CHECK:       omp.yield
@@ -53,16 +47,13 @@ subroutine loop_with_schedule_nowait
   integer :: i
   ! CHECK:  omp.parallel
   !$OMP PARALLEL
-  ! CHECK:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
   ! CHECK:     %[[WS_LB:.*]] = arith.constant 1 : i32
   ! CHECK:     %[[WS_UB:.*]] = arith.constant 9 : i32
   ! CHECK:     %[[WS_STEP:.*]] = arith.constant 1 : i32
   ! CHECK:     omp.wsloop schedule(runtime) nowait for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
   !$OMP DO SCHEDULE(runtime)
   do i=1, 9
-  ! CHECK:       fir.store %[[I]] to %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:       %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref<i32>
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[I]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
   end do
   ! CHECK:       omp.yield
diff --git a/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90 b/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90
index fd56038231b19..ea39293ab78ff 100644
--- a/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90
+++ b/flang/test/Lower/OpenMP/Todo/omp-default-clause-inner-loop.f90
@@ -8,16 +8,13 @@
 ! CHECK: omp.parallel   {
 ! EXPECTED: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
 ! EXPECTED: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFEz"}
-! CHECK: %[[TEMP:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
 ! CHECK: %[[const_1:.*]] = arith.constant 1 : i32
 ! CHECK: %[[const_2:.*]] = arith.constant 10 : i32
 ! CHECK: %[[const_3:.*]] = arith.constant 1 : i32
 ! CHECK: omp.wsloop   for  (%[[ARG:.*]]) : i32 = (%[[const_1]]) to (%[[const_2]]) inclusive step (%[[const_3]]) {
-! CHECK: fir.store %[[ARG]] to %[[TEMP]] : !fir.ref<i32>
 ! EXPECTED: %[[temp_1:.*]] = fir.load %[[PRIVATE_Z]] : !fir.ref<i32>
 ! CHECK: %[[temp_1:.*]] = fir.load %{{.*}} : !fir.ref<i32>
-! CHECK: %[[temp_2:.*]] = fir.load %[[TEMP]] : !fir.ref<i32>
-! CHECK: %[[result:.*]] = arith.addi %[[temp_1]], %[[temp_2]] : i32
+! CHECK: %[[result:.*]] = arith.addi %[[temp_1]], %[[ARG]] : i32
 ! EXPECTED: fir.store %[[result]] to %[[PRIVATE_Y]] : !fir.ref<i32>
 ! CHECK: fir.store %[[result]] to %{{.*}} : !fir.ref<i32>
 ! CHECK: omp.yield
diff --git a/flang/test/Lower/OpenMP/hlfir-wsloop.f90 b/flang/test/Lower/OpenMP/hlfir-wsloop.f90
index b6be77fe3016d..8b3bee48890d2 100644
--- a/flang/test/Lower/OpenMP/hlfir-wsloop.f90
+++ b/flang/test/Lower/OpenMP/hlfir-wsloop.f90
@@ -6,19 +6,21 @@
 !CHECK-LABEL: func @_QPsimple_loop()
 subroutine simple_loop
   integer :: i
-  ! CHECK-DAG:     %[[WS_ST:.*]] = arith.constant 1 : i32
-  ! CHECK-DAG:     %[[WS_END:.*]] = arith.constant 9 : i32
-  ! CHECK:  omp.parallel
+  ! CHECK-DAG: %[[WS_ST:.*]] = arith.constant 1 : i32
+  ! CHECK-DAG: %[[WS_END:.*]] = arith.constant 9 : i32
+  ! CHECK:     omp.parallel
   !$OMP PARALLEL
-  ! CHECK-DAG:     %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
-  ! CHECK:     %[[IV:.*]]    = fir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
   ! CHECK:     omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_ST]]) to (%[[WS_END]]) inclusive step (%[[WS_ST]])
   !$OMP DO
   do i=1, 9
-  ! CHECK:             fir.store %[[I]] to %[[IV:.*]] : !fir.ref<i32>
-  ! CHECK:             %[[LOAD_IV:.*]] = fir.load %[[IV]] : !fir.ref<i32>
-  ! CHECK:    fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
+  ! CHECK-DAG: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
+  ! CHECK:     %[[IV:.*]] = fir.declare %[[ALLOCA_IV]] {uniq_name = "_QFsimple_loopEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
+  ! CHECK:     fir.store %[[I]] to %[[IV:.*]] : !fir.ref<i32>
+  ! CHECK:     %[[LOAD_IV:.*]] = fir.load %[[IV]] : !fir.ref<i32>
+  ! CHECK:     fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref<i8>, i32) -> i1
     print*, i
+  ! CHECK:    fir.call @_QPfoo(%[[IV]]) {{.*}}: (!fir.ref<i32>) -> ()
+    call foo(i)
   end do
   ! CHECK:       omp.yield
   !$OMP END DO
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90
index 97ee665442e3a..97510745d0539 100644
--- a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir.f90
@@ -18,15 +18,11 @@
 !CHECK:  %[[C0_2:.*]] = arith.constant 0 : i32
 !CHECK:  hlfir.assign %[[C0_2]] to %[[XDECL]]#0 : i32, !fir.ref<i32>
 !CHECK:  omp.parallel
-!CHECK:    %[[I_PVT_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
-!CHECK:    %[[I_PVT_DECL:.*]]:2 = hlfir.declare %[[I_PVT_REF]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 !CHECK:    %[[C1_1:.*]] = arith.constant 1 : i32
 !CHECK:    %[[C100:.*]] = arith.constant 100 : i32
 !CHECK:    %[[C1_2:.*]] = arith.constant 1 : i32
 !CHECK:    omp.wsloop   reduction(@[[RED_I32_NAME]] -> %[[XDECL]]#0 : !fir.ref<i32>) for  (%[[IVAL:.*]]) : i32 = (%[[C1_1]]) to (%[[C100]]) inclusive step (%[[C1_2]])
-!CHECK:      fir.store %[[IVAL]] to %[[I_PVT_DECL]]#1 : !fir.ref<i32>
-!CHECK:      %[[I_PVT_VAL:.*]] = fir.load %[[I_PVT_DECL]]#0 : !fir.ref<i32>
-!CHECK:      omp.reduction %[[I_PVT_VAL]], %[[XDECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.reduction %[[IVAL]], %[[XDECL]]#0 : i32, !fir.ref<i32>
 !CHECK:      omp.yield
 !CHECK:    omp.terminator
 !CHECK:  return
diff --git a/flang/test/Transforms/omp-wsloop-index.mlir b/flang/test/Transforms/omp-wsloop-index.mlir
new file mode 100644
index 0000000000000..9443209d90b9f
--- /dev/null
+++ b/flang/test/Transforms/omp-wsloop-index.mlir
@@ -0,0 +1,247 @@
+// RUN: fir-opt --omp-loop-index-mem2reg %s | FileCheck %s
+
+func.func private @foo(%arg0 : !fir.ref<i32>) -> i32
+
+// CHECK-LABEL: @wsloop_remove_alloca
+func.func @wsloop_remove_alloca() {
+  // CHECK: %[[RESULT:.*]] = fir.alloca i32
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.wsloop for (%[[INDEX:.*]]) : i32
+    %1 = fir.alloca i32
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.wsloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK-NOT: fir.alloca
+      // CHECK: fir.store %[[INDEX]] to %[[RESULT]]
+      // CHECK: omp.yield
+      fir.store %arg0 to %1 : !fir.ref<i32>
+      %2 = fir.load %1 : !fir.ref<i32>
+      fir.store %2 to %0 : !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @simdloop_remove_alloca
+func.func @simdloop_remove_alloca() {
+  // CHECK: %[[RESULT:.*]] = fir.alloca i32
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.simdloop for (%[[INDEX:.*]]) : i32
+    %1 = fir.alloca i32
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.simdloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK-NOT: fir.alloca
+      // CHECK: fir.store %[[INDEX]] to %[[RESULT]]
+      // CHECK: omp.yield
+      fir.store %arg0 to %1 : !fir.ref<i32>
+      %2 = fir.load %1 : !fir.ref<i32>
+      fir.store %2 to %0 : !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @wsloop_push_alloca
+func.func @wsloop_push_alloca() {
+  // CHECK: %[[RESULT:.*]] = fir.alloca i32
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.wsloop for (%[[INDEX:.*]]) : i32
+    %1 = fir.alloca i32
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.wsloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK: %[[ALLOCA:.*]] = fir.alloca i32
+      // CHECK: fir.store %[[INDEX]] to %[[ALLOCA]]
+      // CHECK: %[[RETURN:.*]] = func.call @foo(%[[ALLOCA]])
+      // CHECK: fir.store %[[RETURN]] to %[[RESULT]]
+      // CHECK: omp.yield
+      fir.store %arg0 to %1 : !fir.ref<i32>
+      %2 = func.call @foo(%1) : (!fir.ref<i32>) -> i32
+      fir.store %2 to %0 : !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @simdloop_push_alloca
+func.func @simdloop_push_alloca() {
+  // CHECK: %[[RESULT:.*]] = fir.alloca i32
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.simdloop for (%[[INDEX:.*]]) : i32
+    %1 = fir.alloca i32
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.simdloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK: %[[ALLOCA:.*]] = fir.alloca i32
+      // CHECK: fir.store %[[INDEX]] to %[[ALLOCA]]
+      // CHECK: %[[RETURN:.*]] = func.call @foo(%[[ALLOCA]])
+      // CHECK: fir.store %[[RETURN]] to %[[RESULT]]
+      // CHECK: omp.yield
+      fir.store %arg0 to %1 : !fir.ref<i32>
+      %2 = func.call @foo(%1) : (!fir.ref<i32>) -> i32
+      fir.store %2 to %0 : !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @hlfir_wsloop_remove_alloca
+func.func @hlfir_wsloop_remove_alloca() {
+  // CHECK: %[[RESULT_ALLOCA:.*]] = fir.alloca i32
+  // CHECK: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_ALLOCA]]
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  %1:2 = hlfir.declare %0 {uniq_name = "result"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-NOT: hlfir.declare
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.wsloop for (%[[INDEX:.*]]) : i32
+    %2 = fir.alloca i32
+    %3:2 = hlfir.declare %2 {uniq_name = "index"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.wsloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK-NOT: fir.alloca
+      // CHECK-NOT: hlfir.declare
+      // CHECK: hlfir.assign %[[INDEX]] to %[[RESULT]]#0
+      // CHECK: omp.yield
+      fir.store %arg0 to %3#1 : !fir.ref<i32>
+      %4 = fir.load %3#0 : !fir.ref<i32>
+      hlfir.assign %4 to %1#0 : i32, !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @hlfir_simdloop_remove_alloca
+func.func @hlfir_simdloop_remove_alloca() {
+  // CHECK: %[[RESULT_ALLOCA:.*]] = fir.alloca i32
+  // CHECK: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_ALLOCA]]
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  %1:2 = hlfir.declare %0 {uniq_name = "result"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-NOT: hlfir.declare
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.simdloop for (%[[INDEX:.*]]) : i32
+    %2 = fir.alloca i32
+    %3:2 = hlfir.declare %2 {uniq_name = "index"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.simdloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK-NOT: fir.alloca
+      // CHECK-NOT: hlfir.declare
+      // CHECK: hlfir.assign %[[INDEX]] to %[[RESULT]]#0
+      // CHECK: omp.yield
+      fir.store %arg0 to %3#1 : !fir.ref<i32>
+      %4 = fir.load %3#0 : !fir.ref<i32>
+      hlfir.assign %4 to %1#0 : i32, !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @hlfir_wsloop_push_alloca
+func.func @hlfir_wsloop_push_alloca() {
+  // CHECK: %[[RESULT_ALLOCA:.*]] = fir.alloca i32
+  // CHECK: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_ALLOCA]]
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  %1:2 = hlfir.declare %0 {uniq_name = "result"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-NOT: hlfir.declare
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.wsloop for (%[[INDEX:.*]]) : i32
+    %2 = fir.alloca i32
+    %3:2 = hlfir.declare %2 {uniq_name = "index"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.wsloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK: %[[INDEX_ALLOCA:.*]] = fir.alloca i32
+      // CHECK: %[[INDEX_DECL:.*]]:2 = hlfir.declare %[[INDEX_ALLOCA]]
+      // CHECK: fir.store %[[INDEX]] to %[[INDEX_DECL]]#1
+      // CHECK: %[[RETURN:.*]] = fir.call @foo(%[[INDEX_DECL]]#1)
+      // CHECK: hlfir.assign %[[RETURN]] to %[[RESULT]]#0
+      // CHECK: omp.yield
+      fir.store %arg0 to %3#1 : !fir.ref<i32>
+      %4 = fir.call @foo(%3#1) : (!fir.ref<i32>) -> i32
+      hlfir.assign %4 to %1#0 : i32, !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}
+
+// CHECK-LABEL: @hlfir_simdloop_push_alloca
+func.func @hlfir_simdloop_push_alloca() {
+  // CHECK: %[[RESULT_ALLOCA:.*]] = fir.alloca i32
+  // CHECK: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_ALLOCA]]
+  // CHECK: omp.parallel
+  %0 = fir.alloca i32
+  %1:2 = hlfir.declare %0 {uniq_name = "result"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+  omp.parallel {
+    // CHECK-NOT: fir.alloca
+    // CHECK-NOT: hlfir.declare
+    // CHECK-DAG: arith.constant 1
+    // CHECK-DAG: arith.constant 10
+    // CHECK-NEXT: omp.simdloop for (%[[INDEX:.*]]) : i32
+    %2 = fir.alloca i32
+    %3:2 = hlfir.declare %2 {uniq_name = "index"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    omp.simdloop for (%arg0) : i32 = (%c1_i32) to (%c10_i32) inclusive step (%c1_i32) {
+      // CHECK: %[[INDEX_ALLOCA:.*]] = fir.alloca i32
+      // CHECK: %[[INDEX_DECL:.*]]:2 = hlfir.declare %[[INDEX_ALLOCA]]
+      // CHECK: fir.store %[[INDEX]] to %[[INDEX_DECL]]#1
+      // CHECK: %[[RETURN:.*]] = fir.call @foo(%[[INDEX_DECL]]#1)
+      // CHECK: hlfir.assign %[[RETURN]] to %[[RESULT]]#0
+      // CHECK: omp.yield
+      fir.store %arg0 to %3#1 : !fir.ref<i32>
+      %4 = fir.call @foo(%3#1) : (!fir.ref<i32>) -> i32
+      hlfir.assign %4 to %1#0 : i32, !fir.ref<i32>
+      omp.yield
+    }
+    omp.terminator
+  }
+  return
+}