diff --git a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h
index f3c46f5efd73c..e4de5b0661571 100644
--- a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h
+++ b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h
@@ -374,10 +374,10 @@ class FlatLinearValueConstraints : public FlatLinearConstraints {
       setValue(i, values[i - start]);
   }
 
-  /// Looks up the position of the variable with the specified Value. Returns
-  /// true if found (false otherwise). `pos` is set to the (column) position of
-  /// the variable.
-  bool findVar(Value val, unsigned *pos) const;
+  /// Looks up the position of the variable with the specified Value, starting
+  /// the search at position `offset`. Returns true if found (false otherwise).
+  /// `pos` is set to the (column) position of the variable.
+  bool findVar(Value val, unsigned *pos, unsigned offset = 0) const;
 
   /// Returns true if a variable with the specified Value exists, false
   /// otherwise.
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
index 7376d9b992a04..3dea99cd6b3e5 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
@@ -213,20 +213,20 @@ struct MemRefDependenceGraph {
 };
 
 /// Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered
-/// from the outermost 'affine.for' operation to the innermost one.
+/// from the outermost 'affine.for' operation to the innermost one while not
+/// traversing outside of the surrounding affine scope.
 void getAffineForIVs(Operation &op, SmallVectorImpl<AffineForOp> *loops);
 
 /// Populates 'ivs' with IVs of the surrounding affine.for and affine.parallel
-/// ops ordered from the outermost one to the innermost.
+/// ops ordered from the outermost one to the innermost while not traversing
+/// outside of the surrounding affine scope.
 void getAffineIVs(Operation &op, SmallVectorImpl<Value> &ivs);
 
 /// Populates 'ops' with affine operations enclosing `op` ordered from outermost
-/// to innermost. affine.for, affine.if, or affine.parallel ops comprise such
-/// surrounding affine ops.
-/// TODO: Change this to return a list of enclosing ops up until the op that
-/// starts an `AffineScope`. In such a case, `ops` is guaranteed by design to
-/// have a successive chain of affine parent ops, and this is primarily what is
-/// needed for most analyses.
+/// to innermost while stopping at the boundary of the affine scope. affine.for,
+/// affine.if, or affine.parallel ops comprise such surrounding affine ops.
+/// `ops` is guaranteed by design to have a successive chain of affine parent
+/// ops.
 void getEnclosingAffineOps(Operation &op, SmallVectorImpl<Operation *> *ops);
 
 /// Returns the nesting depth of this operation, i.e., the number of loops
diff --git a/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
index 284d394ddceff..4cec777f9888c 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
@@ -110,7 +110,6 @@ class FusionStrategy {
 /// returns a FusionResult explaining why fusion is not feasible.
 /// NOTE: This function is not feature complete and should only be used in
 /// testing.
-/// TODO: Update comments when this function is fully implemented.
 FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                           unsigned dstLoopDepth,
                           ComputationSliceState *srcSlice,
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index 382d05f3b2d48..d76e94b7070e8 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -958,15 +958,15 @@ areVarsUnique(const FlatLinearValueConstraints &cst, VarKind kind) {
 /// so that they have the union of all variables, with A's original
 /// variables appearing first followed by any of B's variables that didn't
 /// appear in A. Local variables in B that have the same division
-/// representation as local variables in A are merged into one.
+/// representation as local variables in A are merged into one. We allow A
+/// and B to have non-unique values for their variables; in such cases,
+/// duplicates are aligned with each other in their order of appearance, from
+/// left to right.
 // E.g.: Input: A has ((%i, %j) [%M, %N]) and B has ((%k, %j) [%P, %N, %M])
 //       Output: both A, B have (%i, %j, %k) [%M, %N, %P]
 static void mergeAndAlignVars(unsigned offset, FlatLinearValueConstraints *a,
                               FlatLinearValueConstraints *b) {
   assert(offset <= a->getNumDimVars() && offset <= b->getNumDimVars());
-  // A merge/align isn't meaningful if a cst's vars aren't distinct.
-  assert(areVarsUnique(*a) && "A's values aren't unique");
-  assert(areVarsUnique(*b) && "B's values aren't unique");
 
   assert(llvm::all_of(
       llvm::drop_begin(a->getMaybeValues(), offset),
@@ -982,9 +982,12 @@ static void mergeAndAlignVars(unsigned offset, FlatLinearValueConstraints *a,
   {
     // Merge dims from A into B.
     unsigned d = offset;
-    for (auto aDimValue : aDimValues) {
+    for (Value aDimValue : aDimValues) {
       unsigned loc;
-      if (b->findVar(aDimValue, &loc)) {
+      // Search from position `d` since multiple variables may be bound to the
+      // same `Value`; we align with the next one that appears.
+      if (b->findVar(aDimValue, &loc, d)) {
         assert(loc >= offset && "A's dim appears in B's aligned range");
         assert(loc < b->getNumDimVars() &&
                "A's dim appears in B's non-dim position");
@@ -1017,15 +1020,12 @@ void FlatLinearValueConstraints::mergeAndAlignVarsWithOther(
 }
 
 /// Merge and align symbols of `this` and `other` such that both get union of
-/// of symbols that are unique. Symbols in `this` and `other` should be
-/// unique. Symbols with Value as `None` are considered to be inequal to all
-/// other symbols.
+/// symbols. Existing symbols need not be unique; they are aligned from left to
+/// right, with duplicates aligned in the same order. Symbols with Value as
+/// `None` are considered to be unequal to all other symbols.
 void FlatLinearValueConstraints::mergeSymbolVars(
     FlatLinearValueConstraints &other) {
-  assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique");
-  assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique");
-
   SmallVector<Value, 4> aSymValues;
   getValues(getNumDimVars(), getNumDimAndSymbolVars(), &aSymValues);
 
@@ -1034,8 +1034,9 @@ void FlatLinearValueConstraints::mergeSymbolVars(
   for (Value aSymValue : aSymValues) {
     unsigned loc;
     // If the var is a symbol in `other`, then align it, otherwise assume that
-    // it is a new symbol
+    // it is a new symbol. Search `other` starting at position `s`, since
+    // everything to the left of `s` has already been aligned.
+    if (other.findVar(aSymValue, &loc, s) && loc >= other.getNumDimVars() &&
         loc < other.getNumDimAndSymbolVars())
       other.swapVar(s, loc);
     else
@@ -1051,8 +1052,6 @@ void FlatLinearValueConstraints::mergeSymbolVars(
 
   assert(getNumSymbolVars() == other.getNumSymbolVars() &&
          "expected same number of symbols");
-  assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique");
-  assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique");
 }
 
 bool FlatLinearValueConstraints::hasConsistentState() const {
@@ -1104,9 +1103,11 @@ FlatLinearValueConstraints::computeAlignedMap(AffineMap map,
   return alignedMap;
 }
 
-bool FlatLinearValueConstraints::findVar(Value val, unsigned *pos) const {
-  unsigned i = 0;
-  for (const auto &mayBeVar : values) {
+bool FlatLinearValueConstraints::findVar(Value val, unsigned *pos,
+                                         unsigned offset) const {
+  unsigned i = offset;
+  for (const auto &mayBeVar :
+       ArrayRef<std::optional<Value>>(values).drop_front(offset)) {
     if (mayBeVar && *mayBeVar == val) {
       *pos = i;
       return true;
diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
index 9d2998456a4b6..d9b1050c83e26 100644
--- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
@@ -199,10 +199,15 @@ bool MemRefDependenceGraph::init() {
         continue;
       SmallVector<AffineForOp, 4> loops;
       getAffineForIVs(*user, &loops);
-      if (loops.empty())
+      // Find the surrounding affine.for nested immediately within the
+      // block.
+      auto *it = llvm::find_if(loops, [&](AffineForOp loop) {
+        return loop->getBlock() == &block;
+      });
+      if (it == loops.end())
         continue;
-      assert(forToNodeMap.count(loops[0]) > 0 && "missing mapping");
-      unsigned userLoopNestId = forToNodeMap[loops[0]];
+      assert(forToNodeMap.count(*it) > 0 && "missing mapping");
+      unsigned userLoopNestId = forToNodeMap[*it];
       addEdge(node.id, userLoopNestId, value);
     }
   }
@@ -631,8 +636,8 @@ void mlir::affine::getAffineForIVs(Operation &op,
   AffineForOp currAffineForOp;
   // Traverse up the hierarchy collecting all 'affine.for' operation while
   // skipping over 'affine.if' operations.
-  while (currOp) {
-    if (AffineForOp currAffineForOp = dyn_cast<AffineForOp>(currOp))
+  while (currOp && !currOp->hasTrait<OpTrait::AffineScope>()) {
+    if (auto currAffineForOp = dyn_cast<AffineForOp>(currOp))
       loops->push_back(currAffineForOp);
     currOp = currOp->getParentOp();
   }
@@ -646,7 +651,7 @@ void mlir::affine::getEnclosingAffineOps(Operation &op,
 
   // Traverse up the hierarchy collecting all `affine.for`, `affine.if`, and
   // affine.parallel operations.
-  while (currOp) {
+  while (currOp && !currOp->hasTrait<OpTrait::AffineScope>()) {
     if (isa<AffineForOp, AffineIfOp, AffineParallelOp>(currOp))
       ops->push_back(currOp);
     currOp = currOp->getParentOp();
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index 2629f5a511cec..66d921b4889f5 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -896,6 +896,16 @@ struct GreedyFusion {
         if (fusedLoopInsPoint == nullptr)
           continue;
 
+        // It's possible that this fusion is at an inner depth (i.e., there
+        // are common surrounding affine loops for the source and destination
+        // for ops). We need this number because canFuseLoops must be passed
+        // the absolute depth; the max legal depth and the depths we try below
+        // are *relative* and as such don't include the common depth.
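+        // For example, if the source and destination nests sit under two
+        // common surrounding loops (numSurroundingLoops = 2), the relative
+        // depth i = 1 tried below corresponds to an absolute dstLoopDepth
+        // of 3.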
+        SmallVector<AffineForOp, 4> surroundingLoops;
+        getAffineForIVs(*dstAffineForOp, &surroundingLoops);
+        unsigned numSurroundingLoops = surroundingLoops.size();
+
         // Compute the innermost common loop depth for dstNode
         // producer-consumer loads/stores.
         SmallVector<Operation *, 2> dstMemrefOps;
@@ -907,7 +917,8 @@ struct GreedyFusion {
           if (producerConsumerMemrefs.count(
                   cast<AffineWriteOpInterface>(op).getMemRef()))
             dstMemrefOps.push_back(op);
-        unsigned dstLoopDepthTest = getInnermostCommonLoopDepth(dstMemrefOps);
+        unsigned dstLoopDepthTest =
+            getInnermostCommonLoopDepth(dstMemrefOps) - numSurroundingLoops;
 
         // Check the feasibility of fusing src loop nest into dst loop nest
         // at loop depths in range [1, dstLoopDepthTest].
@@ -916,9 +927,10 @@ struct GreedyFusion {
         depthSliceUnions.resize(dstLoopDepthTest);
         FusionStrategy strategy(FusionStrategy::ProducerConsumer);
         for (unsigned i = 1; i <= dstLoopDepthTest; ++i) {
-          FusionResult result = affine::canFuseLoops(
-              srcAffineForOp, dstAffineForOp,
-              /*dstLoopDepth=*/i, &depthSliceUnions[i - 1], strategy);
+          FusionResult result =
+              affine::canFuseLoops(srcAffineForOp, dstAffineForOp,
+                                   /*dstLoopDepth=*/i + numSurroundingLoops,
+                                   &depthSliceUnions[i - 1], strategy);
 
           if (result.value == FusionResult::Success)
             maxLegalFusionDepth = i;
@@ -1125,9 +1137,18 @@ struct GreedyFusion {
       SmallVector<Operation *, 2> dstLoadOpInsts;
       dstNode->getLoadOpsForMemref(memref, &dstLoadOpInsts);
 
+      // It's possible that this fusion is at an inner depth (i.e., there are
+      // common surrounding affine loops for the source and destination for
+      // ops). We need this number because canFuseLoops must be passed the
+      // absolute depth; the max legal depth and the depths we try below are
+      // *relative* and as such don't include the common depth.
+      SmallVector<AffineForOp, 4> surroundingLoops;
+      getAffineForIVs(*dstAffineForOp, &surroundingLoops);
+      unsigned numSurroundingLoops = surroundingLoops.size();
       SmallVector<AffineForOp, 4> dstLoopIVs;
       getAffineForIVs(*dstLoadOpInsts[0], &dstLoopIVs);
-      unsigned dstLoopDepthTest = dstLoopIVs.size();
+      unsigned dstLoopDepthTest = dstLoopIVs.size() - numSurroundingLoops;
       auto sibAffineForOp = cast<AffineForOp>(sibNode->op);
 
       // Compute loop depth and slice union for fusion.
@@ -1136,14 +1157,18 @@ struct GreedyFusion {
       unsigned maxLegalFusionDepth = 0;
       FusionStrategy strategy(memref);
       for (unsigned i = 1; i <= dstLoopDepthTest; ++i) {
-        FusionResult result = affine::canFuseLoops(
-            sibAffineForOp, dstAffineForOp,
-            /*dstLoopDepth=*/i, &depthSliceUnions[i - 1], strategy);
+        FusionResult result =
+            affine::canFuseLoops(sibAffineForOp, dstAffineForOp,
+                                 /*dstLoopDepth=*/i + numSurroundingLoops,
+                                 &depthSliceUnions[i - 1], strategy);
 
         if (result.value == FusionResult::Success)
           maxLegalFusionDepth = i;
       }
 
+      LLVM_DEBUG(llvm::dbgs() << "Max legal depth for fusion: "
+                              << maxLegalFusionDepth << '\n');
+
       // Skip if fusion is not feasible at any loop depths.
       if (maxLegalFusionDepth == 0)
         continue;
@@ -1238,9 +1263,15 @@ struct GreedyFusion {
         SmallVector<AffineForOp, 4> loops;
         getAffineForIVs(*user, &loops);
         // Skip 'use' if it is not within a loop nest.
-        if (loops.empty())
+        // Find the surrounding affine.for nested immediately within the
+        // block.
+        auto *it = llvm::find_if(loops, [&](AffineForOp loop) {
+          return loop->getBlock() == &mdg->block;
+        });
+        // Skip 'use' if it is not within a loop nest in `block`.
+        if (it == loops.end())
          continue;
-        Node *sibNode = mdg->getForOpNode(loops[0]);
+        Node *sibNode = mdg->getForOpNode(*it);
         assert(sibNode != nullptr);
         // Skip 'use' if it not a sibling to 'dstNode'.
         if (sibNode->id == dstNode->id)
@@ -1373,9 +1404,17 @@ void LoopFusion::runOnBlock(Block *block) {
 }
 
 void LoopFusion::runOnOperation() {
-  for (Region &region : getOperation()->getRegions())
-    for (Block &block : region.getBlocks())
-      runOnBlock(&block);
+  // Call fusion on every block that holds at least two affine.for nests (in
+  // post order).
+  getOperation()->walk([&](Operation *op) {
+    for (Region &region : op->getRegions()) {
+      for (Block &block : region.getBlocks()) {
+        auto affineFors = block.getOps<AffineForOp>();
+        if (!affineFors.empty() && !llvm::hasSingleElement(affineFors))
+          runOnBlock(&block);
+      }
+    }
+  });
 }
 
 std::unique_ptr<Pass> mlir::affine::createLoopFusionPass(
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
index 5053b08ee0834..77ee1d4c7a02d 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
@@ -243,7 +243,6 @@ static unsigned getMaxLoopDepth(ArrayRef<Operation *> srcOps,
   return loopDepth;
 }
 
-// TODO: Prevent fusion of loop nests with side-effecting operations.
 // TODO: This pass performs some computation that is the same for all the depths
 // (e.g., getMaxLoopDepth). Implement a version of this utility that processes
 // all the depths at once or only the legal maximal depth for maximal fusion.
diff --git a/mlir/test/Dialect/Affine/loop-fusion-inner.mlir b/mlir/test/Dialect/Affine/loop-fusion-inner.mlir
new file mode 100644
index 0000000000000..61af9a4baf46d
--- /dev/null
+++ b/mlir/test/Dialect/Affine/loop-fusion-inner.mlir
@@ -0,0 +1,225 @@
+// RUN: mlir-opt -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer fusion-maximal}))' %s | FileCheck %s
+
+// Test fusion of affine nests inside other region-holding ops (scf.for in the
+// test case below).
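+//
+// Note: the pass now walks the enclosing op and invokes fusion on every block
+// holding at least two affine.for nests, so nests under non-affine
+// region-holding ops (scf.for and scf.while below) are considered as well.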
+
+// CHECK-LABEL: func @fusion_inner_simple
+func.func @fusion_inner_simple(%A : memref<10xf32>) {
+  %cst = arith.constant 0.0 : f32
+
+  affine.for %i = 0 to 100 {
+    %B = memref.alloc() : memref<10xf32>
+    %C = memref.alloc() : memref<10xf32>
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %A[%j] : memref<10xf32>
+      affine.store %v, %B[%j] : memref<10xf32>
+    }
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %B[%j] : memref<10xf32>
+      affine.store %v, %C[%j] : memref<10xf32>
+    }
+  }
+
+  // CHECK:      affine.for %{{.*}} = 0 to 100
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10
+  // CHECK-NOT:  affine.for
+
+  return
+}
+
+// CHECK-LABEL: func @fusion_inner_simple_scf
+func.func @fusion_inner_simple_scf(%A : memref<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %cst = arith.constant 0.0 : f32
+
+  scf.for %i = %c0 to %c100 step %c1 {
+    %B = memref.alloc() : memref<10xf32>
+    %C = memref.alloc() : memref<10xf32>
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %A[%j] : memref<10xf32>
+      affine.store %v, %B[%j] : memref<10xf32>
+    }
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %B[%j] : memref<10xf32>
+      affine.store %v, %C[%j] : memref<10xf32>
+    }
+  }
+  // CHECK:      scf.for
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10
+  // CHECK-NOT:  affine.for
+  return
+}
+
+// CHECK-LABEL: func @fusion_inner_multiple_nests
+func.func @fusion_inner_multiple_nests() {
+  %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<4x4xi8>
+  %alloc_10 = memref.alloc() : memref<8x4xi32>
+  affine.for %arg8 = 0 to 4 {
+    %alloc_14 = memref.alloc() : memref<4xi8>
+    %alloc_15 = memref.alloc() : memref<8x4xi8>
+    affine.for %arg9 = 0 to 4 {
+      %0 = affine.load %alloc_5[%arg9, %arg8] : memref<4x4xi8>
+      affine.store %0, %alloc_14[%arg9] : memref<4xi8>
+    }
+    %alloc_16 = memref.alloc() : memref<4xi8>
+    affine.for %arg9 = 0 to 4 {
+      %0 = affine.load %alloc_14[%arg9] : memref<4xi8>
+      affine.store %0, %alloc_16[%arg9] : memref<4xi8>
+    }
+    affine.for %arg9 = 0 to 2 {
+      %0 = affine.load %alloc_15[%arg9 * 4, 0] : memref<8x4xi8>
+      %1 = affine.load %alloc_16[0] : memref<4xi8>
+      %2 = affine.load %alloc_10[%arg9 * 4, %arg8] : memref<8x4xi32>
+      %3 = arith.muli %0, %1 : i8
+      %4 = arith.extsi %3 : i8 to i32
+      %5 = arith.addi %4, %2 : i32
+      affine.store %5, %alloc_10[%arg9 * 4 + 3, %arg8] : memref<8x4xi32>
+    }
+    memref.dealloc %alloc_16 : memref<4xi8>
+  }
+  // CHECK:      affine.for %{{.*}} = 0 to 4 {
+  // Everything inside is fused into two nests (the second of which will be
+  // DCE'd).
+  // CHECK-NEXT:   memref.alloc() : memref<4xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<1xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<1xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<8x4xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<4xi8>
+  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 2 {
+  // CHECK:        }
+  // CHECK:        affine.for %{{.*}} = 0 to 4 {
+  // CHECK:        }
+  // CHECK-NEXT:   memref.dealloc
+  // CHECK-NEXT: }
+  // CHECK-NEXT: return
+  return
+}
+
+// CHECK-LABEL: func @fusion_inside_scf_while
+func.func @fusion_inside_scf_while(%A : memref<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %cst = arith.constant 0.0 : f32
+
+  %0 = scf.while (%arg3 = %cst) : (f32) -> (f32) {
+    %1 = arith.cmpf ult, %arg3, %cst : f32
+    scf.condition(%1) %arg3 : f32
+  } do {
+  ^bb0(%arg5: f32):
+    %B = memref.alloc() : memref<10xf32>
+    %C = memref.alloc() : memref<10xf32>
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %A[%j] : memref<10xf32>
+      affine.store %v, %B[%j] : memref<10xf32>
+    }
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %B[%j] : memref<10xf32>
+      affine.store %v, %C[%j] : memref<10xf32>
+    }
+    %1 = arith.mulf %arg5, %cst : f32
+    scf.yield %1 : f32
+  }
+  // CHECK:     scf.while
+  // CHECK:       affine.for %{{.*}} = 0 to 10
+  // CHECK-NOT:   affine.for
+  // CHECK:     scf.yield
+  return
+}
+
+memref.global "private" constant @__constant_10x2xf32 : memref<10x2xf32> = dense<0.000000e+00>
+
+// CHECK-LABEL: func @fusion_inner_long
+func.func @fusion_inner_long(%arg0: memref<10x2xf32>, %arg1: memref<10x10xf32>, %arg2: memref<10x2xf32>, %s: index) {
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 1.000000e-03 : f32
+  %c9 = arith.constant 9 : index
+  %c10_i32 = arith.constant 10 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c100_i32 = arith.constant 100 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %0 = memref.get_global @__constant_10x2xf32 : memref<10x2xf32>
+  %1 = scf.for %arg3 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg4 = %arg0) -> (memref<10x2xf32>) : i32 {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xi32>
+    affine.for %arg5 = 0 to 10 {
+      %3 = arith.index_cast %arg5 : index to i32
+      affine.store %3, %alloc[%arg5] : memref<10xi32>
+    }
+    %2 = scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg6 = %0) -> (memref<10x2xf32>) : i32 {
+      %alloc_5 = memref.alloc() : memref<2xf32>
+      affine.for %arg7 = 0 to 2 {
+        %16 = affine.load %arg4[%s, %arg7] : memref<10x2xf32>
+        affine.store %16, %alloc_5[%arg7] : memref<2xf32>
+      }
+      %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32>
+      affine.for %arg7 = 0 to 2 {
+        %16 = affine.load %alloc_5[%arg7] : memref<2xf32>
+        affine.store %16, %alloc_6[0, %arg7] : memref<1x2xf32>
+      }
+      %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+      affine.for %arg7 = 0 to 10 {
+        affine.for %arg8 = 0 to 2 {
+          %16 = affine.load %alloc_6[0, %arg8] : memref<1x2xf32>
+          affine.store %16, %alloc_7[%arg7, %arg8] : memref<10x2xf32>
+        }
+      }
+      %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+      affine.for %arg7 = 0 to 10 {
+        affine.for %arg8 = 0 to 2 {
+          %16 = affine.load %alloc_7[%arg7, %arg8] : memref<10x2xf32>
+          %17 = affine.load %arg4[%arg7, %arg8] : memref<10x2xf32>
+          %18 = arith.subf %16, %17 : f32
+          affine.store %18, %alloc_8[%arg7, %arg8] : memref<10x2xf32>
+        }
+      }
+      scf.yield %alloc_8 : memref<10x2xf32>
+      // CHECK:      scf.for
+      // CHECK:        scf.for
+      // CHECK:          affine.for %{{.*}} = 0 to 10
+      // CHECK-NEXT:       affine.for %{{.*}} = 0 to 2
+      // CHECK-NOT:      affine.for
+      // CHECK:        scf.yield
+    }
+    %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+    affine.for %arg5 = 0 to 10 {
+      affine.for %arg6 = 0 to 2 {
+        affine.store %cst_0, %alloc_2[%arg5, %arg6] : memref<10x2xf32>
+      }
+    }
+    %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+    affine.for %arg5 = 0 to 10 {
+      affine.for %arg6 = 0 to 2 {
+        %3 = affine.load %alloc_2[%arg5, %arg6] : memref<10x2xf32>
+        %4 = affine.load %2[%arg5, %arg6] : memref<10x2xf32>
+        %5 = arith.mulf %3, %4 : f32
+        affine.store %5, %alloc_3[%arg5, %arg6] : memref<10x2xf32>
+      }
+    }
+    scf.yield %alloc_3 : memref<10x2xf32>
+    // The nests above will be fused as well.
+    // CHECK:      affine.for %{{.*}} = 0 to 10
+    // CHECK-NEXT:   affine.for %{{.*}} = 0 to 2
+    // CHECK-NOT:  affine.for
+    // CHECK:      scf.yield
+  }
+  affine.for %arg3 = 0 to 10 {
+    affine.for %arg4 = 0 to 2 {
+      %2 = affine.load %1[%arg3, %arg4] : memref<10x2xf32>
+      affine.store %2, %arg2[%arg3, %arg4] : memref<10x2xf32>
+    }
+  }
+  return
+}
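
Note for readers outside the MLIR tree: the offset-based lookup that the new
`findVar(Value val, unsigned *pos, unsigned offset)` overload performs amounts
to the scan sketched below. This is a minimal standalone illustration, with
plain `int` standing in for `mlir::Value` and a free function standing in for
the member function; it is not the in-tree implementation.

  #include <optional>
  #include <vector>

  // Sketch of findVar(val, pos, offset): scan only positions >= offset, so
  // duplicates that were already aligned to the left of `offset` are skipped.
  static bool findVar(const std::vector<std::optional<int>> &values, int val,
                      unsigned *pos, unsigned offset = 0) {
    for (unsigned i = offset, e = values.size(); i < e; ++i) {
      if (values[i] && *values[i] == val) {
        *pos = i;
        return true;
      }
    }
    return false;
  }

  int main() {
    // Two variables bound to the same value 7: duplicates are now permitted.
    std::vector<std::optional<int>> values = {std::nullopt, 7, 3, 7};
    unsigned pos = 0;
    findVar(values, 7, &pos);    // finds the first occurrence: pos == 1
    findVar(values, 7, &pos, 2); // starts past it, finds pos == 3
  }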