diff --git a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h
index f3c46f5efd73c..e4de5b0661571 100644
--- a/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h
+++ b/mlir/include/mlir/Analysis/FlatLinearValueConstraints.h
@@ -374,10 +374,10 @@ class FlatLinearValueConstraints : public FlatLinearConstraints {
       setValue(i, values[i - start]);
   }
 
-  /// Looks up the position of the variable with the specified Value. Returns
-  /// true if found (false otherwise). `pos` is set to the (column) position of
-  /// the variable.
-  bool findVar(Value val, unsigned *pos) const;
+  /// Looks up the position of the variable with the specified Value, starting
+  /// the search at position `offset`. Returns true if found (false otherwise).
+  /// `pos` is set to the (column) position of the variable.
+  bool findVar(Value val, unsigned *pos, unsigned offset = 0) const;
 
   /// Returns true if a variable with the specified Value exists, false
   /// otherwise.
diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
index 7376d9b992a04..3dea99cd6b3e5 100644
--- a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h
@@ -213,20 +213,20 @@ struct MemRefDependenceGraph {
 };
 
 /// Populates 'loops' with IVs of the affine.for ops surrounding 'op' ordered
-/// from the outermost 'affine.for' operation to the innermost one.
+/// from the outermost 'affine.for' operation to the innermost one while not
+/// traversing outside of the surrounding affine scope.
 void getAffineForIVs(Operation &op, SmallVectorImpl<AffineForOp> *loops);
 
 /// Populates 'ivs' with IVs of the surrounding affine.for and affine.parallel
-/// ops ordered from the outermost one to the innermost.
+/// ops ordered from the outermost one to the innermost while not traversing
+/// outside of the surrounding affine scope.
 void getAffineIVs(Operation &op, SmallVectorImpl<Value> &ivs);
 
 /// Populates 'ops' with affine operations enclosing `op` ordered from outermost
-/// to innermost. affine.for, affine.if, or affine.parallel ops comprise such
-/// surrounding affine ops.
-/// TODO: Change this to return a list of enclosing ops up until the op that
-/// starts an `AffineScope`. In such a case, `ops` is guaranteed by design to
-/// have a successive chain of affine parent ops, and this is primarily what is
-/// needed for most analyses.
+/// to innermost while stopping at the boundary of the affine scope. affine.for,
+/// affine.if, or affine.parallel ops comprise such surrounding affine ops.
+/// `ops` is guaranteed by design to have a successive chain of affine parent
+/// ops.
 void getEnclosingAffineOps(Operation &op, SmallVectorImpl<Operation *> *ops);
 
 /// Returns the nesting depth of this operation, i.e., the number of loops
diff --git a/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
index 284d394ddceff..4cec777f9888c 100644
--- a/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
+++ b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h
@@ -110,7 +110,6 @@ class FusionStrategy {
 /// returns a FusionResult explaining why fusion is not feasible.
 /// NOTE: This function is not feature complete and should only be used in
 /// testing.
-/// TODO: Update comments when this function is fully implemented.
 FusionResult canFuseLoops(AffineForOp srcForOp, AffineForOp dstForOp,
                           unsigned dstLoopDepth,
                           ComputationSliceState *srcSlice,
diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
index 382d05f3b2d48..d76e94b7070e8 100644
--- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
+++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp
@@ -958,15 +958,15 @@ areVarsUnique(const FlatLinearValueConstraints &cst, VarKind kind) {
 /// so that they have the union of all variables, with A's original
 /// variables appearing first followed by any of B's variables that didn't
 /// appear in A. Local variables in B that have the same division
-/// representation as local variables in A are merged into one.
+/// representation as local variables in A are merged into one. We allow A
+/// and B to have non-unique values for their variables; in such cases,
+/// duplicates are aligned with each other in their order of appearance, from
+/// left to right.
 // E.g.: Input: A has ((%i, %j) [%M, %N]) and B has ((%k, %j) [%P, %N, %M])
 //       Output: both A, B have (%i, %j, %k) [%M, %N, %P]
 static void mergeAndAlignVars(unsigned offset, FlatLinearValueConstraints *a,
                               FlatLinearValueConstraints *b) {
   assert(offset <= a->getNumDimVars() && offset <= b->getNumDimVars());
-  // A merge/align isn't meaningful if a cst's vars aren't distinct.
-  assert(areVarsUnique(*a) && "A's values aren't unique");
-  assert(areVarsUnique(*b) && "B's values aren't unique");
 
   assert(llvm::all_of(
       llvm::drop_begin(a->getMaybeValues(), offset),
@@ -982,9 +982,12 @@ static void mergeAndAlignVars(unsigned offset, FlatLinearValueConstraints *a,
   {
     // Merge dims from A into B.
     unsigned d = offset;
-    for (auto aDimValue : aDimValues) {
+    for (Value aDimValue : aDimValues) {
       unsigned loc;
-      if (b->findVar(aDimValue, &loc)) {
+      // Search from position `d` since multiple variables may be bound to the
+      // same `Value`; we align with the next one that appears.
+      if (b->findVar(aDimValue, &loc, d)) {
         assert(loc >= offset && "A's dim appears in B's aligned range");
         assert(loc < b->getNumDimVars() &&
                "A's dim appears in B's non-dim position");
@@ -1017,15 +1020,12 @@ void FlatLinearValueConstraints::mergeAndAlignVarsWithOther(
 }
 
 /// Merge and align symbols of `this` and `other` such that both get union of
-/// of symbols that are unique. Symbols in `this` and `other` should be
-/// unique. Symbols with Value as `None` are considered to be inequal to all
-/// other symbols.
+/// symbols. Existing symbols need not be unique; they are aligned from left to
+/// right, with duplicates aligned in the same order. Symbols with Value as
+/// `None` are considered to be unequal to all other symbols.
 void FlatLinearValueConstraints::mergeSymbolVars(
     FlatLinearValueConstraints &other) {
-  assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique");
-  assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique");
-
   SmallVector<Value, 4> aSymValues;
   getValues(getNumDimVars(), getNumDimAndSymbolVars(), &aSymValues);
 
@@ -1034,8 +1034,9 @@ void FlatLinearValueConstraints::mergeSymbolVars(
   for (Value aSymValue : aSymValues) {
     unsigned loc;
     // If the var is a symbol in `other`, then align it, otherwise assume that
-    // it is a new symbol
+    // it is a new symbol. Search `other` starting at position `s`, since
+    // everything to the left of `s` has already been aligned.
+    if (other.findVar(aSymValue, &loc, s) && loc >= other.getNumDimVars() &&
         loc < other.getNumDimAndSymbolVars())
       other.swapVar(s, loc);
     else
@@ -1051,8 +1052,6 @@ void FlatLinearValueConstraints::mergeSymbolVars(
 
   assert(getNumSymbolVars() == other.getNumSymbolVars() &&
          "expected same number of symbols");
-  assert(areVarsUnique(*this, VarKind::Symbol) && "Symbol vars are not unique");
-  assert(areVarsUnique(other, VarKind::Symbol) && "Symbol vars are not unique");
 }
 
 bool FlatLinearValueConstraints::hasConsistentState() const {
@@ -1104,9 +1103,11 @@ FlatLinearValueConstraints::computeAlignedMap(AffineMap map,
   return alignedMap;
 }
 
-bool FlatLinearValueConstraints::findVar(Value val, unsigned *pos) const {
-  unsigned i = 0;
-  for (const auto &mayBeVar : values) {
+bool FlatLinearValueConstraints::findVar(Value val, unsigned *pos,
+                                         unsigned offset) const {
+  unsigned i = offset;
+  for (const auto &mayBeVar :
+       ArrayRef<std::optional<Value>>(values).drop_front(offset)) {
     if (mayBeVar && *mayBeVar == val) {
       *pos = i;
       return true;
diff --git a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
index 9d2998456a4b6..d9b1050c83e26 100644
--- a/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/Utils.cpp
@@ -199,10 +199,15 @@ bool MemRefDependenceGraph::init() {
         continue;
       SmallVector<AffineForOp, 4> loops;
       getAffineForIVs(*user, &loops);
-      if (loops.empty())
+      // Find the surrounding affine.for nested immediately within the
+      // block.
+      auto *it = llvm::find_if(loops, [&](AffineForOp loop) {
+        return loop->getBlock() == &block;
+      });
+      if (it == loops.end())
         continue;
-      assert(forToNodeMap.count(loops[0]) > 0 && "missing mapping");
-      unsigned userLoopNestId = forToNodeMap[loops[0]];
+      assert(forToNodeMap.count(*it) > 0 && "missing mapping");
+      unsigned userLoopNestId = forToNodeMap[*it];
       addEdge(node.id, userLoopNestId, value);
     }
   }
@@ -631,8 +636,8 @@ void mlir::affine::getAffineForIVs(Operation &op,
   AffineForOp currAffineForOp;
   // Traverse up the hierarchy collecting all 'affine.for' operation while
   // skipping over 'affine.if' operations.
-  while (currOp) {
-    if (AffineForOp currAffineForOp = dyn_cast<AffineForOp>(currOp))
+  while (currOp && !currOp->hasTrait<OpTrait::AffineScope>()) {
+    if (auto currAffineForOp = dyn_cast<AffineForOp>(currOp))
       loops->push_back(currAffineForOp);
     currOp = currOp->getParentOp();
   }
@@ -646,7 +651,7 @@ void mlir::affine::getEnclosingAffineOps(Operation &op,
 
   // Traverse up the hierarchy collecting all `affine.for`, `affine.if`, and
   // affine.parallel operations.
-  while (currOp) {
+  while (currOp && !currOp->hasTrait<OpTrait::AffineScope>()) {
     if (isa<AffineForOp, AffineIfOp, AffineParallelOp>(currOp))
       ops->push_back(currOp);
     currOp = currOp->getParentOp();
diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
index 2629f5a511cec..66d921b4889f5 100644
--- a/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/LoopFusion.cpp
@@ -896,6 +896,16 @@ struct GreedyFusion {
         if (fusedLoopInsPoint == nullptr)
           continue;
 
+        // It's possible that this fusion is at an inner depth (i.e., there
+        // are common surrounding affine loops for the source and destination
+        // for ops). We need this number because canFuseLoops must be passed
+        // the absolute depth; the max legal depth and the depths we try below
+        // are *relative* and as such don't include the common depth.
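+        // For example, if the source and destination nests sit under two
+        // common surrounding loops (numSurroundingLoops = 2), the relative
+        // depth i = 1 tried below corresponds to an absolute dstLoopDepth
+        // of 3.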
+        SmallVector<AffineForOp, 4> surroundingLoops;
+        getAffineForIVs(*dstAffineForOp, &surroundingLoops);
+        unsigned numSurroundingLoops = surroundingLoops.size();
+
         // Compute the innermost common loop depth for dstNode
         // producer-consumer loads/stores.
         SmallVector<Operation *, 2> dstMemrefOps;
@@ -907,7 +917,8 @@ struct GreedyFusion {
           if (producerConsumerMemrefs.count(
                   cast<AffineWriteOpInterface>(op).getMemRef()))
             dstMemrefOps.push_back(op);
-        unsigned dstLoopDepthTest = getInnermostCommonLoopDepth(dstMemrefOps);
+        unsigned dstLoopDepthTest =
+            getInnermostCommonLoopDepth(dstMemrefOps) - numSurroundingLoops;
 
         // Check the feasibility of fusing src loop nest into dst loop nest
         // at loop depths in range [1, dstLoopDepthTest].
@@ -916,9 +927,10 @@ struct GreedyFusion {
         depthSliceUnions.resize(dstLoopDepthTest);
         FusionStrategy strategy(FusionStrategy::ProducerConsumer);
         for (unsigned i = 1; i <= dstLoopDepthTest; ++i) {
-          FusionResult result = affine::canFuseLoops(
-              srcAffineForOp, dstAffineForOp,
-              /*dstLoopDepth=*/i, &depthSliceUnions[i - 1], strategy);
+          FusionResult result =
+              affine::canFuseLoops(srcAffineForOp, dstAffineForOp,
+                                   /*dstLoopDepth=*/i + numSurroundingLoops,
+                                   &depthSliceUnions[i - 1], strategy);
 
           if (result.value == FusionResult::Success)
             maxLegalFusionDepth = i;
@@ -1125,9 +1137,18 @@ struct GreedyFusion {
       SmallVector<Operation *, 2> dstLoadOpInsts;
       dstNode->getLoadOpsForMemref(memref, &dstLoadOpInsts);
 
+      // It's possible that this fusion is at an inner depth (i.e., there are
+      // common surrounding affine loops for the source and destination for
+      // ops). We need this number because canFuseLoops must be passed the
+      // absolute depth; the max legal depth and the depths we try below are
+      // *relative* and as such don't include the common depth.
+      SmallVector<AffineForOp, 4> surroundingLoops;
+      getAffineForIVs(*dstAffineForOp, &surroundingLoops);
+      unsigned numSurroundingLoops = surroundingLoops.size();
       SmallVector<AffineForOp, 4> dstLoopIVs;
       getAffineForIVs(*dstLoadOpInsts[0], &dstLoopIVs);
-      unsigned dstLoopDepthTest = dstLoopIVs.size();
+      unsigned dstLoopDepthTest = dstLoopIVs.size() - numSurroundingLoops;
       auto sibAffineForOp = cast<AffineForOp>(sibNode->op);
 
       // Compute loop depth and slice union for fusion.
@@ -1136,14 +1157,18 @@ struct GreedyFusion {
       unsigned maxLegalFusionDepth = 0;
       FusionStrategy strategy(memref);
       for (unsigned i = 1; i <= dstLoopDepthTest; ++i) {
-        FusionResult result = affine::canFuseLoops(
-            sibAffineForOp, dstAffineForOp,
-            /*dstLoopDepth=*/i, &depthSliceUnions[i - 1], strategy);
+        FusionResult result =
+            affine::canFuseLoops(sibAffineForOp, dstAffineForOp,
+                                 /*dstLoopDepth=*/i + numSurroundingLoops,
+                                 &depthSliceUnions[i - 1], strategy);
 
         if (result.value == FusionResult::Success)
           maxLegalFusionDepth = i;
       }
 
+      LLVM_DEBUG(llvm::dbgs() << "Max legal depth for fusion: "
+                              << maxLegalFusionDepth << '\n');
+
       // Skip if fusion is not feasible at any loop depths.
       if (maxLegalFusionDepth == 0)
         continue;
@@ -1238,9 +1263,15 @@ struct GreedyFusion {
         SmallVector<AffineForOp, 4> loops;
         getAffineForIVs(*user, &loops);
         // Skip 'use' if it is not within a loop nest.
-        if (loops.empty())
+        // Find the surrounding affine.for nested immediately within the
+        // block.
+        auto *it = llvm::find_if(loops, [&](AffineForOp loop) {
+          return loop->getBlock() == &mdg->block;
+        });
+        // Skip 'use' if it is not within a loop nest in `block`.
+        if (it == loops.end())
          continue;
-        Node *sibNode = mdg->getForOpNode(loops[0]);
+        Node *sibNode = mdg->getForOpNode(*it);
         assert(sibNode != nullptr);
         // Skip 'use' if it not a sibling to 'dstNode'.
         if (sibNode->id == dstNode->id)
@@ -1373,9 +1404,17 @@ void LoopFusion::runOnBlock(Block *block) {
 }
 
 void LoopFusion::runOnOperation() {
-  for (Region &region : getOperation()->getRegions())
-    for (Block &block : region.getBlocks())
-      runOnBlock(&block);
+  // Call fusion on every block that holds at least two affine.for nests (in
+  // post order).
+  getOperation()->walk([&](Operation *op) {
+    for (Region &region : op->getRegions()) {
+      for (Block &block : region.getBlocks()) {
+        auto affineFors = block.getOps<AffineForOp>();
+        if (!affineFors.empty() && !llvm::hasSingleElement(affineFors))
+          runOnBlock(&block);
+      }
+    }
+  });
 }
 
 std::unique_ptr<Pass> mlir::affine::createLoopFusionPass(
diff --git a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
index 5053b08ee0834..77ee1d4c7a02d 100644
--- a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
+++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp
@@ -243,7 +243,6 @@ static unsigned getMaxLoopDepth(ArrayRef<Operation *> srcOps,
   return loopDepth;
 }
 
-// TODO: Prevent fusion of loop nests with side-effecting operations.
 // TODO: This pass performs some computation that is the same for all the depths
 // (e.g., getMaxLoopDepth). Implement a version of this utility that processes
 // all the depths at once or only the legal maximal depth for maximal fusion.
diff --git a/mlir/test/Dialect/Affine/loop-fusion-inner.mlir b/mlir/test/Dialect/Affine/loop-fusion-inner.mlir
new file mode 100644
index 0000000000000..61af9a4baf46d
--- /dev/null
+++ b/mlir/test/Dialect/Affine/loop-fusion-inner.mlir
@@ -0,0 +1,225 @@
+// RUN: mlir-opt -pass-pipeline='builtin.module(func.func(affine-loop-fusion{mode=producer fusion-maximal}))' %s | FileCheck %s
+
+// Test fusion of affine nests inside other region-holding ops (scf.for in the
+// test case below).
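+//
+// Note: the pass now walks the enclosing op and invokes fusion on every block
+// holding at least two affine.for nests, so nests under non-affine
+// region-holding ops (scf.for and scf.while below) are considered as well.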
+
+// CHECK-LABEL: func @fusion_inner_simple
+func.func @fusion_inner_simple(%A : memref<10xf32>) {
+  %cst = arith.constant 0.0 : f32
+
+  affine.for %i = 0 to 100 {
+    %B = memref.alloc() : memref<10xf32>
+    %C = memref.alloc() : memref<10xf32>
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %A[%j] : memref<10xf32>
+      affine.store %v, %B[%j] : memref<10xf32>
+    }
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %B[%j] : memref<10xf32>
+      affine.store %v, %C[%j] : memref<10xf32>
+    }
+  }
+
+  // CHECK:      affine.for %{{.*}} = 0 to 100
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10
+  // CHECK-NOT:  affine.for
+
+  return
+}
+
+// CHECK-LABEL: func @fusion_inner_simple_scf
+func.func @fusion_inner_simple_scf(%A : memref<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %cst = arith.constant 0.0 : f32
+
+  scf.for %i = %c0 to %c100 step %c1 {
+    %B = memref.alloc() : memref<10xf32>
+    %C = memref.alloc() : memref<10xf32>
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %A[%j] : memref<10xf32>
+      affine.store %v, %B[%j] : memref<10xf32>
+    }
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %B[%j] : memref<10xf32>
+      affine.store %v, %C[%j] : memref<10xf32>
+    }
+  }
+  // CHECK:      scf.for
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   memref.alloc
+  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 10
+  // CHECK-NOT:  affine.for
+  return
+}
+
+// CHECK-LABEL: func @fusion_inner_multiple_nests
+func.func @fusion_inner_multiple_nests() {
+  %alloc_5 = memref.alloc() {alignment = 64 : i64} : memref<4x4xi8>
+  %alloc_10 = memref.alloc() : memref<8x4xi32>
+  affine.for %arg8 = 0 to 4 {
+    %alloc_14 = memref.alloc() : memref<4xi8>
+    %alloc_15 = memref.alloc() : memref<8x4xi8>
+    affine.for %arg9 = 0 to 4 {
+      %0 = affine.load %alloc_5[%arg9, %arg8] : memref<4x4xi8>
+      affine.store %0, %alloc_14[%arg9] : memref<4xi8>
+    }
+    %alloc_16 = memref.alloc() : memref<4xi8>
+    affine.for %arg9 = 0 to 4 {
+      %0 = affine.load %alloc_14[%arg9] : memref<4xi8>
+      affine.store %0, %alloc_16[%arg9] : memref<4xi8>
+    }
+    affine.for %arg9 = 0 to 2 {
+      %0 = affine.load %alloc_15[%arg9 * 4, 0] : memref<8x4xi8>
+      %1 = affine.load %alloc_16[0] : memref<4xi8>
+      %2 = affine.load %alloc_10[%arg9 * 4, %arg8] : memref<8x4xi32>
+      %3 = arith.muli %0, %1 : i8
+      %4 = arith.extsi %3 : i8 to i32
+      %5 = arith.addi %4, %2 : i32
+      affine.store %5, %alloc_10[%arg9 * 4 + 3, %arg8] : memref<8x4xi32>
+    }
+    memref.dealloc %alloc_16 : memref<4xi8>
+  }
+  // CHECK:      affine.for %{{.*}} = 0 to 4 {
+  // Everything inside is fused into two nests (the second of which will be
+  // DCE'd).
+  // CHECK-NEXT:   memref.alloc() : memref<4xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<1xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<1xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<8x4xi8>
+  // CHECK-NEXT:   memref.alloc() : memref<4xi8>
+  // CHECK-NEXT:   affine.for %{{.*}} = 0 to 2 {
+  // CHECK:        }
+  // CHECK:        affine.for %{{.*}} = 0 to 4 {
+  // CHECK:        }
+  // CHECK-NEXT:   memref.dealloc
+  // CHECK-NEXT: }
+  // CHECK-NEXT: return
+  return
+}
+
+// CHECK-LABEL: func @fusion_inside_scf_while
+func.func @fusion_inside_scf_while(%A : memref<10xf32>) {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c100 = arith.constant 100 : index
+  %cst = arith.constant 0.0 : f32
+
+  %0 = scf.while (%arg3 = %cst) : (f32) -> (f32) {
+    %1 = arith.cmpf ult, %arg3, %cst : f32
+    scf.condition(%1) %arg3 : f32
+  } do {
+  ^bb0(%arg5: f32):
+    %B = memref.alloc() : memref<10xf32>
+    %C = memref.alloc() : memref<10xf32>
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %A[%j] : memref<10xf32>
+      affine.store %v, %B[%j] : memref<10xf32>
+    }
+
+    affine.for %j = 0 to 10 {
+      %v = affine.load %B[%j] : memref<10xf32>
+      affine.store %v, %C[%j] : memref<10xf32>
+    }
+    %1 = arith.mulf %arg5, %cst : f32
+    scf.yield %1 : f32
+  }
+  // CHECK:     scf.while
+  // CHECK:       affine.for %{{.*}} = 0 to 10
+  // CHECK-NOT:   affine.for
+  // CHECK:     scf.yield
+  return
+}
+
+memref.global "private" constant @__constant_10x2xf32 : memref<10x2xf32> = dense<0.000000e+00>
+
+// CHECK-LABEL: func @fusion_inner_long
+func.func @fusion_inner_long(%arg0: memref<10x2xf32>, %arg1: memref<10x10xf32>, %arg2: memref<10x2xf32>, %s: index) {
+  %c0 = arith.constant 0 : index
+  %cst_0 = arith.constant 1.000000e-03 : f32
+  %c9 = arith.constant 9 : index
+  %c10_i32 = arith.constant 10 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c100_i32 = arith.constant 100 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %0 = memref.get_global @__constant_10x2xf32 : memref<10x2xf32>
+  %1 = scf.for %arg3 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg4 = %arg0) -> (memref<10x2xf32>) : i32 {
+    %alloc = memref.alloc() {alignment = 64 : i64} : memref<10xi32>
+    affine.for %arg5 = 0 to 10 {
+      %3 = arith.index_cast %arg5 : index to i32
+      affine.store %3, %alloc[%arg5] : memref<10xi32>
+    }
+    %2 = scf.for %arg5 = %c0_i32 to %c10_i32 step %c1_i32 iter_args(%arg6 = %0) -> (memref<10x2xf32>) : i32 {
+      %alloc_5 = memref.alloc() : memref<2xf32>
+      affine.for %arg7 = 0 to 2 {
+        %16 = affine.load %arg4[%s, %arg7] : memref<10x2xf32>
+        affine.store %16, %alloc_5[%arg7] : memref<2xf32>
+      }
+      %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<1x2xf32>
+      affine.for %arg7 = 0 to 2 {
+        %16 = affine.load %alloc_5[%arg7] : memref<2xf32>
+        affine.store %16, %alloc_6[0, %arg7] : memref<1x2xf32>
+      }
+      %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+      affine.for %arg7 = 0 to 10 {
+        affine.for %arg8 = 0 to 2 {
+          %16 = affine.load %alloc_6[0, %arg8] : memref<1x2xf32>
+          affine.store %16, %alloc_7[%arg7, %arg8] : memref<10x2xf32>
+        }
+      }
+      %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+      affine.for %arg7 = 0 to 10 {
+        affine.for %arg8 = 0 to 2 {
+          %16 = affine.load %alloc_7[%arg7, %arg8] : memref<10x2xf32>
+          %17 = affine.load %arg4[%arg7, %arg8] : memref<10x2xf32>
+          %18 = arith.subf %16, %17 : f32
+          affine.store %18, %alloc_8[%arg7, %arg8] : memref<10x2xf32>
+        }
+      }
+      scf.yield %alloc_8 : memref<10x2xf32>
+      // CHECK:      scf.for
+      // CHECK:        scf.for
+      // CHECK:          affine.for %{{.*}} = 0 to 10
+      // CHECK-NEXT:       affine.for %{{.*}} = 0 to 2
+      // CHECK-NOT:      affine.for
+      // CHECK:        scf.yield
+    }
+    %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+    affine.for %arg5 = 0 to 10 {
+      affine.for %arg6 = 0 to 2 {
+        affine.store %cst_0, %alloc_2[%arg5, %arg6] : memref<10x2xf32>
+      }
+    }
+    %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<10x2xf32>
+    affine.for %arg5 = 0 to 10 {
+      affine.for %arg6 = 0 to 2 {
+        %3 = affine.load %alloc_2[%arg5, %arg6] : memref<10x2xf32>
+        %4 = affine.load %2[%arg5, %arg6] : memref<10x2xf32>
+        %5 = arith.mulf %3, %4 : f32
+        affine.store %5, %alloc_3[%arg5, %arg6] : memref<10x2xf32>
+      }
+    }
+    scf.yield %alloc_3 : memref<10x2xf32>
+    // The nests above will be fused as well.
+    // CHECK:      affine.for %{{.*}} = 0 to 10
+    // CHECK-NEXT:   affine.for %{{.*}} = 0 to 2
+    // CHECK-NOT:  affine.for
+    // CHECK:      scf.yield
+  }
+  affine.for %arg3 = 0 to 10 {
+    affine.for %arg4 = 0 to 2 {
+      %2 = affine.load %1[%arg3, %arg4] : memref<10x2xf32>
+      affine.store %2, %arg2[%arg3, %arg4] : memref<10x2xf32>
+    }
+  }
+  return
+}
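
Note for readers outside the MLIR tree: the offset-based lookup that the new
`findVar(Value val, unsigned *pos, unsigned offset)` overload performs amounts
to the scan sketched below. This is a minimal standalone illustration, with
plain `int` standing in for `mlir::Value` and a free function standing in for
the member function; it is not the in-tree implementation.

  #include <optional>
  #include <vector>

  // Sketch of findVar(val, pos, offset): scan only positions >= offset, so
  // duplicates that were already aligned to the left of `offset` are skipped.
  static bool findVar(const std::vector<std::optional<int>> &values, int val,
                      unsigned *pos, unsigned offset = 0) {
    for (unsigned i = offset, e = values.size(); i < e; ++i) {
      if (values[i] && *values[i] == val) {
        *pos = i;
        return true;
      }
    }
    return false;
  }

  int main() {
    // Two variables bound to the same value 7: duplicates are now permitted.
    std::vector<std::optional<int>> values = {std::nullopt, 7, 3, 7};
    unsigned pos = 0;
    findVar(values, 7, &pos);    // finds the first occurrence: pos == 1
    findVar(values, 7, &pos, 2); // starts past it, finds pos == 3
  }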