diff --git a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp index 5c94ec2985c3d..b58bf3f271d47 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopUtils.cpp @@ -1774,23 +1774,37 @@ findHighestBlockForPlacement(const MemRefRegion ®ion, Block &block, SmallVector symbols; cst->getValues(cst->getNumDimVars(), cst->getNumDimAndSymbolVars(), &symbols); - SmallVector enclosingFors; - getAffineForIVs(*block.begin(), &enclosingFors); + SmallVector enclosingAffineOps; + getEnclosingAffineOps(*block.begin(), &enclosingAffineOps); // Walk up loop parents till we find an IV on which this region is - // symbolic/variant. - auto it = enclosingFors.rbegin(); - for (auto e = enclosingFors.rend(); it != e; ++it) { + // symbolic/variant or we hit `hoistGuard`. + auto it = enclosingAffineOps.rbegin(); + AffineForOp lastInvariantFor; + for (auto e = enclosingAffineOps.rend(); it != e; ++it) { + Operation *enclosingOp = *it; + // We can't hoist past the definition of the memref being copied. + Value memref = region.memref; + if (!memref.getParentRegion()->isAncestor(enclosingOp->getParentRegion())) { + LLVM_DEBUG( + llvm::dbgs() + << "memref definition will end up not dominating hoist location\n"); + break; + } + + auto affineFor = dyn_cast(enclosingOp); + if (!affineFor) + break; // TODO: also need to be checking this for regions symbols that // aren't loop IVs, whether we are within their resp. defs' dominance scope. - if (llvm::is_contained(symbols, it->getInductionVar())) + if (llvm::is_contained(symbols, affineFor.getInductionVar())) break; + lastInvariantFor = affineFor; } - if (it != enclosingFors.rbegin()) { - auto lastInvariantIV = *std::prev(it); - *copyInPlacementStart = Block::iterator(lastInvariantIV.getOperation()); + if (it != enclosingAffineOps.rbegin()) { + *copyInPlacementStart = Block::iterator(lastInvariantFor); *copyOutPlacementStart = std::next(*copyInPlacementStart); - *copyPlacementBlock = lastInvariantIV->getBlock(); + *copyPlacementBlock = lastInvariantFor->getBlock(); } else { *copyInPlacementStart = begin; *copyOutPlacementStart = end; diff --git a/mlir/test/Dialect/Affine/affine-data-copy.mlir b/mlir/test/Dialect/Affine/affine-data-copy.mlir index 1f3aac3401bea..a1f0d952e7c63 100644 --- a/mlir/test/Dialect/Affine/affine-data-copy.mlir +++ b/mlir/test/Dialect/Affine/affine-data-copy.mlir @@ -8,6 +8,7 @@ // affine.load op in the innermost loop as a filter. // RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER // RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION +// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='capacity-kib=32' | FileCheck %s --check-prefix=LIMITED-MEM // -copy-skip-non-stride-loops forces the copies to be placed right inside the // tile space loops, avoiding the sensitivity of copy placement depth to memory @@ -23,6 +24,7 @@ // CHECK-LABEL: func @matmul // FILTER-LABEL: func @matmul +// LIMITED-MEM-LABEL: func @matmul func.func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> { affine.for %i = 0 to 4096 step 128 { affine.for %j = 0 to 4096 step 128 { @@ -43,6 +45,7 @@ func.func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memr } } return %C : memref<4096x4096xf32> + // LIMITED-MEM: return } // Buffers of size 128x128 get created here for all three matrices. @@ -421,6 +424,7 @@ func.func @scalar_memref_copy_in_loop(%3:memref<480xi1>) { } // CHECK-LABEL: func @memref_def_inside +// LIMITED-MEM-LABEL: func @memref_def_inside func.func @memref_def_inside(%arg0: index) { %0 = llvm.mlir.constant(1.000000e+00 : f32) : f32 // No copy generation can happen at this depth given the definition inside. @@ -429,5 +433,17 @@ func.func @memref_def_inside(%arg0: index) { // CHECK: affine.store {{.*}} : memref<1xf32> affine.store %0, %alloc_7[0] : memref<1xf32> } + + // With the limited capacity specified, buffer generation happens at the + // innermost depth. Tests that copy-placement is proper and respects the + // memref definition. + + // LIMITED-MEM: affine.for %{{.*}} = 0 to 29 + // LIMITED-MEM-NEXT: memref.alloc() : memref<1xf32> + // LIMITED-MEM-NEXT: memref.alloc() : memref<1xf32> + // LIMITED-MEM-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> + // LIMITED-MEM-NEXT: affine.load %{{.*}}[%c0{{.*}}] : memref<1xf32> + // LIMITED-MEM-NEXT: affine.store %{{.*}}, %{{.*}}[0] : memref<1xf32> + // LIMITED-MEM-NEXT: memref.dealloc %{{.*}} : memref<1xf32> return } diff --git a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp index 404f34ebee17a..d6aaa6faf94cb 100644 --- a/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp +++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp @@ -53,6 +53,11 @@ struct TestAffineDataCopy *this, "for-memref-region", llvm::cl::desc("Test copy generation for a single memref region"), llvm::cl::init(false)}; + Option clCapacityLimit{ + *this, "capacity-kib", + llvm::cl::desc("Test copy generation enforcing a limit of capacity " + "(default: unlimited)"), + llvm::cl::init(UINT64_MAX)}; }; } // namespace @@ -73,24 +78,24 @@ void TestAffineDataCopy::runOnOperation() { auto loopNest = depthToLoops[0][0]; auto innermostLoop = depthToLoops[innermostLoopIdx][0]; AffineLoadOp load; + // For simplicity, these options are tested on the first memref being loaded + // from in the innermost loop. if (clMemRefFilter || clTestGenerateCopyForMemRegion) { - // Gather MemRef filter. For simplicity, we use the first loaded memref - // found in the innermost loop. for (auto &op : *innermostLoop.getBody()) { if (auto ld = dyn_cast(op)) { load = ld; break; } } + if (!load) + return; } - if (!load) - return; AffineCopyOptions copyOptions = {/*generateDma=*/false, /*slowMemorySpace=*/0, /*fastMemorySpace=*/0, /*tagMemorySpace=*/0, - /*fastMemCapacityBytes=*/32 * 1024 * 1024UL}; + /*fastMemCapacityBytes=*/clCapacityLimit}; DenseSet copyNests; if (clMemRefFilter) { if (failed(affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), @@ -103,6 +108,10 @@ void TestAffineDataCopy::runOnOperation() { return; if (failed(generateCopyForMemRegion(region, loopNest, copyOptions, result))) return; + } else if (failed(affineDataCopyGenerate(loopNest, copyOptions, + /*filterMemref=*/std::nullopt, + copyNests))) { + return; } // Promote any single iteration loops in the copy nests and simplify