From bad2ae08252a7d95e4655cf4fe080004a440ecf9 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang
Date: Mon, 17 Jun 2024 11:44:27 +0100
Subject: [PATCH 1/9] [mlir][linalg] Implement TilingInterface for winograd operations

To support conv2d input data of arbitrary size, implement TilingInterface
for the winograd operations. Before converting the winograd operations into
nested loops with matrix multiplies, first tile the conv2d input into
supported sizes.

Add a transform operation, structured.decompose_winograd_op, to decompose
winograd operations. Before applying this transform op, use tile_using_for
to tile the input data into supported sizes. The test case shows how to
tile and decompose the winograd operations.
---
 .../mlir/Dialect/Linalg/IR/LinalgOps.td       |  22 +-
 .../Linalg/TransformOps/LinalgTransformOps.td |  37 +++
 .../Dialect/Linalg/Transforms/Transforms.h    |  45 +++
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      | 216 +++++++++++++++
 .../TransformOps/LinalgTransformOps.cpp       |  41 +++
 .../Linalg/Transforms/WinogradConv2D.cpp      |  18 ++
 .../transform-tile-and-winograd-rewrite.mlir  | 260 ++++++++++++++++++
 7 files changed, 633 insertions(+), 6 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index a9007c8db3078..bc2df7082ef8e 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -154,8 +154,8 @@ def Linalg_SoftmaxOp : Linalg_Op<"softmax",
   let hasVerifier = 1;
 }
 
-def Linalg_WinogradFilterTransformOp :
-    Linalg_Op<"winograd_filter_transform", [AllElementTypesMatch<["filter", "output"]>]> {
+def Linalg_WinogradFilterTransformOp : Linalg_Op<"winograd_filter_transform",
+    [AllElementTypesMatch<["filter", "output"]>]> {
   let summary = "Winograd filter transform operator";
   let description = [{
     Winograd Conv2D algorithm will convert linalg Conv2D operator into batched
@@ -193,8 +193,13 @@ def Linalg_WinogradFilterTransformOp :
   let hasVerifier = 1;
 }
 
-def Linalg_WinogradInputTransformOp :
-    Linalg_Op<"winograd_input_transform", [AllElementTypesMatch<["input", "output"]>]> {
+def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform",
+    [AllElementTypesMatch<["input", "output"]>,
+     DeclareOpInterfaceMethods<TilingInterface,
+      ["getIterationDomain",
+       "getLoopIteratorTypes",
+       "getResultTilePosition",
+       "getTiledImplementation"]>]> {
   let summary = "Winograd input transform operator";
   let description = [{
     Winograd Conv2D algorithm will convert linalg Conv2D operator into batched
@@ -232,8 +237,13 @@ def Linalg_WinogradInputTransformOp :
   let hasVerifier = 1;
 }
 
-def Linalg_WinogradOutputTransformOp :
-    Linalg_Op<"winograd_output_transform", [AllElementTypesMatch<["value", "output"]>]> {
+def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform",
+    [AllElementTypesMatch<["value", "output"]>,
+     DeclareOpInterfaceMethods<TilingInterface,
+      ["getIterationDomain",
+       "getLoopIteratorTypes",
+       "getResultTilePosition",
+       "getTiledImplementation"]>]> {
   let summary = "Winograd output transform operator";
   let description = [{
     Winograd Conv2D algorithm will convert linalg Conv2D operator into batched

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index ecc86999006db..106f0d79d9792 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -2697,4 +2697,41 @@ def WinogradConv2DOp : Op<Transform_Dialect,
   }];
 }
 
+def DecomposeWinogradOp : Op<Transform_Dialect,
+    "structured.decompose_winograd_op",
+    [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
+     TransformOpInterface, TransformEachOpTrait,
+     ReportTrackingListenerFailuresOpTrait]> {
+  let description = [{
+    Decompose winograd operations. It converts the filter, input, and output
+    transform operations into combinations of equivalent scf, tensor, and
+    linalg operations. Before applying this transform op, users need to tile
+    the winograd transform operations into supported sizes.
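+
+    For example (a usage sketch mirroring the test added in this patch; the
+    handle `%0` is assumed to point to an already-tiled winograd transform
+    operation):
+
+    ```mlir
+    %1 = transform.structured.decompose_winograd_op %0
+      : (!transform.any_op) -> (!transform.any_op)
+    ```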
+
+    #### Return modes:
+
+    This operation fails if `target` is unsupported. Otherwise, the operation
+    succeeds and returns a handle to the sequence of operations that replaces
+    the original operation.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target);
+  let results = (outs TransformHandleTypeInterface:$transformed);
+
+  let assemblyFormat =
+    "$target attr-dict `:` functional-type($target, results)";
+
+  let builders = [
+    OpBuilder<(ins "Value":$target)>
+  ];
+
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::Operation *target,
+        ::mlir::transform::ApplyToEachResultList &results,
+        ::mlir::transform::TransformState &state);
+  }];
+}
+
 #endif // LINALG_TRANSFORM_OPS

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 477ef7bfafb18..89f907cfc7ade 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1316,6 +1316,51 @@ FailureOr<Operation *> winogradConv2D(RewriterBase &rewriter,
                                       linalg::Conv2DNhwcFhwcOp op, int64_t m,
                                       int64_t r);
 
+/// Rewrite linalg.winograd_filter_transform. The data layout of the filter is
+/// FHWC. The transformation matrices are 2-dimensional, so we extract an
+/// H x W slice from FHWC first and generate two levels of loops to iterate
+/// over F and C. After the rewriting, we get
+///
+/// scf.for %f = lo_f to hi_f step 1
+///   scf.for %c = lo_c to hi_c step 1
+///     %extracted = extract filter from filter
+///     %ret = linalg.matmul G, %extracted
+///     %ret = linalg.matmul %ret, GT
+///     %inserted = insert %ret into filter
+FailureOr<Operation *>
+decomposeWinogradFilterTransformOp(RewriterBase &rewriter,
+                                   linalg::WinogradFilterTransformOp op);
+
+/// Rewrite linalg.winograd_input_transform. The data layout of the input is
+/// NHWC. The transformation matrices are 2-dimensional, so we extract an
+/// H x W slice from NHWC first and generate two levels of loops to iterate
+/// over N and C. After the rewriting, we get
+///
+/// scf.for %n = lo_n to hi_n step 1
+///   scf.for %c = lo_c to hi_c step 1
+///     %extracted = extract input from input
+///     %ret = linalg.matmul BT, %extracted
+///     %ret = linalg.matmul %ret, B
+///     %inserted = insert %ret into input
+FailureOr<Operation *>
+decomposeWinogradInputTransformOp(RewriterBase &rewriter,
+                                  linalg::WinogradInputTransformOp op);
+
+/// Rewrite linalg.winograd_output_transform. The data layout of the output is
+/// HWNF. The transformation matrices are 2-dimensional, so we extract an
+/// H x W slice from HWNF first and generate two levels of loops to iterate
+/// over N and F. After the transformation, we get
+///
+/// scf.for %n = lo_n to hi_n step 1
+///   scf.for %f = lo_f to hi_f step 1
+///     %extracted = extract input from result
+///     %ret = linalg.matmul AT, %extracted
+///     %ret = linalg.matmul %ret, A
+///     %inserted = insert %ret into ret
+FailureOr<Operation *>
+decomposeWinogradOutputTransformOp(RewriterBase &rewriter,
+                                   linalg::WinogradOutputTransformOp op);
+
 //===----------------------------------------------------------------------===//
 // Rewrite patterns wrapping transformations.
 // TODO: every single such pattern should be a close to noop wrapper around a

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index a101552e419bc..47800dfc1d1a2 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2885,6 +2885,15 @@ LogicalResult WinogradFilterTransformOp::verify() {
 // WinogradInputTransformOp
 //===----------------------------------------------------------------------===//
 
+Value getValueFromOpFoldResult(OpFoldResult opFoldResult, OpBuilder &builder,
+                               Location loc) {
+  if (auto attr = opFoldResult.dyn_cast<Attribute>()) {
+    auto intAttr = cast<IntegerAttr>(attr);
+    return builder.create<arith::ConstantOp>(loc, intAttr);
+  }
+  return opFoldResult.get<Value>();
+}
+
 LogicalResult WinogradInputTransformOp::verify() {
   auto inputType = cast<ShapedType>(getInput().getType());
   ArrayRef<int64_t> inputShape = inputType.getShape();
@@ -2922,6 +2931,113 @@
   return success();
 }
 
+SmallVector<Range>
+WinogradInputTransformOp::getIterationDomain(OpBuilder &builder) {
+  Location loc = getLoc();
+  auto indexType = builder.getIndexType();
+  auto zeroAttr = builder.getIntegerAttr(indexType, 0);
+  auto oneAttr = builder.getIntegerAttr(indexType, 1);
+  Value output = getOutput();
+  SmallVector<Range> loopBounds(6);
+  for (unsigned dim = 0; dim < 6; ++dim) {
+    loopBounds[dim].offset = zeroAttr;
+    loopBounds[dim].size = getDimValue(builder, loc, output, dim);
+    loopBounds[dim].stride = oneAttr;
+  }
+  return loopBounds;
+}
+
+SmallVector<utils::IteratorType>
+WinogradInputTransformOp::getLoopIteratorTypes() {
+  SmallVector<utils::IteratorType> iteratorTypes(6,
+                                                 utils::IteratorType::parallel);
+  return iteratorTypes;
+}
+
+LogicalResult WinogradInputTransformOp::getResultTilePosition(
+    OpBuilder &builder, unsigned resultNumber, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes, SmallVector<OpFoldResult> &resultOffsets,
+    SmallVector<OpFoldResult> &resultSizes) {
+  auto zeroAttr = builder.getI64IntegerAttr(0);
+  auto oneAttr = builder.getI64IntegerAttr(1);
+
+  resultOffsets.push_back(zeroAttr);
+  resultOffsets.push_back(zeroAttr);
+  resultOffsets.push_back(offsets[2]);
+  resultOffsets.push_back(offsets[3]);
+  resultOffsets.push_back(zeroAttr);
+  resultOffsets.push_back(zeroAttr);
+  resultSizes.push_back(sizes[0]);
+  resultSizes.push_back(sizes[1]);
+  resultSizes.push_back(oneAttr);
+  resultSizes.push_back(oneAttr);
+  resultSizes.push_back(sizes[4]);
+  resultSizes.push_back(sizes[5]);
+
+  return success();
+}
+
+FailureOr<TilingResult>
+WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
+                                                 ArrayRef<OpFoldResult> offsets,
+                                                 ArrayRef<OpFoldResult> sizes) {
+  auto oneAttr = builder.getI64IntegerAttr(1);
+  auto zeroAttr = builder.getI64IntegerAttr(0);
+  Value input = getInput();
+  auto inputType = cast<ShapedType>(input.getType());
+  auto inputShape = inputType.getShape();
+  int64_t inputH = inputShape[1];
+  int64_t inputW = inputShape[2];
+  int64_t m = getM();
+  int64_t r = getR();
+  int64_t alpha = m + r - 1;
+  int64_t alphaH = inputH != 1 ? alpha : 1;
+  int64_t alphaW = inputW != 1 ? alpha : 1;
+  auto alphaHAttr = builder.getI64IntegerAttr(alphaH);
+  auto alphaWAttr = builder.getI64IntegerAttr(alphaW);
+
+  Location loc = getLoc();
+  SmallVector<Value> tiledOperands;
+  SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
+
+  auto context = builder.getContext();
+  auto affineMap =
+      AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context);
+  Value mappedOffset1 = builder.create<affine::AffineApplyOp>(
+      loc, affineMap, getValueFromOpFoldResult(offsets[2], builder, loc));
+  Value mappedOffset2 = builder.create<affine::AffineApplyOp>(
+      loc, affineMap, getValueFromOpFoldResult(offsets[3], builder, loc));
+
+  sliceOffsets.push_back(zeroAttr);
+  sliceOffsets.push_back(mappedOffset1);
+  sliceOffsets.push_back(mappedOffset2);
+  sliceOffsets.push_back(zeroAttr);
+  sliceSizes.push_back(sizes[4]);
+  sliceSizes.push_back(alphaHAttr);
+  sliceSizes.push_back(alphaWAttr);
+  sliceSizes.push_back(sizes[5]);
+  SmallVector<OpFoldResult> inputStrides(4, oneAttr);
+  tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
+      loc, getInput(), sliceOffsets, sliceSizes, inputStrides));
+
+  sliceOffsets.clear();
+  sliceSizes.clear();
+  if (failed(getResultTilePosition(builder, 1, offsets, sizes, sliceOffsets,
+                                   sliceSizes)))
+    return failure();
+
+  SmallVector<OpFoldResult> outputStrides(6, oneAttr);
+  tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
+      loc, getOutput(), sliceOffsets, sliceSizes, outputStrides));
+
+  SmallVector<Type> resultTypes;
+  resultTypes.push_back(tiledOperands[1].getType());
+  Operation *tiledOp =
+      mlir::clone(builder, getOperation(), resultTypes, tiledOperands);
+
+  return TilingResult{{tiledOp}, SmallVector<Value>(tiledOp->getResults())};
+}
+
 //===----------------------------------------------------------------------===//
 // WinogradOutputTransformOp
 //===----------------------------------------------------------------------===//
@@ -2964,6 +3080,106 @@ LogicalResult WinogradOutputTransformOp::verify() {
   return success();
 }
 
+SmallVector<Range>
+WinogradOutputTransformOp::getIterationDomain(OpBuilder &builder) {
+  Location loc = getLoc();
+  auto indexType = builder.getIndexType();
+  auto zeroAttr = builder.getIntegerAttr(indexType, 0);
+  auto oneAttr = builder.getIntegerAttr(indexType, 1);
+  Value value = getValue();
+  SmallVector<Range> loopBounds(6);
+  for (unsigned dim = 0; dim < 6; ++dim) {
+    loopBounds[dim].offset = zeroAttr;
+    loopBounds[dim].size = getDimValue(builder, loc, value, dim);
+    loopBounds[dim].stride = oneAttr;
+  }
+  return loopBounds;
+}
+
+SmallVector<utils::IteratorType>
+WinogradOutputTransformOp::getLoopIteratorTypes() {
+  SmallVector<utils::IteratorType> iteratorTypes(6,
+                                                 utils::IteratorType::parallel);
+  return iteratorTypes;
+}
+
+LogicalResult WinogradOutputTransformOp::getResultTilePosition(
+    OpBuilder &builder, unsigned resultNumber, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes, SmallVector<OpFoldResult> &resultOffsets,
+    SmallVector<OpFoldResult> &resultSizes) {
+  auto zeroAttr = builder.getI64IntegerAttr(0);
+  Value output = getOutput();
+  auto outputType = cast<ShapedType>(output.getType());
+  auto outputShape = outputType.getShape();
+  int64_t outputH = outputShape[1];
+  int64_t outputW = outputShape[2];
+  int64_t m = getM();
+  auto heightM = builder.getI64IntegerAttr(outputH != 1 ? m : 1);
+  auto widthM = builder.getI64IntegerAttr(outputW != 1 ? m : 1);
+
+  Location loc = getLoc();
+  auto context = builder.getContext();
+  auto affineMap =
+      AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context);
+  Value mappedOffset1 = builder.create<affine::AffineApplyOp>(
+      loc, affineMap, getValueFromOpFoldResult(offsets[2], builder, loc));
+  Value mappedOffset2 = builder.create<affine::AffineApplyOp>(
+      loc, affineMap, getValueFromOpFoldResult(offsets[3], builder, loc));
+
+  resultOffsets.push_back(zeroAttr);
+  resultOffsets.push_back(mappedOffset1);
+  resultOffsets.push_back(mappedOffset2);
+  resultOffsets.push_back(zeroAttr);
+  resultSizes.push_back(sizes[4]);
+  resultSizes.push_back(heightM);
+  resultSizes.push_back(widthM);
+  resultSizes.push_back(sizes[5]);
+  return success();
+}
+
+FailureOr<TilingResult> WinogradOutputTransformOp::getTiledImplementation(
+    OpBuilder &builder, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes) {
+  auto oneAttr = builder.getI64IntegerAttr(1);
+  auto zeroAttr = builder.getI64IntegerAttr(0);
+  Location loc = getLoc();
+  SmallVector<Value> tiledOperands;
+  SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
+
+  sliceOffsets.push_back(zeroAttr);
+  sliceOffsets.push_back(zeroAttr);
+  sliceOffsets.push_back(offsets[2]);
+  sliceOffsets.push_back(offsets[3]);
+  sliceOffsets.push_back(zeroAttr);
+  sliceOffsets.push_back(zeroAttr);
+  sliceSizes.push_back(sizes[0]);
+  sliceSizes.push_back(sizes[1]);
+  sliceSizes.push_back(oneAttr);
+  sliceSizes.push_back(oneAttr);
+  sliceSizes.push_back(sizes[4]);
+  sliceSizes.push_back(sizes[5]);
+  SmallVector<OpFoldResult> sliceStrides(6, oneAttr);
+  tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
+      loc, getValue(), sliceOffsets, sliceSizes, sliceStrides));
+
+  sliceOffsets.clear();
+  sliceSizes.clear();
+  if (failed(getResultTilePosition(builder, 1, offsets, sizes, sliceOffsets,
+                                   sliceSizes)))
+    return failure();
+
+  SmallVector<OpFoldResult> strides(4, oneAttr);
+  tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
+      loc, getOutput(), sliceOffsets, sliceSizes, strides));
+
+  SmallVector<Type> resultTypes;
+  resultTypes.push_back(tiledOperands[1].getType());
+  Operation *tiledOp =
+      mlir::clone(builder, getOperation(), resultTypes, tiledOperands);
+
+  return TilingResult{{tiledOp}, SmallVector<Value>(tiledOp->getResults())};
+}
+
 //===----------------------------------------------------------------------===//
 // LinalgDialect
 //===----------------------------------------------------------------------===//

diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 9baf358a95503..325dc04a828f0 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -3853,6 +3853,47 @@ DiagnosedSilenceableFailure transform::WinogradConv2DOp::applyToOne(
   return DiagnosedSilenceableFailure::success();
 }
 
+DiagnosedSilenceableFailure transform::DecomposeWinogradOp::applyToOne(
+    transform::TransformRewriter &rewriter, Operation *target,
+    transform::ApplyToEachResultList &results,
+    transform::TransformState &state) {
+  rewriter.setInsertionPoint(target);
+  FailureOr<Operation *> maybeTransformed = failure();
+  bool supported =
+      TypeSwitch<Operation *, bool>(target)
+          .Case([&](linalg::WinogradFilterTransformOp op) {
+            maybeTransformed = decomposeWinogradFilterTransformOp(rewriter, op);
+            return true;
+          })
+          .Case([&](linalg::WinogradInputTransformOp op) {
+            maybeTransformed = decomposeWinogradInputTransformOp(rewriter, op);
+            return true;
+          })
+          .Case([&](linalg::WinogradOutputTransformOp op) {
+            maybeTransformed = decomposeWinogradOutputTransformOp(rewriter, op);
+            return true;
+          })
+          .Default([&](Operation *op) { return false; });
+
+  if (!supported) {
+    DiagnosedSilenceableFailure diag =
+        emitSilenceableError() << "decomposition of this operation into "
+                                  "other operations is not supported";
+    diag.attachNote(target->getLoc()) << "target op";
+    return diag;
+  }
+
+  if (supported && failed(maybeTransformed)) {
+    DiagnosedSilenceableFailure diag =
+        emitSilenceableError() << "failed to decompose winograd operation";
+    diag.attachNote(target->getLoc()) << "target op";
+    return diag;
+  }
+
+  results.push_back(*maybeTransformed);
+  return DiagnosedSilenceableFailure::success();
+}
+
 #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOpsEnums.cpp.inc"
 
 #define GET_OP_CLASSES

diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
index c6c770e2781ff..c01a5f4b055f1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
@@ -1169,6 +1169,24 @@ FailureOr<Operation *> winogradConv2D(RewriterBase &rewriter,
   return winogradConv2DHelper(rewriter, op, m, r);
 }
 
+FailureOr<Operation *>
+decomposeWinogradFilterTransformOp(RewriterBase &rewriter,
+                                   linalg::WinogradFilterTransformOp op) {
+  return decomposeWinogradFilterTransformHelper(rewriter, op);
+}
+
+FailureOr<Operation *>
+decomposeWinogradInputTransformOp(RewriterBase &rewriter,
+                                  linalg::WinogradInputTransformOp op) {
+  return decomposeWinogradInputTransformHelper(rewriter, op);
+}
+
+FailureOr<Operation *>
+decomposeWinogradOutputTransformOp(RewriterBase &rewriter,
+                                   linalg::WinogradOutputTransformOp op) {
+  return decomposeWinogradOutputTransformHelper(rewriter, op);
+}
+
 void populateWinogradConv2DPatterns(RewritePatternSet &patterns, int64_t m,
                                     int64_t r) {
   MLIRContext *context = patterns.getContext();

diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
new file mode 100644
index 0000000000000..29833324bb21b
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
@@ -0,0 +1,260 @@
+// RUN: mlir-opt %s -transform-interpreter -canonicalize --split-input-file | FileCheck %s
+
+func.func @conv2d(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> {
+  %0 = tensor.empty() : tensor<6x6x5x2xf32>
+  %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32>
+  %2 = tensor.empty() : tensor<6x6x2x2x2x5xf32>
+  %3 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%2 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32>
+  %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32>
+  %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into tensor<36x8x5xf32>
+  %4 = tensor.empty() : tensor<36x8x2xf32>
+  %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x8x2xf32>) -> tensor<36x8x2xf32>
+  %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32>
+  %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x2x2x2x2xf32>) outs(%arg2 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32>
+  return %6 : tensor<2x8x8x2xf32>
+}
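+
+// An explanatory note on the transform script below: tile_using_for tiles
+// the input and output transform ops on the two tile dimensions (dimensions
+// 2 and 3 of the 6-D transformed tensors) with tile size 1, and
+// structured.decompose_winograd_op then rewrites each tiled transform op
+// (and the untiled filter transform) into scf loops around linalg.matmul ops.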
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op)
+    %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op
+    %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op)
+    %10 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %5 : (!transform.any_op) -> !transform.any_op
+    %11 = transform.structured.decompose_winograd_op %10 : (!transform.any_op) -> (!transform.any_op)
+    transform.yield
+  }
+}
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)>
+// CHECK-LABEL: func.func @conv2d
+// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> {
+// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32>
+// CHECK: scf.for {{.*}} = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) {
+// CHECK-NEXT: scf.for {{.*}} = %[[C0]] to %[[C5]] step %[[C1]] iter_args({{.*}} = %[[ARG3]]) -> (tensor<6x6x5x2xf32>) {
+// CHECK: linalg.matmul
+// CHECK: linalg.matmul
+// CHECK: %[[S5:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32>
+// CHECK: %[[S6:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32>
+// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S6]]) -> (tensor<6x6x2x2x2x5xf32>) {
+// CHECK-NEXT: scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x2x2x2x5xf32>) {
+// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]])
+// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]])
+// CHECK: tensor.extract_slice %[[ARG0]][0, %[[S13]], %[[S14]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x6x6x5xf32>
+// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S5]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32>
+// CHECK: %[[S15:.*]] = scf.for {{.*}} = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<6x6x1x1x2x5xf32>) {
+// CHECK-NEXT: scf.for {{.*}} = %[[C0]] to %[[C5]] step %[[C1]] iter_args({{.*}} = %[[ARG8]]) -> (tensor<6x6x1x1x2x5xf32>) {
+// CHECK: linalg.matmul
+// CHECK: linalg.matmul
+// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32>
+// CHECK: scf.yield 
%[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> +// CHECK: %[[S9:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S9]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> +// CHECK: %[[S10:.*]] = tensor.empty() : tensor<2x8x8x2xf32> +// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S10]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-NEXT: scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x8x8x2xf32>) { +// CHECK: tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x4x4x2xf32> +// CHECK: %[[S15:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<2x4x4x2xf32>) { +// CHECK-NEXT: scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: linalg.matmul +// CHECK: linalg.matmul +// CHECK: %[[S16:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S17:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG6]][0, %[[S16]], %[[S17]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x8x8x2xf32> +// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<2x8x8x2xf32> + +// ----- + +func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<6x6x5x2xf32> + %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + %padded = tensor.pad %arg0 low[0, 0, 0, 0] high[0, 3, 3, 0] { + ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): + tensor.yield %cst : f32 + } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> + %2 = tensor.empty() : tensor<6x6x3x3x2x5xf32> + %3 = linalg.winograd_input_transform m(4) r(3) ins(%padded : tensor<2x14x14x5xf32>) outs(%2 : tensor<6x6x3x3x2x5xf32>) -> tensor<6x6x3x3x2x5xf32> + %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> + %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> + %4 = tensor.empty() : tensor<36x18x2xf32> + %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> + %padded_1 = tensor.pad %arg3 low[0, 0, 0, 0] high[0, 3, 3, 0] { + ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): + tensor.yield %cst : f32 + } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> + %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %extracted_slice = tensor.extract_slice %6[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 
1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> + return %extracted_slice : tensor<2x9x9x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) + %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op + %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) + %10 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %5 : (!transform.any_op) -> !transform.any_op + %11 = transform.structured.decompose_winograd_op %10 : (!transform.any_op) -> (!transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK-LABEL: func.func @conv2d_unaligned +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { +// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x5x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> +// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<3x3xf32> +// CHECK: linalg.matmul +// CHECK: %[[S13:.*]] = linalg.matmul +// CHECK: %[[S14:.*]] = tensor.empty() : tensor<6x6x1x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[S14]][0, 0, 0, 0] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1xf32> +// CHECK: %[[INSERTED_SLICE_11:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x1x1xf32> into tensor<6x6x5x2xf32> +// CHECK: scf.yield %[[INSERTED_SLICE_11]] : tensor<6x6x5x2xf32> +// CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] { +// CHECK: ^bb0({{.*}}): +// CHECK: tensor.yield %[[CST_6]] : f32 +// CHECK: } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> +// CHECK: 
%[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> +// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S3]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: tensor.extract_slice %[[PADDED]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<2x6x6x5xf32> +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x5xf32> to tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<6x6x1x1x2x5xf32>) { +// CHECK: scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<6x6x1x1x2x5xf32>) { +// CHECK: linalg.matmul +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S18:.*]] = tensor.empty() : tensor<6x6x1x1x1x1xf32> +// CHECK: %[[INSERTED_SLICE_13:.*]] = tensor.insert_slice %[[S17]] into %[[S18]][0, 0, 0, 0, 0, 0] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1x1x1xf32> +// CHECK: %[[INSERTED_SLICE_14:.*]] = tensor.insert_slice %[[INSERTED_SLICE_13]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x1x1xf32> into tensor<6x6x1x1x2x5xf32> +// CHECK: scf.yield %[[INSERTED_SLICE_14]] : tensor<6x6x1x1x2x5xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x3x3x2x5xf32> +// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x3x3x2x5xf32> +// CHECK: %[[S6:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG3]] low[0, 0, 0, 0] high[0, 3, 3, 0] { +// CHECK: ^bb0({{.*}}): +// CHECK: tensor.yield %[[CST_6]] : f32 +// CHECK: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> +// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> +// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) -> (tensor<2x12x12x2xf32>) { +// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<2x12x12x2xf32>) { +// CHECK: tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x4x4x2xf32> +// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: 
linalg.matmul +// CHECK: linalg.matmul +// CHECK: %[[S22:.*]] = linalg.mul +// CHECK: %[[S23:.*]] = tensor.empty() : tensor<1x4x4x1xf32> +// CHECK: %[[INSERTED_SLICE_13:.*]] = tensor.insert_slice %[[S22]] into %[[S23]][0, 0, 0, 0] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<1x4x4x1xf32> +// CHECK: %[[INSERTED_SLICE_14:.*]] = tensor.insert_slice %[[INSERTED_SLICE_13]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<1x4x4x1xf32> into tensor<2x4x4x2xf32> +// CHECK: scf.yield %[[INSERTED_SLICE_14]] : tensor<2x4x4x2xf32> +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x12x12x2xf32> +// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> +// CHECK: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> +// CHECK: } + +// ----- + +func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { + %0 = tensor.empty() : tensor<6x1x5x2xf32> + %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> + %2 = tensor.empty() : tensor<6x1x1x1x2x5xf32> + %3 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x6x1x5xf32>) outs(%2 : tensor<6x1x1x1x2x5xf32>) -> tensor<6x1x1x1x2x5xf32> + %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> + %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> + %4 = tensor.empty() : tensor<6x2x2xf32> + %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> + %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> + %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg3 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> + return %6 : tensor<2x4x1x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) + %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op + %9 = 
transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) + %10 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %5 : (!transform.any_op) -> !transform.any_op + %11 = transform.structured.decompose_winograd_op %10 : (!transform.any_op) -> (!transform.any_op) + transform.yield + } +} + +// CHECK-LABEL: func.func @conv2d_mx1_rx1 +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) -> (tensor<6x1x5x2xf32>) { +// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x1x5x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> +// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<1x3x1x1xf32> to tensor<3x1xf32> +// CHECK: %[[S9:.*]] = linalg.matmul +// CHECK: %[[S10:.*]] = tensor.empty() : tensor<6x1x1x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[S10]][0, 0, 0, 0] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x1x1xf32> +// CHECK: %[[INSERTED_SLICE_5:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x1x1xf32> into tensor<6x1x5x2xf32> +// CHECK: scf.yield %[[INSERTED_SLICE_5]] : tensor<6x1x5x2xf32> +// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S2]]) -> (tensor<6x1x1x1x2x5xf32>) { +// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x1x1x1x2x5xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<2x6x1x5xf32> to tensor<1x6x1x1xf32> +// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<6x1xf32> +// CHECK: %[[S9:.*]] = linalg.matmul +// CHECK: %[[S10:.*]] = tensor.empty() : tensor<6x1x1x1x1x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[S10]][0, 0, 0, 0, 0, 0] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x1x1x1x1xf32> +// CHECK: %[[INSERTED_SLICE_5:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][0, 0, 0, 0, %[[ARG4]], %[[ARG6]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x1x1xf32> into tensor<6x1x1x1x2x5xf32> +// CHECK: scf.yield %[[INSERTED_SLICE_5]] : tensor<6x1x1x1x2x5xf32> +// CHECK: %[[S5:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> +// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<2x4x1x2xf32>) { +// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> 
(tensor<2x4x1x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG4]], %[[ARG6]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x2x2xf32> to tensor<6x1x1x1x1x1xf32> +// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, 0, 0] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x1x1xf32> to tensor<6x1xf32> +// CHECK: linalg.matmul +// CHECK: %[[S12:.*]] = linalg.mul +// CHECK: %[[S13:.*]] = tensor.empty() : tensor<1x4x1x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[S13]][0, 0, 0, 0] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<4x1xf32> into tensor<1x4x1x1xf32> +// CHECK: %[[INSERTED_SLICE_5:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x4x1x1xf32> into tensor<2x4x1x2xf32> +// CHECK: scf.yield %[[INSERTED_SLICE_5]] : tensor<2x4x1x2xf32> From 0c121bbef5f970a6ebc632c4d791ba535f2c29f7 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Wed, 17 Jul 2024 18:26:14 +0100 Subject: [PATCH 2/9] Address ftynse's comments --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 122 +++---- .../transform-tile-and-winograd-rewrite.mlir | 332 ++++++++++-------- 2 files changed, 226 insertions(+), 228 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 47800dfc1d1a2..4905e3f8283f8 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -2885,15 +2885,6 @@ LogicalResult WinogradFilterTransformOp::verify() { // WinogradInputTransformOp //===----------------------------------------------------------------------===// -Value getValueFromOpFoldResult(OpFoldResult opFoldResult, OpBuilder &builder, - Location loc) { - if (auto attr = opFoldResult.dyn_cast()) { - auto intAttr = cast(attr); - return builder.create(loc, intAttr); - } - return opFoldResult.get(); -} - LogicalResult WinogradInputTransformOp::verify() { auto inputType = cast(getInput().getType()); ArrayRef inputShape = inputType.getShape(); @@ -2934,9 +2925,9 @@ LogicalResult WinogradInputTransformOp::verify() { SmallVector WinogradInputTransformOp::getIterationDomain(OpBuilder &builder) { Location loc = getLoc(); - auto indexType = builder.getIndexType(); - auto zeroAttr = builder.getIntegerAttr(indexType, 0); - auto oneAttr = builder.getIntegerAttr(indexType, 1); + IndexType indexType = builder.getIndexType(); + IntegerAttr zeroAttr = builder.getIntegerAttr(indexType, 0); + IntegerAttr oneAttr = builder.getIntegerAttr(indexType, 1); Value output = getOutput(); SmallVector loopBounds(6); for (unsigned dim = 0; dim < 6; ++dim) { @@ -2958,21 +2949,13 @@ LogicalResult WinogradInputTransformOp::getResultTilePosition( OpBuilder &builder, unsigned resultNumber, ArrayRef offsets, ArrayRef sizes, SmallVector &resultOffsets, SmallVector &resultSizes) { - auto zeroAttr = builder.getI64IntegerAttr(0); - auto oneAttr = builder.getI64IntegerAttr(1); + IntegerAttr zeroAttr = builder.getI64IntegerAttr(0); + IntegerAttr oneAttr = builder.getI64IntegerAttr(1); - resultOffsets.push_back(zeroAttr); - resultOffsets.push_back(zeroAttr); - resultOffsets.push_back(offsets[2]); - resultOffsets.push_back(offsets[3]); - resultOffsets.push_back(zeroAttr); - resultOffsets.push_back(zeroAttr); - resultSizes.push_back(sizes[0]); - resultSizes.push_back(sizes[1]); - resultSizes.push_back(oneAttr); - resultSizes.push_back(oneAttr); - resultSizes.push_back(sizes[4]); - resultSizes.push_back(sizes[5]); + 
resultOffsets.append( + {zeroAttr, zeroAttr, offsets[2], offsets[3], zeroAttr, zeroAttr}); + resultSizes.append( + {sizes[0], sizes[1], oneAttr, oneAttr, sizes[4], sizes[5]}); return success(); } @@ -2981,11 +2964,11 @@ FailureOr WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder, ArrayRef offsets, ArrayRef sizes) { - auto oneAttr = builder.getI64IntegerAttr(1); - auto zeroAttr = builder.getI64IntegerAttr(0); + IntegerAttr oneAttr = builder.getI64IntegerAttr(1); + IntegerAttr zeroAttr = builder.getI64IntegerAttr(0); Value input = getInput(); auto inputType = cast(input.getType()); - auto inputShape = inputType.getShape(); + ArrayRef inputShape = inputType.getShape(); int64_t inputH = inputShape[1]; int64_t inputW = inputShape[2]; int64_t m = getM(); @@ -2993,29 +2976,25 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder, int64_t alpha = m + r - 1; int64_t alphaH = inputH != 1 ? alpha : 1; int64_t alphaW = inputW != 1 ? alpha : 1; - auto alphaHAttr = builder.getI64IntegerAttr(alphaH); - auto alphaWAttr = builder.getI64IntegerAttr(alphaW); + IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH); + IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW); Location loc = getLoc(); SmallVector tiledOperands; SmallVector sliceOffsets, sliceSizes; - auto context = builder.getContext(); + MLIRContext *context = builder.getContext(); auto affineMap = AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); Value mappedOffset1 = builder.create( - loc, affineMap, getValueFromOpFoldResult(offsets[2], builder, loc)); + loc, affineMap, + getValueOrCreateConstantIndexOp(builder, loc, offsets[2])); Value mappedOffset2 = builder.create( - loc, affineMap, getValueFromOpFoldResult(offsets[3], builder, loc)); - - sliceOffsets.push_back(zeroAttr); - sliceOffsets.push_back(mappedOffset1); - sliceOffsets.push_back(mappedOffset2); - sliceOffsets.push_back(zeroAttr); - sliceSizes.push_back(sizes[4]); - sliceSizes.push_back(alphaHAttr); - sliceSizes.push_back(alphaWAttr); - sliceSizes.push_back(sizes[5]); + loc, affineMap, + getValueOrCreateConstantIndexOp(builder, loc, offsets[3])); + + sliceOffsets.append({zeroAttr, mappedOffset1, mappedOffset2, zeroAttr}); + sliceSizes.append({sizes[4], alphaHAttr, alphaWAttr, sizes[5]}); SmallVector inputStrides(4, oneAttr); tiledOperands.emplace_back(builder.create( loc, getInput(), sliceOffsets, sliceSizes, inputStrides)); @@ -3030,7 +3009,7 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder, tiledOperands.emplace_back(builder.create( loc, getOutput(), sliceOffsets, sliceSizes, outputStrides)); - SmallVector resultTypes; + SmallVector resultTypes; resultTypes.push_back(tiledOperands[1].getType()); Operation *tiledOp = mlir::clone(builder, getOperation(), resultTypes, tiledOperands); @@ -3083,9 +3062,9 @@ LogicalResult WinogradOutputTransformOp::verify() { SmallVector WinogradOutputTransformOp::getIterationDomain(OpBuilder &builder) { Location loc = getLoc(); - auto indexType = builder.getIndexType(); - auto zeroAttr = builder.getIntegerAttr(indexType, 0); - auto oneAttr = builder.getIntegerAttr(indexType, 1); + IndexType indexType = builder.getIndexType(); + IntegerAttr zeroAttr = builder.getIntegerAttr(indexType, 0); + IntegerAttr oneAttr = builder.getIntegerAttr(indexType, 1); Value value = getValue(); SmallVector loopBounds(6); for (unsigned dim = 0; dim < 6; ++dim) { @@ -3107,57 +3086,44 @@ LogicalResult WinogradOutputTransformOp::getResultTilePosition( OpBuilder &builder, unsigned resultNumber, 
ArrayRef offsets, ArrayRef sizes, SmallVector &resultOffsets, SmallVector &resultSizes) { - auto zeroAttr = builder.getI64IntegerAttr(0); + IntegerAttr zeroAttr = builder.getI64IntegerAttr(0); Value output = getOutput(); auto outputType = cast(output.getType()); - auto outputShape = outputType.getShape(); + ArrayRef outputShape = outputType.getShape(); int64_t outputH = outputShape[1]; int64_t outputW = outputShape[2]; int64_t m = getM(); - auto heightM = builder.getI64IntegerAttr(outputH != 1 ? m : 1); - auto widthM = builder.getI64IntegerAttr(outputW != 1 ? m : 1); + IntegerAttr heightM = builder.getI64IntegerAttr(outputH != 1 ? m : 1); + IntegerAttr widthM = builder.getI64IntegerAttr(outputW != 1 ? m : 1); Location loc = getLoc(); - auto context = builder.getContext(); + MLIRContext *context = builder.getContext(); auto affineMap = AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); Value mappedOffset1 = builder.create( - loc, affineMap, getValueFromOpFoldResult(offsets[2], builder, loc)); + loc, affineMap, + getValueOrCreateConstantIndexOp(builder, loc, offsets[2])); Value mappedOffset2 = builder.create( - loc, affineMap, getValueFromOpFoldResult(offsets[3], builder, loc)); - - resultOffsets.push_back(zeroAttr); - resultOffsets.push_back(mappedOffset1); - resultOffsets.push_back(mappedOffset2); - resultOffsets.push_back(zeroAttr); - resultSizes.push_back(sizes[4]); - resultSizes.push_back(heightM); - resultSizes.push_back(widthM); - resultSizes.push_back(sizes[5]); + loc, affineMap, + getValueOrCreateConstantIndexOp(builder, loc, offsets[3])); + + resultOffsets.append({zeroAttr, mappedOffset1, mappedOffset2, zeroAttr}); + resultSizes.append({sizes[4], heightM, widthM, sizes[5]}); return success(); } FailureOr WinogradOutputTransformOp::getTiledImplementation( OpBuilder &builder, ArrayRef offsets, ArrayRef sizes) { - auto oneAttr = builder.getI64IntegerAttr(1); - auto zeroAttr = builder.getI64IntegerAttr(0); + IntegerAttr oneAttr = builder.getI64IntegerAttr(1); + IntegerAttr zeroAttr = builder.getI64IntegerAttr(0); Location loc = getLoc(); SmallVector tiledOperands; SmallVector sliceOffsets, sliceSizes; - sliceOffsets.push_back(zeroAttr); - sliceOffsets.push_back(zeroAttr); - sliceOffsets.push_back(offsets[2]); - sliceOffsets.push_back(offsets[3]); - sliceOffsets.push_back(zeroAttr); - sliceOffsets.push_back(zeroAttr); - sliceSizes.push_back(sizes[0]); - sliceSizes.push_back(sizes[1]); - sliceSizes.push_back(oneAttr); - sliceSizes.push_back(oneAttr); - sliceSizes.push_back(sizes[4]); - sliceSizes.push_back(sizes[5]); + sliceOffsets.append( + {zeroAttr, zeroAttr, offsets[2], offsets[3], zeroAttr, zeroAttr}); + sliceSizes.append({sizes[0], sizes[1], oneAttr, oneAttr, sizes[4], sizes[5]}); SmallVector sliceStrides(6, oneAttr); tiledOperands.emplace_back(builder.create( loc, getValue(), sliceOffsets, sliceSizes, sliceStrides)); @@ -3172,7 +3138,7 @@ FailureOr WinogradOutputTransformOp::getTiledImplementation( tiledOperands.emplace_back(builder.create( loc, getOutput(), sliceOffsets, sliceSizes, strides)); - SmallVector resultTypes; + SmallVector resultTypes; resultTypes.push_back(tiledOperands[1].getType()); Operation *tiledOp = mlir::clone(builder, getOperation(), resultTypes, tiledOperands); diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index 29833324bb21b..6bb3fb1423edc 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ 
b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -31,52 +31,77 @@ module attributes {transform.with_named_sequence} { } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> ()> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func.func @conv2d // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> -// CHECK: scf.for {{.*}} = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: scf.for {{.*}} = %[[C0]] to %[[C5]] step %[[C1]] iter_args({{.*}} = %[[ARG3]]) -> (tensor<6x6x5x2xf32>) { -// CHECK: linalg.matmul -// CHECK: linalg.matmul -// CHECK: %[[S5:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK: %[[S6:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S6]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: tensor.extract_slice %[[ARG0]][0, %[[S13]], %[[S14]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x6x6x5xf32> -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S5]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK: %[[S15:.*]] = scf.for {{.*}} = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK-NEXt: scf.for {{.*}} = %[[C0]] to %[[C5]] step %[[C1]] iter_args({{.*}} = %[[ARG8]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: linalg.matmul -// CHECK: linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32> -// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> -// CHECK: %[[S9:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S9]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> -// CHECK: %[[S10:.*]] = tensor.empty() : tensor<2x8x8x2xf32> -// CHECK: scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S10]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-NEXT: scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x8x8x2xf32>) { -// CHECK: tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x4x4x2xf32> -// CHECK: %[[S15:.*]] = scf.for 
%[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<2x4x4x2xf32>) { -// CHECK-NEXT: scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: linalg.matmul -// CHECK: linalg.matmul -// CHECK: %[[S16:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK: %[[S17:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG6]][0, %[[S16]], %[[S17]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x8x8x2xf32> -// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<2x8x8x2xf32> +// CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[S0:.*]] = tensor.empty() +// CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 3, 1] [1, 1, 1, 1] +// CHECK: %[[S11:.*]] = linalg.matmul +// CHECK: %[[S13:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> +// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> +// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) +// CHECK: %[[S13:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) +// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] +// CHECK: %[[S15:.*]] = linalg.matmul +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S17]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_9]] +// CHECK: scf.yield %[[S13]] +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] +// CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S6:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] 
output_shape [6, 6, 2, 2, 2, 2] +// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x8x8x2xf32> +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) +// CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) +// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S20:.*]] = tensor.empty() +// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S22]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_9]] +// CHECK: scf.yield %[[S15]] +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] // ----- -func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { +func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg2: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<6x6x5x2xf32> %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x3x5xf32>) outs(%0 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> @@ -91,7 +116,7 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x %4 = tensor.empty() : tensor<36x18x2xf32> %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> - %padded_1 = tensor.pad %arg3 low[0, 0, 0, 0] high[0, 3, 3, 0] { + %padded_1 = tensor.pad %arg2 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> @@ -117,80 +142,82 @@ module attributes {transform.with_named_sequence} { } // CHECK: #[[$MAP0:.+]] = 
affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> ()> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func.func @conv2d_unaligned -// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x5x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> -// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<1x3x3x1xf32> to tensor<3x3xf32> -// CHECK: linalg.matmul -// CHECK: %[[S13:.*]] = linalg.matmul -// CHECK: %[[S14:.*]] = tensor.empty() : tensor<6x6x1x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[S14]][0, 0, 0, 0] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1xf32> -// CHECK: %[[INSERTED_SLICE_11:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x1x1xf32> into tensor<6x6x5x2xf32> -// CHECK: scf.yield %[[INSERTED_SLICE_11]] : tensor<6x6x5x2xf32> -// CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] { -// CHECK: ^bb0({{.*}}): -// CHECK: tensor.yield %[[CST_6]] : f32 -// CHECK: } : tensor<2x11x11x5xf32> to tensor<2x14x14x5xf32> -// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> -// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S3]]) -> (tensor<6x6x3x3x2x5xf32>) { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x3x3x2x5xf32>) { -// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: tensor.extract_slice %[[PADDED]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<2x6x6x5xf32> -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S18:.*]] = tensor.empty() : tensor<6x6x1x1x1x1xf32> -// CHECK: %[[INSERTED_SLICE_13:.*]] = tensor.insert_slice %[[S17]] into %[[S18]][0, 0, 0, 0, 0, 0] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1x1x1xf32> -// CHECK: 
%[[INSERTED_SLICE_14:.*]] = tensor.insert_slice %[[INSERTED_SLICE_13]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x1x1xf32> into tensor<6x6x1x1x2x5xf32> -// CHECK: scf.yield %[[INSERTED_SLICE_14]] : tensor<6x6x1x1x2x5xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x3x3x2x5xf32> -// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x3x3x2x5xf32> -// CHECK: %[[S6:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> -// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG3]] low[0, 0, 0, 0] high[0, 3, 3, 0] { -// CHECK: ^bb0({{.*}}): -// CHECK: tensor.yield %[[CST_6]] : f32 -// CHECK: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) -> (tensor<2x12x12x2xf32>) { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<2x12x12x2xf32>) { -// CHECK: tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6x1x1x2x2xf32> -// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x4x4x2xf32> -// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: linalg.matmul -// CHECK: linalg.matmul -// CHECK: %[[S22:.*]] = linalg.mul -// CHECK: %[[S23:.*]] = tensor.empty() : tensor<1x4x4x1xf32> -// CHECK: %[[INSERTED_SLICE_13:.*]] = tensor.insert_slice %[[S22]] into %[[S23]][0, 0, 0, 0] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<1x4x4x1xf32> -// CHECK: %[[INSERTED_SLICE_14:.*]] = tensor.insert_slice %[[INSERTED_SLICE_13]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<1x4x4x1xf32> into tensor<2x4x4x2xf32> -// CHECK: scf.yield %[[INSERTED_SLICE_14]] : tensor<2x4x4x2xf32> -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x12x12x2xf32> -// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> -// CHECK: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> -// CHECK: } +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { +// CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[C3:.*]] = arith.constant 3 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// 
CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[S0:.*]] = tensor.empty() +// CHECK: %[[S1:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 3, 1] [1, 1, 1, 1] +// CHECK: %[[S11:.*]] = linalg.matmul +// CHECK: %[[S13:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> +// CHECK: scf.yield %[[S9]] : tensor<6x6x5x2xf32> +// CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] +// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> +// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> +// CHECK: %[[S4:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S3]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) +// CHECK: %[[S13:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 6, 6, 1] [1, 1, 1, 1] +// CHECK: %[[S15:.*]] = linalg.matmul +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S17]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_12]] : tensor<6x6x1x1x2x5xf32> +// CHECK: scf.yield %[[S13]] : tensor<6x6x1x1x2x5xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] +// CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S6:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] +// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] +// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> +// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S10:.*]] = 
affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) +// CHECK: %[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S22]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_12]] +// CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] +// CHECK: return %[[EXTRACTED_SLICE]] // ----- -func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<1xf32>, %arg3: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { +func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { %0 = tensor.empty() : tensor<6x1x5x2xf32> %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> %2 = tensor.empty() : tensor<6x1x1x1x2x5xf32> @@ -200,7 +227,7 @@ func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> %4 = tensor.empty() : tensor<6x2x2xf32> %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg3 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> + %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> return %6 : tensor<2x4x1x2xf32> } @@ -220,41 +247,46 @@ module attributes {transform.with_named_sequence} { } } +// CHECK: #[[$MAP0:.+]] = affine_map<(d0, d1) -> ()> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func.func @conv2d_mx1_rx1 -// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, 
%[[ARG3:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) -> (tensor<6x1x5x2xf32>) { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x1x5x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> -// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<1x3x1x1xf32> to tensor<3x1xf32> -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[S10:.*]] = tensor.empty() : tensor<6x1x1x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[S10]][0, 0, 0, 0] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x1x1xf32> -// CHECK: %[[INSERTED_SLICE_5:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x1x1xf32> into tensor<6x1x5x2xf32> -// CHECK: scf.yield %[[INSERTED_SLICE_5]] : tensor<6x1x5x2xf32> -// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S2]]) -> (tensor<6x1x1x1x2x5xf32>) { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x1x1x1x2x5xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<2x6x1x5xf32> to tensor<1x6x1x1xf32> -// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<1x6x1x1xf32> to tensor<6x1xf32> -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[S10:.*]] = tensor.empty() : tensor<6x1x1x1x1x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[S10]][0, 0, 0, 0, 0, 0] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x1x1x1x1xf32> -// CHECK: %[[INSERTED_SLICE_5:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][0, 0, 0, 0, %[[ARG4]], %[[ARG6]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x1x1xf32> into tensor<6x1x1x1x2x5xf32> -// CHECK: scf.yield %[[INSERTED_SLICE_5]] : tensor<6x1x1x1x2x5xf32> -// CHECK: %[[S5:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> -// CHECK: scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<2x4x1x2xf32>) { -// CHECK: scf.for %[[ARG6:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<2x4x1x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG4]], %[[ARG6]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x2x2xf32> to tensor<6x1x1x1x1x1xf32> -// CHECK: tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, 0, 0] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x1x1xf32> to tensor<6x1xf32> -// CHECK: linalg.matmul -// CHECK: %[[S12:.*]] = linalg.mul -// CHECK: %[[S13:.*]] = tensor.empty() : tensor<1x4x1x1xf32> -// CHECK: 
%[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[S13]][0, 0, 0, 0] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<4x1xf32> into tensor<1x4x1x1xf32> -// CHECK: %[[INSERTED_SLICE_5:.*]] = tensor.insert_slice %[[INSERTED_SLICE]] into %[[ARG7]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<1x4x1x1xf32> into tensor<2x4x1x2xf32> -// CHECK: scf.yield %[[INSERTED_SLICE_5]] : tensor<2x4x1x2xf32> +// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { +// CHECK: %[[CST:.*]] = arith.constant 3.200000e+01 : f32 +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> +// CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) +// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S9:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S7]] +// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> +// CHECK: %[[S3:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S2]]) +// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 6, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S9:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S7]] +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] +// CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S5:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] +// CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) +// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S9:.*]] = linalg.matmul +// CHECK: %[[S10:.*]] = tensor.empty() : tensor<4x1xf32> +// CHECK: %[[S11:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S10]] : tensor<4x1xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x1xf32> +// CHECK: %[[S12:.*]] = linalg.mul ins(%[[S11]], %[[S9]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] +// CHECK: scf.yield 
%[[INSERTED_SLICE]]
+// CHECK:     scf.yield %[[S7]]
+// CHECK:   return %[[S6]]

From b4e5c7c97b770391c81839261d47956925e23daf Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang
Date: Thu, 18 Jul 2024 06:52:17 +0100
Subject: [PATCH 3/9] Address more comments

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 4905e3f8283f8..da55360c82eb0 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2999,15 +2999,14 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getInput(), sliceOffsets, sliceSizes, inputStrides));
 
-  sliceOffsets.clear();
-  sliceSizes.clear();
-  if (failed(getResultTilePosition(builder, 1, offsets, sizes, sliceOffsets,
-                                   sliceSizes)))
+  SmallVector<OpFoldResult> resultOffsets, resultSizes;
+  if (failed(getResultTilePosition(builder, 1, offsets, sizes, resultOffsets,
+                                   resultSizes)))
     return failure();
 
   SmallVector<OpFoldResult> outputStrides(6, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
-      loc, getOutput(), sliceOffsets, sliceSizes, outputStrides));
+      loc, getOutput(), resultOffsets, resultSizes, outputStrides));
 
   SmallVector<Type> resultTypes;
   resultTypes.push_back(tiledOperands[1].getType());
@@ -3128,15 +3127,14 @@ FailureOr<TilingResult> WinogradOutputTransformOp::getTiledImplementation(
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getValue(), sliceOffsets, sliceSizes, sliceStrides));
 
-  sliceOffsets.clear();
-  sliceSizes.clear();
-  if (failed(getResultTilePosition(builder, 1, offsets, sizes, sliceOffsets,
-                                   sliceSizes)))
+  SmallVector<OpFoldResult> resultOffsets, resultSizes;
+  if (failed(getResultTilePosition(builder, 1, offsets, sizes, resultOffsets,
+                                   resultSizes)))
     return failure();
 
   SmallVector<OpFoldResult> strides(4, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
-      loc, getOutput(), sliceOffsets, sliceSizes, strides));
+      loc, getOutput(), resultOffsets, resultSizes, strides));
 
   SmallVector<Type> resultTypes;
   resultTypes.push_back(tiledOperands[1].getType());

From e972232669e6de1638a86b62808cff7290ff3e70 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang
Date: Thu, 18 Jul 2024 07:24:42 +0100
Subject: [PATCH 4/9] Update comments

---
 .../Dialect/Linalg/Transforms/Transforms.h    | 52 ++++++++++++-------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 89f907cfc7ade..861e14d22d962 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1318,8 +1318,8 @@ FailureOr<Operation *> winogradConv2D(RewriterBase &rewriter,
 
 /// Rewrite linalg.winograd_filter_transform. The data layout of the filter is
 /// FHWC. The transformation matrix is 2-dimension. We need to extract H x W
-/// from FHWC first. We need to generate 2 levels of loops to iterate on F and
-/// C. After the rewriting, we get
+/// from FHWC first. We generate 2 levels of loops to iterate on F and C. After
+/// the rewriting, we get
 ///
 /// scf.for %f = lo_f to hi_f step 1
 ///   scf.for %c = lo_c to hi_c step 1
 ///     %extracted = extract filter from filter
 ///     %ret = linalg.matmul G, %extracted
 ///     %ret = linalg.matmul %ret, GT
 ///     %inserted = insert %ret into filter
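+///
+/// To make the shapes concrete (an illustrative note, using the m(4) r(3)
+/// configuration from the tests in this series): G is 6x3 and GT is 3x6,
+/// so each 3x3 filter slice becomes a 6x6 tile, and a 2x3x3x5 FHWC filter
+/// produces a 6x6x5x2 result.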
 FailureOr<Operation *>
 decomposeWinogradFilterTransformOp(RewriterBase &rewriter,
                                    linalg::WinogradFilterTransformOp op);
 
 /// Rewrite linalg.winograd_input_transform. The data layout of the input is
 /// NHWC. The transformation matrix is 2-dimension. We need to extract H x W
-/// from NHWC first. We need to generate 2 levels of loops to iterate on N and
-/// C. After the rewriting, we get
-///
-/// scf.for %n = lo_n to hi_n step 1
-///   scf.for %c = lo_c to hi_c step 1
-///     %extracted = extract input from input
-///     %ret = linalg.matmul BT, %extracted
-///     %ret = linalg.matmul %ret, B
-///     %inserted = insert %ret into input
+/// from NHWC first. We generate 4 levels of loops to iterate on N, C, tileH,
+/// and tileW. After the rewriting, we get
+///
+/// scf.for %h = 0 to tileH step 1
+///   scf.for %w = 0 to tileW step 1
+///     scf.for %n = 0 to N step 1
+///       scf.for %c = 0 to C step 1
+///         %extracted = extract %extracted<alphaH x alphaW> from
+///                      %input<N x H x W x C>
+///                      at [%n, (%h x m), (%w x m), %c]
+///         %ret = linalg.matmul BT, %extracted
+///         %ret = linalg.matmul %ret, B
+///         %inserted = insert %ret<alphaH x alphaW> into
+///                     %output<alphaH x alphaW x tileH x tileW x N x C>
+///                     at [0, 0, %h, %w, %n, %c]
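+///
+/// As a worked size example (illustrative, using the m(4) r(3) tests in
+/// this series): alpha = m + r - 1 = 6, and a 2x10x10x5 NHWC input holds
+/// tileH = tileW = 2 tiles per image because H = tileH * m + (r - 1) = 10,
+/// so the transformed result has type 6x6x2x2x2x5.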
 FailureOr<Operation *>
 decomposeWinogradInputTransformOp(RewriterBase &rewriter,
                                   linalg::WinogradInputTransformOp op);
 
 /// Rewrite linalg.winograd_output_transform. The data layout of the output is
 /// HWNF. The transformation matrix is 2-dimension. We need to extract H x W
-/// from HWNF first. We need to generate 2 levels of loops to iterate on N and
-/// F. After the transformation, we get
-///
-/// scf.for %n = lo_n to hi_n step 1
-///   scf.for %f = lo_f to hi_f step 1
-///     %extracted = extract input from result
-///     %ret = linalg.matmul AT, %extracted
-///     %ret = linalg.matmul %ret, A
-///     %inserted = insert %ret into ret
+/// from HWNF first. We generate 4 levels of loops to iterate on N, F, tileH,
+/// and tileW. After the transformation, we get
+///
+/// scf.for %h = 0 to tileH step 1
+///   scf.for %w = 0 to tileW step 1
+///     scf.for %n = 0 to N step 1
+///       scf.for %f = 0 to F step 1
+///         %extracted = extract %extracted<alphaH x alphaW> from
+///                      %input<alphaH x alphaW x tileH x tileW x N x F>
+///                      at [0, 0, %h, %w, %n, %f]
+///         %ret = linalg.matmul AT, %extracted
+///         %ret = linalg.matmul %ret, A
+///         %inserted = insert %ret<m x m> into
+///                     %output<N x H x W x F>
+///                     at [%n, (%h x m), (%w x m), %f]
 FailureOr<Operation *>
 decomposeWinogradOutputTransformOp(RewriterBase &rewriter,
                                    linalg::WinogradOutputTransformOp op);

From 73696eec36aa8cebc80cf093b995a7fb34064951 Mon Sep 17 00:00:00 2001
From: Hsiangkai Wang
Date: Thu, 1 Aug 2024 08:44:10 +0100
Subject: [PATCH 5/9] Address Max191's comments

---
 .../mlir/Dialect/Linalg/IR/LinalgOps.td       |  49 ++-
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp      | 279 ++++++++++---
 .../Linalg/Transforms/WinogradConv2D.cpp      |   7 -
 .../transform-tile-and-winograd-rewrite.mlir  |  12 +-
 .../Linalg/transform-tile-winograd.mlir       | 392 ++++++++++++++++++
 5 files changed, 664 insertions(+), 75 deletions(-)
 create mode 100644 mlir/test/Dialect/Linalg/transform-tile-winograd.mlir

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index bc2df7082ef8e..aaf243e015726 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -155,7 +155,12 @@ def Linalg_SoftmaxOp : Linalg_Op<"softmax",
 }
 
 def Linalg_WinogradFilterTransformOp : Linalg_Op<"winograd_filter_transform",
-  [AllElementTypesMatch<["filter", "output"]>]> {
+  [AllElementTypesMatch<["filter", "output"]>,
+   DeclareOpInterfaceMethods<TilingInterface,
+    ["getIterationDomain",
+     "getLoopIteratorTypes",
+     "getResultTilePosition",
+     "getTiledImplementation"]>]> {
   let summary = "Winograd filter transform operator";
   let description = [{
     Winograd Conv2D algorithm will convert linalg Conv2D operator into batched
@@ -190,6 +195,20 @@ def Linalg_WinogradFilterTransformOp : Linalg_Op<"winograd_filter_transform",
       `outs` `(` $output `:` type($output) `)`
       `->` type($result)
   }];
+  let extraClassDeclaration = [{
+    ShapedType getFilterOperandType() {
+      return cast<ShapedType>(getFilter().getType());
+    }
+    ShapedType getOutputOperandType() {
+      return cast<ShapedType>(getOutput().getType());
+    }
+    int64_t getFilterOperandRank() {
+      return getFilterOperandType().getRank();
+    }
+    int64_t getOutputOperandRank() {
+      return getOutputOperandType().getRank();
+    }
+  }];
   let hasVerifier = 1;
 }
 
@@ -234,6 +253,20 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform",
       `outs` `(` $output `:` type($output) `)`
       `->` type($result)
   }];
+  let extraClassDeclaration = [{
+    ShapedType getInputOperandType() {
+      return cast<ShapedType>(getInput().getType());
+    }
+    ShapedType getOutputOperandType() {
+      return cast<ShapedType>(getOutput().getType());
+    }
+    int64_t getInputOperandRank() {
+      return getInputOperandType().getRank();
+    }
+    int64_t getOutputOperandRank() {
+      return getOutputOperandType().getRank();
+    }
+  }];
   let hasVerifier = 1;
 }
 
@@ -278,6 +311,20 @@ def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform",
      `outs` `(` $output `:` type($output) `)`
      `->` type($result)
   }];
+  let extraClassDeclaration = [{
+    ShapedType getValueOperandType() {
+      return cast<ShapedType>(getValue().getType());
+    }
+    ShapedType getOutputOperandType() {
+      return cast<ShapedType>(getOutput().getType());
+    }
+    int64_t getValueOperandRank() {
+      return getValueOperandType().getRank();
+    }
+    int64_t getOutputOperandRank() {
+      return getOutputOperandType().getRank();
+    }
+  }];
   let hasVerifier = 1;
 }
 
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index da55360c82eb0..ee633a715cc9e 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2881,6 +2881,105 @@ LogicalResult WinogradFilterTransformOp::verify() {
   return success();
 }
 
+SmallVector<Range>
+WinogradFilterTransformOp::getIterationDomain(OpBuilder &builder) {
+  Location loc = getLoc();
+  IntegerAttr zeroAttr = builder.getIndexAttr(0);
+  IntegerAttr oneAttr = builder.getIndexAttr(1);
+  Value filter = getFilter();
+  SmallVector<Range> loopBounds;
+  Range fRange;
+  fRange.offset = zeroAttr;
+  // 0 is the index of F in the filter.
+  fRange.size = getDimValue(builder, loc, filter, 0);
+  fRange.stride = oneAttr;
+  loopBounds.emplace_back(fRange);
+  Range cRange;
+  cRange.offset = zeroAttr;
+  // 3 is the index of C in the filter.
+  cRange.size = getDimValue(builder, loc, filter, 3);
+  cRange.stride = oneAttr;
+  loopBounds.emplace_back(cRange);
+  return loopBounds;
+}
+
+SmallVector<utils::IteratorType>
+WinogradFilterTransformOp::getLoopIteratorTypes() {
+  int64_t filterRank = getFilterOperandRank();
+  SmallVector<utils::IteratorType> iteratorTypes(filterRank - 2,
+                                                 utils::IteratorType::parallel);
+  return iteratorTypes;
+}
+
+LogicalResult WinogradFilterTransformOp::getResultTilePosition(
+    OpBuilder &builder, unsigned resultNumber, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes, SmallVector<OpFoldResult> &resultOffsets,
+    SmallVector<OpFoldResult> &resultSizes) {
+  IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
+  ShapedType filterType = getFilterOperandType();
+  ArrayRef<int64_t> filterShape = filterType.getShape();
+  int64_t filterH = filterShape[1];
+  int64_t filterW = filterShape[2];
+  int64_t m = getM();
+  int64_t r = getR();
+  int64_t alpha = m + r - 1;
+  int64_t alphaH = filterH != 1 ? alpha : 1;
+  int64_t alphaW = filterW != 1 ? alpha : 1;
+  IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
+  IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
+
+  resultOffsets.append({zeroAttr, zeroAttr, offsets[1], offsets[0]});
+  resultSizes.append({alphaHAttr, alphaWAttr, sizes[1], sizes[0]});
+
+  return success();
+}
+
+/// Implement tiling for winograd_filter_transform.
+/// The input of winograd_filter_transform is (F, KH, KW, C).
+/// The output of winograd_filter_transform is (alphaH, alphaW, C, F).
+/// Users can specify the tile sizes of F and C.
+/// `offsets` are the values for the offsets of F, C for one tile.
+/// `sizes` are the values for the sizes of F, C for one tile.
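+///
+/// For example (an illustrative note, using the shapes from the tests in
+/// this series): tiling a 2x3x3x5 filter with tile sizes F = 1 and C = 1
+/// extracts 1x3x3x1 slices of the filter and writes 6x6x1x1 slices of the
+/// 6x6x5x2 output; note that the F and C offsets swap places in the result.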
+FailureOr<TilingResult> WinogradFilterTransformOp::getTiledImplementation(
+    OpBuilder &builder, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes) {
+  IntegerAttr oneAttr = builder.getI64IntegerAttr(1);
+  IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
+  ShapedType filterType = getFilterOperandType();
+  ArrayRef<int64_t> filterShape = filterType.getShape();
+  int64_t filterH = filterShape[1];
+  int64_t filterW = filterShape[2];
+  IntegerAttr filterHAttr = builder.getI64IntegerAttr(filterH);
+  IntegerAttr filterWAttr = builder.getI64IntegerAttr(filterW);
+  SmallVector<Value> tiledOperands;
+  SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
+
+  sliceOffsets.append({offsets[0], zeroAttr, zeroAttr, offsets[1]});
+  sliceSizes.append({sizes[0], filterHAttr, filterWAttr, sizes[1]});
+  int64_t filterRank = getFilterOperandRank();
+  SmallVector<OpFoldResult> filterStrides(filterRank, oneAttr);
+  Location loc = getLoc();
+  tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
+      loc, getFilter(), sliceOffsets, sliceSizes, filterStrides));
+
+  SmallVector<OpFoldResult> resultOffsets, resultSizes;
+  if (failed(getResultTilePosition(builder, 1, offsets, sizes, resultOffsets,
+                                   resultSizes)))
+    return failure();
+
+  int64_t outputRank = getOutputOperandRank();
+  SmallVector<OpFoldResult> outputStrides(outputRank, oneAttr);
+  tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
+      loc, getOutput(), resultOffsets, resultSizes, outputStrides));
+
+  SmallVector<Type> resultTypes;
+  resultTypes.push_back(tiledOperands[1].getType());
+  Operation *tiledOp =
+      mlir::clone(builder, getOperation(), resultTypes, tiledOperands);
+
+  return TilingResult{{tiledOp}, SmallVector<Value>(tiledOp->getResults())};
+}
+
 //===----------------------------------------------------------------------===//
 // WinogradInputTransformOp
 //===----------------------------------------------------------------------===//
@@ -2925,22 +3024,23 @@ LogicalResult WinogradInputTransformOp::verify() {
 SmallVector<Range>
 WinogradInputTransformOp::getIterationDomain(OpBuilder &builder) {
   Location loc = getLoc();
-  IndexType indexType = builder.getIndexType();
-  IntegerAttr zeroAttr = builder.getIntegerAttr(indexType, 0);
-  IntegerAttr oneAttr = builder.getIntegerAttr(indexType, 1);
+  IntegerAttr zeroAttr = builder.getIndexAttr(0);
+  IntegerAttr oneAttr = builder.getIndexAttr(1);
   Value output = getOutput();
-  SmallVector<Range> loopBounds(6);
-  for (unsigned dim = 0; dim < 6; ++dim) {
-    loopBounds[dim].offset = zeroAttr;
-    loopBounds[dim].size = getDimValue(builder, loc, output, dim);
-    loopBounds[dim].stride = oneAttr;
+  int64_t outputRank = getOutputOperandRank();
+  SmallVector<Range> loopBounds(outputRank - 2);
+  for (unsigned dim = 2; dim < outputRank; ++dim) {
+    loopBounds[dim - 2].offset = zeroAttr;
+    loopBounds[dim - 2].size = getDimValue(builder, loc, output, dim);
+    loopBounds[dim - 2].stride = oneAttr;
   }
   return loopBounds;
 }
 
 SmallVector<utils::IteratorType>
 WinogradInputTransformOp::getLoopIteratorTypes() {
-  SmallVector<utils::IteratorType> iteratorTypes(6,
+  int64_t outputRank = getOutputOperandRank();
+  SmallVector<utils::IteratorType> iteratorTypes(outputRank - 2,
                                                  utils::IteratorType::parallel);
   return iteratorTypes;
 }
 
@@ -2950,52 +3050,79 @@ LogicalResult WinogradInputTransformOp::getResultTilePosition(
     ArrayRef<OpFoldResult> sizes, SmallVector<OpFoldResult> &resultOffsets,
     SmallVector<OpFoldResult> &resultSizes) {
   IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
-  IntegerAttr oneAttr = builder.getI64IntegerAttr(1);
+  ShapedType inputType = getInputOperandType();
+  ArrayRef<int64_t> inputShape = inputType.getShape();
+  int64_t inputH = inputShape[1];
+  int64_t inputW = inputShape[2];
+  int64_t m = getM();
+  int64_t r = getR();
+  int64_t alpha = m + r - 1;
+  int64_t alphaH = inputH != 1 ? alpha : 1;
+  int64_t alphaW = inputW != 1 ? alpha : 1;
+  IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
+  IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
 
   resultOffsets.append(
-      {zeroAttr, zeroAttr, offsets[2], offsets[3], zeroAttr, zeroAttr});
+      {zeroAttr, zeroAttr, offsets[0], offsets[1], offsets[2], offsets[3]});
   resultSizes.append(
-      {sizes[0], sizes[1], oneAttr, oneAttr, sizes[4], sizes[5]});
+      {alphaHAttr, alphaWAttr, sizes[0], sizes[1], sizes[2], sizes[3]});
 
   return success();
 }
 
+/// Implement tiling for winograd_input_transform.
+/// The input of winograd_input_transform is (N, H, W, C).
+/// The output of winograd_input_transform is (alphaH, alphaW, tileH, tileW, N,
+/// C). Users can specify the tile sizes of tileH, tileW, N, and C. `offsets`
+/// are the values for the offsets of tileH, tileW, N, C for one tile. `sizes`
+/// are the values for the sizes of tileH, tileW, N, C for one tile.
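+///
+/// For example (illustrative, with the m(4) r(3) shapes used in the tests):
+/// a tile with sizes [1, 1, 2, 5] for (tileH, tileW, N, C) reads a 2x6x6x5
+/// slice of the 2x10x10x5 input, because each input tile spans
+/// tileSize * m + (r - 1) = 1 * 4 + 2 = 6 rows and columns; adjacent tiles
+/// overlap by r - 1.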
 FailureOr<TilingResult>
 WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
                                                  ArrayRef<OpFoldResult> offsets,
                                                  ArrayRef<OpFoldResult> sizes) {
   IntegerAttr oneAttr = builder.getI64IntegerAttr(1);
   IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
-  Value input = getInput();
-  auto inputType = cast<ShapedType>(input.getType());
+  ShapedType inputType = getInputOperandType();
   ArrayRef<int64_t> inputShape = inputType.getShape();
   int64_t inputH = inputShape[1];
   int64_t inputW = inputShape[2];
   int64_t m = getM();
   int64_t r = getR();
-  int64_t alpha = m + r - 1;
-  int64_t alphaH = inputH != 1 ? alpha : 1;
-  int64_t alphaW = inputW != 1 ? alpha : 1;
-  IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
-  IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
   Location loc = getLoc();
-  SmallVector<Value> tiledOperands;
-  SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
-
   MLIRContext *context = builder.getContext();
-  auto affineMap =
+  auto offsetAffineMap =
       AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context);
-  Value mappedOffset1 = builder.create<affine::AffineApplyOp>(
-      loc, affineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[2]));
-  Value mappedOffset2 = builder.create<affine::AffineApplyOp>(
-      loc, affineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[3]));
+  Value mappedOffsetH = builder.create<affine::AffineApplyOp>(
+      loc, offsetAffineMap,
+      getValueOrCreateConstantIndexOp(builder, loc, offsets[0]));
+  Value mappedOffsetW = builder.create<affine::AffineApplyOp>(
+      loc, offsetAffineMap,
+      getValueOrCreateConstantIndexOp(builder, loc, offsets[1]));
+  auto sizeAffineMap = AffineMap::get(
+      1, 0, {builder.getAffineDimExpr(0) * m + (r - 1)}, context);
+  Value mappedSizeH = builder.create<affine::AffineApplyOp>(
+      loc, sizeAffineMap,
+      getValueOrCreateConstantIndexOp(builder, loc, sizes[0]));
+  Value mappedSizeW = builder.create<affine::AffineApplyOp>(
+      loc, sizeAffineMap,
+      getValueOrCreateConstantIndexOp(builder, loc, sizes[1]));
+
+  SmallVector<Value> tiledOperands;
+  SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
 
-  sliceOffsets.append({zeroAttr, mappedOffset1, mappedOffset2, zeroAttr});
-  sliceSizes.append({sizes[4], alphaHAttr, alphaWAttr, sizes[5]});
-  SmallVector<OpFoldResult> inputStrides(4, oneAttr);
+  OpFoldResult offsetH =
+      inputH != 1 ? OpFoldResult(mappedOffsetH) : OpFoldResult(zeroAttr);
+  OpFoldResult offsetW =
+      inputW != 1 ? OpFoldResult(mappedOffsetW) : OpFoldResult(zeroAttr);
+  sliceOffsets.append({offsets[2], offsetH, offsetW, offsets[3]});
+  OpFoldResult sizeH =
+      inputH != 1 ? OpFoldResult(mappedSizeH) : OpFoldResult(oneAttr);
+  OpFoldResult sizeW =
+      inputW != 1 ? OpFoldResult(mappedSizeW) : OpFoldResult(oneAttr);
+  sliceSizes.append({sizes[2], sizeH, sizeW, sizes[3]});
+  int64_t inputRank = getInputOperandRank();
+  SmallVector<OpFoldResult> inputStrides(inputRank, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getInput(), sliceOffsets, sliceSizes, inputStrides));
 
@@ -3004,7 +3131,8 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
       resultSizes)))
     return failure();
 
-  SmallVector<OpFoldResult> outputStrides(6, oneAttr);
+  int64_t outputRank = getOutputOperandRank();
+  SmallVector<OpFoldResult> outputStrides(outputRank, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getOutput(), resultOffsets, resultSizes, outputStrides));
 
@@ -3032,7 +3160,8 @@ LogicalResult WinogradOutputTransformOp::verify() {
   bool leftTransform = valueH != 1;
   bool rightTransform = valueW != 1;
 
-  SmallVector<int64_t> expectedOutputShape(4, valueH);
+  int64_t outputRank = getOutputOperandRank();
+  SmallVector<int64_t> expectedOutputShape(outputRank, valueH);
   if (ShapedType::isDynamic(valueH) || ShapedType::isDynamic(valueTileH)) {
     expectedOutputShape[1] = ShapedType::kDynamic;
   } else {
@@ -3061,22 +3190,23 @@ LogicalResult WinogradOutputTransformOp::verify() {
 SmallVector<Range>
 WinogradOutputTransformOp::getIterationDomain(OpBuilder &builder) {
   Location loc = getLoc();
-  IndexType indexType = builder.getIndexType();
-  IntegerAttr zeroAttr = builder.getIntegerAttr(indexType, 0);
-  IntegerAttr oneAttr = builder.getIntegerAttr(indexType, 1);
+  IntegerAttr zeroAttr = builder.getIndexAttr(0);
+  IntegerAttr oneAttr = builder.getIndexAttr(1);
   Value value = getValue();
-  SmallVector<Range> loopBounds(6);
-  for (unsigned dim = 0; dim < 6; ++dim) {
-    loopBounds[dim].offset = zeroAttr;
-    loopBounds[dim].size = getDimValue(builder, loc, value, dim);
-    loopBounds[dim].stride = oneAttr;
+  int64_t valueRank = getValueOperandRank();
+  SmallVector<Range> loopBounds(valueRank - 2);
+  for (unsigned dim = 2; dim < valueRank; ++dim) {
+    loopBounds[dim - 2].offset = zeroAttr;
+    loopBounds[dim - 2].size = getDimValue(builder, loc, value, dim);
+    loopBounds[dim - 2].stride = oneAttr;
   }
   return loopBounds;
 }
 
 SmallVector<utils::IteratorType>
 WinogradOutputTransformOp::getLoopIteratorTypes() {
-  SmallVector<utils::IteratorType> iteratorTypes(6,
+  int64_t valueRank = getValueOperandRank();
+  SmallVector<utils::IteratorType> iteratorTypes(valueRank - 2,
                                                  utils::IteratorType::parallel);
   return iteratorTypes;
 }
 
@@ -3085,32 +3215,49 @@ LogicalResult WinogradOutputTransformOp::getResultTilePosition(
     OpBuilder &builder, unsigned resultNumber, ArrayRef<OpFoldResult> offsets,
     ArrayRef<OpFoldResult> sizes, SmallVector<OpFoldResult> &resultOffsets,
     SmallVector<OpFoldResult> &resultSizes) {
-  IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
-  Value output = getOutput();
-  auto outputType = cast<ShapedType>(output.getType());
-  ArrayRef<int64_t> outputShape = outputType.getShape();
-  int64_t outputH = outputShape[1];
-  int64_t outputW = outputShape[2];
   int64_t m = getM();
-  IntegerAttr heightM = builder.getI64IntegerAttr(outputH != 1 ? m : 1);
-  IntegerAttr widthM = builder.getI64IntegerAttr(outputW != 1 ? m : 1);
 
   Location loc = getLoc();
   MLIRContext *context = builder.getContext();
   auto affineMap =
       AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context);
-  Value mappedOffset1 = builder.create<affine::AffineApplyOp>(
+  Value mappedOffsetH = builder.create<affine::AffineApplyOp>(
       loc, affineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[2]));
-  Value mappedOffset2 = builder.create<affine::AffineApplyOp>(
+      getValueOrCreateConstantIndexOp(builder, loc, offsets[0]));
+  Value mappedOffsetW = builder.create<affine::AffineApplyOp>(
       loc, affineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[3]));
+      getValueOrCreateConstantIndexOp(builder, loc, offsets[1]));
+  Value mappedSizeH = builder.create<affine::AffineApplyOp>(
+      loc, affineMap, getValueOrCreateConstantIndexOp(builder, loc, sizes[0]));
+  Value mappedSizeW = builder.create<affine::AffineApplyOp>(
+      loc, affineMap, getValueOrCreateConstantIndexOp(builder, loc, sizes[1]));
 
-  resultOffsets.append({zeroAttr, mappedOffset1, mappedOffset2, zeroAttr});
-  resultSizes.append({sizes[4], heightM, widthM, sizes[5]});
+  ShapedType valueType = getValueOperandType();
+  ArrayRef<int64_t> valueShape = valueType.getShape();
+  int64_t valueH = valueShape[0];
+  int64_t valueW = valueShape[1];
+  IntegerAttr oneAttr = builder.getI64IntegerAttr(1);
+  IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
+  OpFoldResult offsetH =
+      valueH != 1 ? OpFoldResult(mappedOffsetH) : OpFoldResult(zeroAttr);
+  OpFoldResult offsetW =
+      valueW != 1 ? OpFoldResult(mappedOffsetW) : OpFoldResult(zeroAttr);
+  OpFoldResult sizeH =
+      valueH != 1 ? OpFoldResult(mappedSizeH) : OpFoldResult(oneAttr);
+  OpFoldResult sizeW =
+      valueW != 1 ? OpFoldResult(mappedSizeW) : OpFoldResult(oneAttr);
+
+  resultOffsets.append({offsets[2], offsetH, offsetW, offsets[3]});
+  resultSizes.append({sizes[2], sizeH, sizeW, sizes[3]});
 
   return success();
 }
 
+/// Implement tiling for winograd_output_transform.
+/// The input of winograd_output_transform is (alphaH, alphaW, tileH, tileW, N,
+/// F). The output of winograd_output_transform is (N, H, W, F). Users can
+/// specify the tile sizes of tileH, tileW, N, and F. `offsets` are the values
+/// for the offsets of tileH, tileW, N, F for one tile. `sizes` are the values
+/// for the sizes of tileH, tileW, N, F for one tile.
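+///
+/// For example (an illustrative note, with m = 4 as in the tests): the tile
+/// at offsets [%h, %w, %n, %f] with sizes [1, 1, 1, 1] reads the value slice
+/// at [0, 0, %h, %w, %n, %f] and writes the output slice at
+/// [%n, %h * 4, %w * 4, %f] with sizes [1, 4, 4, 1].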
 FailureOr<TilingResult> WinogradOutputTransformOp::getTiledImplementation(
     OpBuilder &builder, ArrayRef<OpFoldResult> offsets,
     ArrayRef<OpFoldResult> sizes) {
@@ -3120,10 +3267,19 @@ FailureOr<TilingResult> WinogradOutputTransformOp::getTiledImplementation(
   SmallVector<Value> tiledOperands;
   SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
 
+  ShapedType valueType = getValueOperandType();
+  ArrayRef<int64_t> valueShape = valueType.getShape();
+  int64_t alphaH = valueShape[0];
+  int64_t alphaW = valueShape[1];
+  IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
+  IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
+
   sliceOffsets.append(
-      {zeroAttr, zeroAttr, offsets[2], offsets[3], zeroAttr, zeroAttr});
-  sliceSizes.append({sizes[0], sizes[1], oneAttr, oneAttr, sizes[4], sizes[5]});
-  SmallVector<OpFoldResult> sliceStrides(6, oneAttr);
+      {zeroAttr, zeroAttr, offsets[0], offsets[1], offsets[2], offsets[3]});
+  sliceSizes.append(
+      {alphaHAttr, alphaWAttr, sizes[0], sizes[1], sizes[2], sizes[3]});
+  int64_t valueRank = getValueOperandRank();
+  SmallVector<OpFoldResult> sliceStrides(valueRank, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getValue(), sliceOffsets, sliceSizes, sliceStrides));
 
@@ -3132,7 +3288,8 @@ FailureOr<TilingResult> WinogradOutputTransformOp::getTiledImplementation(
       resultSizes)))
     return failure();
 
-  SmallVector<OpFoldResult> strides(4, oneAttr);
+  int64_t outputRank = getOutputOperandRank();
+  SmallVector<OpFoldResult> strides(outputRank, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getOutput(), resultOffsets, resultSizes, strides));
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
index c01a5f4b055f1..b65b18699a15a 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
@@ -490,8 +490,6 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
   Type elementType = inputType.getElementType();
   auto inputShape = inputType.getShape(); // N, H, W, C
   int64_t inputN = inputShape[0];
-  int64_t inputH = inputShape[1];
-  int64_t inputW = inputShape[2];
   int64_t inputC = inputShape[3];
   auto valueType = cast<ShapedType>(retValue.getType());
   auto valueShape = valueType.getShape(); // alphaH, alphaW, HTile, WTile, N, C
   int64_t tileH = valueShape[2];
   int64_t tileW = valueShape[3];
   int64_t alphaH = leftTransform ? m + r - 1 : 1;
   int64_t alphaW = rightTransform ?
m + r - 1 : 1; - if ((inputH != (tileH * m) + (r - 1)) && inputH != 1) - return Value(); - if ((inputW != (tileW * m) + (r - 1)) && inputW != 1) - return Value(); - auto buildBody = [&](OpBuilder &builder, Location loc, ValueRange ivs, ValueRange args) -> scf::ValueVector { Value tileHIter = ivs[0]; diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index 6bb3fb1423edc..54eb755047f18 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -18,9 +18,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) @@ -129,9 +129,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) 
-> !transform.any_op %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) @@ -235,9 +235,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir new file mode 100644 index 0000000000000..b1b2818db38fc --- /dev/null +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -0,0 +1,392 @@ +// RUN: mlir-opt %s -transform-interpreter --split-input-file | FileCheck %s + +func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { + %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + return %0 : tensor<6x6x5x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK-LABEL: func.func @tile_winograd_filter( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> +// CHECK: 
%[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> +// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> + +// ----- + +func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { + %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x3x5xf32>) outs(%arg1 : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> + return %0 : tensor<6x6x5x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (2, -d0 + 5)> +// CHECK-LABEL: func.func @tile_winograd_filter( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C2_1]] +// CHECK: %[[C5_2:.*]] = arith.constant 5 : index +// CHECK: %[[S3:.*]] = affine.min #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, %[[S3]]] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32> +// CHECK: %[[EXTRACTED_SLICE_3:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, %[[S3]], 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x?x1xf32> +// CHECK: %[[S4:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_3]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> + +// ----- + +func.func @tile_winograd_filter(%arg0: tensor<2x3x1x5xf32>, %arg1: tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> { + %0 = linalg.winograd_filter_transform m(4) r(3) ins(%arg0 : tensor<2x3x1x5xf32>) outs(%arg1 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> + return %0 : tensor<6x1x5x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK-LABEL: func.func @tile_winograd_filter( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x5xf32>, %[[ARG1:.*]]: tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for 
%[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> +// CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> +// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> + +// ----- + +func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { + %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + return %0 : tensor<6x6x2x2x2x5xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop3:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK-LABEL: func.func @tile_winograd_input( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[C1_3:.*]] = arith.constant 1 : index +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP1]](%[[C1_3]]) +// CHECK: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]](%[[C1_4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x5xf32> +// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S7:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_5]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> + +// ----- + +func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { + %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + return %0 : 
tensor<6x6x2x2x2x5xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK-LABEL: func.func @tile_winograd_input( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] +// CHECK: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[C1_8]]) +// CHECK: %[[C1_9:.*]] = arith.constant 1 : index +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]](%[[C1_9]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S5]], %[[S6]], %[[ARG8]]] [1, %[[S7]], %[[S8]], 1] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<1x?x?x1xf32> +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> + +// ----- + +func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { + %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x10x10x5xf32>) outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> + return %0 : tensor<6x6x2x2x2x5xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = 
affine_map<(d0) -> (2, -d0 + 5)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK-LABEL: func.func @tile_winograd_input( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] +// CHECK: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK: %[[C2_2:.*]] = arith.constant 2 : index +// CHECK: %[[C2_3:.*]] = arith.constant 2 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] +// CHECK: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK: %[[C2_5:.*]] = arith.constant 2 : index +// CHECK: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C2_5]] step %[[C2_6]] +// CHECK: %[[C0_7:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2_8:.*]] = arith.constant 2 : index +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_7]] to %[[C5]] step %[[C2_8]] +// CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG8]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]](%[[ARG2]]) +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[ARG4]]) +// CHECK: %[[C2_10:.*]] = arith.constant 2 : index +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[C2_10]]) +// CHECK: %[[C2_11:.*]] = arith.constant 2 : index +// CHECK: %[[S9:.*]] = affine.apply #[[$MAP2]](%[[C2_11]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S6]], %[[S7]], %[[ARG8]]] [2, %[[S8]], %[[S9]], %[[S5]]] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x?xf32> +// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, 2, %[[S5]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x2x2x2x?xf32> +// CHECK: %[[S10:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_12]] : tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> + +// ----- + +func.func @tile_winograd_input(%arg0: tensor<2x1x10x5xf32>, %arg1: tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> { + %0 = linalg.winograd_input_transform m(4) r(3) ins(%arg0 : tensor<2x1x10x5xf32>) outs(%arg1 : tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> + return %0 : tensor<1x6x1x2x2x5xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK-LABEL: func.func @tile_winograd_input( +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x1x10x5xf32>, %[[ARG1:.*]]: tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C1_0:.*]] = arith.constant 1 : index +// CHECK: 
%[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C1]] step %[[C1_0]] +// CHECK: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_2]] +// CHECK: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] +// CHECK: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[C1_8]]) +// CHECK: %[[C1_9:.*]] = arith.constant 1 : index +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]](%[[C1_9]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], 0, %[[S6]], %[[ARG8]]] [1, 1, %[[S8]], 1] [1, 1, 1, 1] : tensor<2x1x10x5xf32> to tensor<1x1x?x1xf32> +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x2x2x5xf32> to tensor<1x6x1x1x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> + +// ----- + +func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x2x2xf32>, %arg1: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { + %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x2x2x2x2xf32>) outs(%arg1 : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> + return %0 : tensor<2x8x8x2xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK-LABEL: func.func @tile_winograd_output( +// CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x2x2xf32>, %[[ARG1:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[C1_3:.*]] = arith.constant 1 : index +// CHECK: %[[S5:.*]] = 
affine.apply #[[$MAP0]](%[[C1_3]])
+// CHECK: %[[C1_4:.*]] = arith.constant 1 : index
+// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[C1_4]])
+// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32>
+
+// -----
+
+func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x3x5xf32>, %arg1: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> {
+  %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x6x2x2x3x5xf32>) outs(%arg1 : tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32>
+  return %0 : tensor<3x8x8x5xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (2, -d0 + 3)>
+// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (2, -d0 + 5)>
+// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4)>
+// CHECK-LABEL: func.func @tile_winograd_output(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> {
+// CHECK: %[[S0:.*]] = tensor.empty() : tensor<3x8x8x5xf32>
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[C2:.*]] = arith.constant 2 : index
+// CHECK: %[[C2_0:.*]] = arith.constant 2 : index
+// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]]
+// CHECK: %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK: %[[C2_2:.*]] = arith.constant 2 : index
+// CHECK: %[[C2_3:.*]] = arith.constant 2 : index
+// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]]
+// CHECK: %[[C0_4:.*]] = arith.constant 0 : index
+// CHECK: %[[C3:.*]] = arith.constant 3 : index
+// CHECK: %[[C2_5:.*]] = arith.constant 2 : index
+// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]]
+// CHECK: %[[C0_6:.*]] = arith.constant 0 : index
+// CHECK: %[[C5:.*]] = arith.constant 5 : index
+// CHECK: %[[C2_7:.*]] = arith.constant 2 : index
+// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]]
+// CHECK: %[[C3_8:.*]] = arith.constant 3 : index
+// CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]])
+// CHECK: %[[C5_9:.*]] = arith.constant 5 : index
+// CHECK: %[[S6:.*]] = affine.min #[[$MAP1]](%[[ARG8]])
+// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, %[[S5]], %[[S6]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x3x5xf32> to tensor<6x6x2x2x?x?xf32>
+// CHECK: %[[S7:.*]] = affine.apply #[[$MAP2]](%[[ARG2]])
+// CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[ARG4]])
+// CHECK: %[[C2_10:.*]] = arith.constant 2 : index
+// CHECK: %[[S9:.*]] = affine.apply #[[$MAP2]](%[[C2_10]])
+// CHECK: %[[C2_11:.*]] = arith.constant 2 : index
+// CHECK: %[[S10:.*]] = affine.apply #[[$MAP2]](%[[C2_11]])
+// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor<?x?x?x?xf32>
+
+// -----
+
+func.func @tile_winograd_output(%arg0 : tensor<6x1x2x1x3x5xf32>, %arg1:
tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> { + %0 = linalg.winograd_output_transform m(4) r(3) ins(%arg0 : tensor<6x1x2x1x3x5xf32>) outs(%arg1 : tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> + return %0 : tensor<3x8x1x5xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op + %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + transform.yield + } +} + +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK-LABEL: func.func @tile_winograd_output( +// CHECK-SAME: %[[ARG0:.*]]: tensor<6x1x2x1x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] +// CHECK: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK: %[[C3:.*]] = arith.constant 3 : index +// CHECK: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] +// CHECK: %[[C0_5:.*]] = arith.constant 0 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C1_6:.*]] = arith.constant 1 : index +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[C1_7]]) +// CHECK: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[C1_8]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> From 209b4e99388463245698ca0cc201a2a664d7719d Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Sun, 4 Aug 2024 02:36:04 +0100 Subject: [PATCH 6/9] Update test cases --- .../Linalg/transform-tile-winograd.mlir | 189 +++++++++--------- 1 file changed, 94 insertions(+), 95 deletions(-) diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir index b1b2818db38fc..2cad4fa37530d 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -15,13 +15,13 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, 
%[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C1_1:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> // CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> @@ -42,16 +42,16 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (2, -d0 + 5)> +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C2_1:.*]] = arith.constant 2 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C2_1]] // CHECK: %[[C5_2:.*]] = arith.constant 5 : index // CHECK: %[[S3:.*]] = affine.min #[[$MAP0]](%[[ARG4]]) @@ -76,13 +76,13 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x5xf32>, %[[ARG1:.*]]: tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C1_1:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 1, 1] [1, 1, 1, 1] : 
tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> // CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> @@ -107,13 +107,13 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C2_1:.*]] = arith.constant 2 : index -// CHECK: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] // CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -144,21 +144,21 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C2_1:.*]] = arith.constant 2 : index -// CHECK: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] -// CHECK: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK: %[[C2_4:.*]] = arith.constant 2 : index -// CHECK: %[[C1_5:.*]] = arith.constant 1 : index // CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] -// CHECK: %[[C0_6:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C1_7:.*]] = arith.constant 1 : index // CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] // CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -185,26 +185,26 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (2, -d0 + 5)> 
+// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_8:.*]] = arith.constant 2 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] -// CHECK: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[C2_2:.*]] = arith.constant 2 : index -// CHECK: %[[C2_3:.*]] = arith.constant 2 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] -// CHECK: %[[C0_4:.*]] = arith.constant 0 : index -// CHECK: %[[C2_5:.*]] = arith.constant 2 : index -// CHECK: %[[C2_6:.*]] = arith.constant 2 : index // CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C2_5]] step %[[C2_6]] -// CHECK: %[[C0_7:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C2_8:.*]] = arith.constant 2 : index // CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_7]] to %[[C5]] step %[[C2_8]] // CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG8]]) // CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]](%[[ARG2]]) @@ -236,21 +236,21 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x1x10x5xf32>, %[[ARG1:.*]]: tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C1_0:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C1]] step %[[C1_0]] -// CHECK: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_2]] -// CHECK: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK: %[[C2_4:.*]] = arith.constant 2 
: index -// CHECK: %[[C1_5:.*]] = arith.constant 1 : index // CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] -// CHECK: %[[C0_6:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C1_7:.*]] = arith.constant 1 : index // CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] // CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -280,13 +280,13 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK-LABEL: func.func @tile_winograd_output( // CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x2x2xf32>, %[[ARG1:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C2_1:.*]] = arith.constant 2 : index -// CHECK: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> // CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) @@ -312,27 +312,26 @@ module attributes {transform.with_named_sequence} { } } -// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (2, -d0 + 3)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (2, -d0 + 5)> +// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 3, 2)> +// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> // CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK-LABEL: func.func @tile_winograd_output( // CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<3x8x8x5xf32> -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] -// CHECK: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK: %[[C2_2:.*]] = arith.constant 2 : index -// CHECK: %[[C2_3:.*]] = arith.constant 2 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] -// CHECK: 
%[[C0_4:.*]] = arith.constant 0 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: %[[C2_5:.*]] = arith.constant 2 : index // CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] -// CHECK: %[[C0_6:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C2_7:.*]] = arith.constant 2 : index // CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] // CHECK: %[[C3_8:.*]] = arith.constant 3 : index // CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]]) @@ -365,21 +364,21 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK-LABEL: func.func @tile_winograd_output( // CHECK-SAME: %[[ARG0:.*]]: tensor<6x1x2x1x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> { -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index // CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK: %[[C1_1:.*]] = arith.constant 1 : index -// CHECK: %[[C1_2:.*]] = arith.constant 1 : index // CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] -// CHECK: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: %[[C1_4:.*]] = arith.constant 1 : index // CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] -// CHECK: %[[C0_5:.*]] = arith.constant 0 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C1_6:.*]] = arith.constant 1 : index // CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> // CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) From f7a02dbbf2967e5a97d42990277fbef0932ace53 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Wed, 14 Aug 2024 04:12:37 +0100 Subject: [PATCH 7/9] Address Max191's comments --- .../mlir/Dialect/Linalg/IR/LinalgOps.td | 54 ++ mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 183 +++--- .../transform-tile-and-winograd-rewrite.mlir | 272 ++++---- .../Linalg/transform-tile-winograd.mlir | 595 ++++++++++++------ 4 files changed, 690 insertions(+), 414 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index aaf243e015726..e82c654afe86a 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -208,6 +208,18 @@ def Linalg_WinogradFilterTransformOp : Linalg_Op<"winograd_filter_transform", int64_t getOutputOperandRank() { 
return getOutputOperandType().getRank();
    }
+    int64_t getFilterFDim() {
+      return 0;
+    }
+    int64_t getFilterHDim() {
+      return 1;
+    }
+    int64_t getFilterWDim() {
+      return 2;
+    }
+    int64_t getFilterCDim() {
+      return 3;
+    }
   }];
   let hasVerifier = 1;
 }
@@ -266,6 +278,30 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform",
     int64_t getOutputOperandRank() {
       return getOutputOperandType().getRank();
     }
+    int64_t getInputNDim() {
+      return 0;
+    }
+    int64_t getInputHDim() {
+      return 1;
+    }
+    int64_t getInputWDim() {
+      return 2;
+    }
+    int64_t getInputCDim() {
+      return 3;
+    }
+    int64_t getOutputTileHDim() {
+      return 2;
+    }
+    int64_t getOutputTileWDim() {
+      return 3;
+    }
+    int64_t getOutputNDim() {
+      return 4;
+    }
+    int64_t getOutputCDim() {
+      return 5;
+    }
   }];
   let hasVerifier = 1;
 }
@@ -324,6 +360,24 @@ def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform",
     int64_t getOutputOperandRank() {
      return getOutputOperandType().getRank();
     }
+    int64_t getValueAlphaHDim() {
+      return 0;
+    }
+    int64_t getValueAlphaWDim() {
+      return 1;
+    }
+    int64_t getValueTileHDim() {
+      return 2;
+    }
+    int64_t getValueTileWDim() {
+      return 3;
+    }
+    int64_t getValueNDim() {
+      return 4;
+    }
+    int64_t getValueFDim() {
+      return 5;
+    }
   }];
   let hasVerifier = 1;
 }
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index ee633a715cc9e..b30e7e8db393e 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2855,8 +2855,8 @@ FailureOr<SmallVector<Value>> SoftmaxOp::decomposeOperation(OpBuilder &b) {
 LogicalResult WinogradFilterTransformOp::verify() {
   auto filterType = cast<ShapedType>(getFilter().getType());
   ArrayRef<int64_t> filterShape = filterType.getShape();
-  int64_t filterH = filterShape[1];
-  int64_t filterW = filterShape[2];
+  int64_t filterH = filterShape[getFilterHDim()];
+  int64_t filterW = filterShape[getFilterWDim()];
   int64_t r = getR();
   int64_t m = getM();
@@ -2870,8 +2870,8 @@ LogicalResult WinogradFilterTransformOp::verify() {
   SmallVector<int64_t> expectedOutputShape;
   expectedOutputShape.push_back(filterH == r ? m + r - 1 : 1);
   expectedOutputShape.push_back(filterW == r ? m + r - 1 : 1);
-  expectedOutputShape.push_back(filterShape[3]);
-  expectedOutputShape.push_back(filterShape[0]);
+  expectedOutputShape.push_back(filterShape[getFilterCDim()]);
+  expectedOutputShape.push_back(filterShape[getFilterFDim()]);
 
   auto outputType = cast<ShapedType>(getOutput().getType());
   ArrayRef<int64_t> outputShape = outputType.getShape();
@@ -2887,26 +2887,20 @@ WinogradFilterTransformOp::getIterationDomain(OpBuilder &builder) {
   IntegerAttr zeroAttr = builder.getIndexAttr(0);
   IntegerAttr oneAttr = builder.getIndexAttr(1);
   Value filter = getFilter();
-  SmallVector<Range> loopBounds;
-  Range fRange;
-  fRange.offset = zeroAttr;
-  // 0 is the index of F in the filter.
-  fRange.size = getDimValue(builder, loc, filter, 0);
-  fRange.stride = oneAttr;
-  loopBounds.emplace_back(fRange);
-  Range cRange;
-  cRange.offset = zeroAttr;
-  // 3 is the index of C in the filter.
-  cRange.size = getDimValue(builder, loc, filter, 3);
-  cRange.stride = oneAttr;
-  loopBounds.emplace_back(cRange);
+  int64_t filterRank = getFilterOperandRank();
+  SmallVector<Range> loopBounds(filterRank);
+  for (unsigned dim = 0; dim < filterRank; ++dim) {
+    loopBounds[dim].offset = zeroAttr;
+    loopBounds[dim].size = getDimValue(builder, loc, filter, dim);
+    loopBounds[dim].stride = oneAttr;
+  }
   return loopBounds;
 }
 
 SmallVector<utils::IteratorType>
 WinogradFilterTransformOp::getLoopIteratorTypes() {
   int64_t filterRank = getFilterOperandRank();
-  SmallVector<utils::IteratorType> iteratorTypes(filterRank - 2,
+  SmallVector<utils::IteratorType> iteratorTypes(filterRank,
                                                  utils::IteratorType::parallel);
   return iteratorTypes;
 }
@@ -2918,8 +2912,8 @@ LogicalResult WinogradFilterTransformOp::getResultTilePosition(
   IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
   ShapedType filterType = getFilterOperandType();
   ArrayRef<int64_t> filterShape = filterType.getShape();
-  int64_t filterH = filterShape[1];
-  int64_t filterW = filterShape[2];
+  int64_t filterH = filterShape[getFilterHDim()];
+  int64_t filterW = filterShape[getFilterWDim()];
   int64_t m = getM();
   int64_t r = getR();
   int64_t alpha = m + r - 1;
@@ -2928,8 +2922,10 @@ LogicalResult WinogradFilterTransformOp::getResultTilePosition(
   IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
   IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
 
-  resultOffsets.append({zeroAttr, zeroAttr, offsets[1], offsets[0]});
-  resultSizes.append({alphaHAttr, alphaWAttr, sizes[1], sizes[0]});
+  resultOffsets.append(
+      {zeroAttr, zeroAttr, offsets[getFilterCDim()], offsets[getFilterFDim()]});
+  resultSizes.append(
+      {alphaHAttr, alphaWAttr, sizes[getFilterCDim()], sizes[getFilterFDim()]});
 
   return success();
 }
@@ -2938,8 +2934,8 @@ LogicalResult WinogradFilterTransformOp::getResultTilePosition(
 /// The input of winograd_filter_transform is (F, KH, KW, C).
 /// The output of winograd_filter_transform is (alphaH, alphaW, C, F)
 /// Users can specify the tile sizes of F and C.
-/// `offsets` are the values for the offsets of F, C for one tile.
-/// `sizes` are the values for the sizes of F, C for one tile.
+/// `offsets` are the values for the offsets of F, KH, KW, C for one tile.
+/// `sizes` are the values for the sizes of F, KH, KW, C for one tile.
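+///
+/// An illustrative sketch (the shapes follow the filter-transform tests in
+/// this patch; the SSA names below are hypothetical): with m = 4 and r = 3,
+/// so alpha = m + r - 1 = 6, tiling a tensor<2x3x3x5xf32> filter so that F
+/// and C are each tiled by 1 gives `offsets` = (f, 0, 0, c) and
+/// `sizes` = (1, 3, 3, 1) for one tile. Each tile extracts a
+/// tensor<1x3x3x1xf32> slice of the filter and produces a
+/// tensor<6x6x1x1xf32> tile of the (alphaH, alphaW, C, F) output:
+///
+///   %slice = tensor.extract_slice %filter[%f, 0, 0, %c] [1, 3, 3, 1]
+///       [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32>
+///   %tile = linalg.winograd_filter_transform m(4) r(3)
+///       ins(%slice : tensor<1x3x3x1xf32>)
+///       outs(%init : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32>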
 FailureOr<TilingResult> WinogradFilterTransformOp::getTiledImplementation(
     OpBuilder &builder, ArrayRef<OpFoldResult> offsets,
     ArrayRef<OpFoldResult> sizes) {
@@ -2947,15 +2943,17 @@ FailureOr<TilingResult> WinogradFilterTransformOp::getTiledImplementation(
   IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
   ShapedType filterType = getFilterOperandType();
   ArrayRef<int64_t> filterShape = filterType.getShape();
-  int64_t filterH = filterShape[1];
-  int64_t filterW = filterShape[2];
+  int64_t filterH = filterShape[getFilterHDim()];
+  int64_t filterW = filterShape[getFilterWDim()];
   IntegerAttr filterHAttr = builder.getI64IntegerAttr(filterH);
   IntegerAttr filterWAttr = builder.getI64IntegerAttr(filterW);
   SmallVector<Value> tiledOperands;
   SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
 
-  sliceOffsets.append({offsets[0], zeroAttr, zeroAttr, offsets[1]});
-  sliceSizes.append({sizes[0], filterHAttr, filterWAttr, sizes[1]});
+  sliceOffsets.append(
+      {offsets[getFilterFDim()], zeroAttr, zeroAttr, offsets[getFilterCDim()]});
+  sliceSizes.append({sizes[getFilterFDim()], filterHAttr, filterWAttr,
+                     sizes[getFilterCDim()]});
   int64_t filterRank = getFilterOperandRank();
   SmallVector<OpFoldResult> filterStrides(filterRank, oneAttr);
   Location loc = getLoc();
@@ -2987,8 +2985,8 @@ FailureOr<TilingResult> WinogradFilterTransformOp::getTiledImplementation(
 LogicalResult WinogradInputTransformOp::verify() {
   auto inputType = cast<ShapedType>(getInput().getType());
   ArrayRef<int64_t> inputShape = inputType.getShape();
-  int64_t inputH = inputShape[1];
-  int64_t inputW = inputShape[2];
+  int64_t inputH = inputShape[getInputHDim()];
+  int64_t inputW = inputShape[getInputWDim()];
   int m = getM();
   int r = getR();
   int64_t tileSize = m + r - 1;
@@ -3010,8 +3008,8 @@ LogicalResult WinogradInputTransformOp::verify() {
     expectedOutputShape[1] = rightTransform ? tileSize : 1;
     expectedOutputShape[3] = rightTransform ? (inputW - (r - 1)) / m : 1;
   }
-  expectedOutputShape[4] = inputShape[0];
-  expectedOutputShape[5] = inputShape[3];
+  expectedOutputShape[4] = inputShape[getInputNDim()];
+  expectedOutputShape[5] = inputShape[getInputCDim()];
 
   auto outputType = cast<ShapedType>(getOutput().getType());
   ArrayRef<int64_t> outputShape = outputType.getShape();
@@ -3028,11 +3026,12 @@ WinogradInputTransformOp::getIterationDomain(OpBuilder &builder) {
   IntegerAttr oneAttr = builder.getIndexAttr(1);
   Value output = getOutput();
   int64_t outputRank = getOutputOperandRank();
-  SmallVector<Range> loopBounds(outputRank - 2);
-  for (unsigned dim = 2; dim < outputRank; ++dim) {
-    loopBounds[dim - 2].offset = zeroAttr;
-    loopBounds[dim - 2].size = getDimValue(builder, loc, output, dim);
-    loopBounds[dim - 2].stride = oneAttr;
+  SmallVector<Range> loopBounds(outputRank);
+  for (unsigned dim = 0; dim < outputRank; ++dim) {
+    loopBounds[dim].offset = zeroAttr;
+    // alphaH, alphaW, tileH, tileW, N, C
+    loopBounds[dim].size = getDimValue(builder, loc, output, dim);
+    loopBounds[dim].stride = oneAttr;
   }
   return loopBounds;
 }
@@ -3040,7 +3039,7 @@ WinogradInputTransformOp::getIterationDomain(OpBuilder &builder) {
 SmallVector<utils::IteratorType>
 WinogradInputTransformOp::getLoopIteratorTypes() {
   int64_t outputRank = getOutputOperandRank();
-  SmallVector<utils::IteratorType> iteratorTypes(outputRank - 2,
+  SmallVector<utils::IteratorType> iteratorTypes(outputRank,
                                                  utils::IteratorType::parallel);
   return iteratorTypes;
 }
@@ -3052,8 +3051,8 @@ LogicalResult WinogradInputTransformOp::getResultTilePosition(
   IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
   ShapedType inputType = getInputOperandType();
   ArrayRef<int64_t> inputShape = inputType.getShape();
-  int64_t inputH = inputShape[1];
-  int64_t inputW = inputShape[2];
+  int64_t inputH = inputShape[getInputHDim()];
+  int64_t inputW = inputShape[getInputWDim()];
   int64_t m = getM();
   int64_t r = getR();
   int64_t alpha = m + r - 1;
@@ -3062,10 +3061,12 @@ LogicalResult WinogradInputTransformOp::getResultTilePosition(
   IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
   IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
 
-  resultOffsets.append(
-      {zeroAttr, zeroAttr, offsets[0], offsets[1], offsets[2], offsets[3]});
-  resultSizes.append(
-      {alphaHAttr, alphaWAttr, sizes[0], sizes[1], sizes[2], sizes[3]});
+  resultOffsets.append({zeroAttr, zeroAttr, offsets[getOutputTileHDim()],
+                        offsets[getOutputTileWDim()], offsets[getOutputNDim()],
+                        offsets[getOutputCDim()]});
+  resultSizes.append({alphaHAttr, alphaWAttr, sizes[getOutputTileHDim()],
+                      sizes[getOutputTileWDim()], sizes[getOutputNDim()],
+                      sizes[getOutputCDim()]});
 
   return success();
 }
@@ -3084,8 +3085,8 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
   IntegerAttr zeroAttr = builder.getI64IntegerAttr(0);
   ShapedType inputType = getInputOperandType();
   ArrayRef<int64_t> inputShape = inputType.getShape();
-  int64_t inputH = inputShape[1];
-  int64_t inputW = inputShape[2];
+  int64_t inputH = inputShape[getInputHDim()];
+  int64_t inputW = inputShape[getInputWDim()];
   int64_t m = getM();
   int64_t r = getR();
@@ -3093,20 +3094,16 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
   MLIRContext *context = builder.getContext();
   auto offsetAffineMap =
       AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context);
-  Value mappedOffsetH = builder.create<affine::AffineApplyOp>(
-      loc, offsetAffineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[0]));
-  Value mappedOffsetW = builder.create<affine::AffineApplyOp>(
-      loc, offsetAffineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[1]));
+  Value mappedOffsetH = affine::makeComposedAffineApply(
+      builder, loc, offsetAffineMap, offsets[getOutputTileHDim()]);
+  Value mappedOffsetW = affine::makeComposedAffineApply(
+      builder, loc, offsetAffineMap, offsets[getOutputTileWDim()]);
   auto sizeAffineMap = AffineMap::get(
       1, 0, {builder.getAffineDimExpr(0) * m + (r - 1)}, context);
-  Value mappedSizeH = builder.create<affine::AffineApplyOp>(
-      loc, sizeAffineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, sizes[0]));
-  Value mappedSizeW = builder.create<affine::AffineApplyOp>(
-      loc, sizeAffineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, sizes[1]));
+  Value mappedSizeH = affine::makeComposedAffineApply(
+      builder, loc, sizeAffineMap, sizes[getOutputTileHDim()]);
+  Value mappedSizeW = affine::makeComposedAffineApply(
+      builder, loc, sizeAffineMap, sizes[getOutputTileWDim()]);
 
   SmallVector<Value> tiledOperands;
   SmallVector<OpFoldResult> sliceOffsets, sliceSizes;
@@ -3115,12 +3112,14 @@ WinogradInputTransformOp::getTiledImplementation(OpBuilder &builder,
       inputH != 1 ? OpFoldResult(mappedOffsetH) : OpFoldResult(zeroAttr);
   OpFoldResult offsetW =
       inputW != 1 ? OpFoldResult(mappedOffsetW) : OpFoldResult(zeroAttr);
-  sliceOffsets.append({offsets[2], offsetH, offsetW, offsets[3]});
+  sliceOffsets.append(
+      {offsets[getOutputNDim()], offsetH, offsetW, offsets[getOutputCDim()]});
   OpFoldResult sizeH =
       inputH != 1 ? OpFoldResult(mappedSizeH) : OpFoldResult(oneAttr);
   OpFoldResult sizeW =
       inputW != 1 ? OpFoldResult(mappedSizeW) : OpFoldResult(oneAttr);
-  sliceSizes.append({sizes[2], sizeH, sizeW, sizes[3]});
+  sliceSizes.append(
+      {sizes[getOutputNDim()], sizeH, sizeW, sizes[getOutputCDim()]});
   int64_t inputRank = getInputOperandRank();
   SmallVector<OpFoldResult> inputStrides(inputRank, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
       loc, getInput(), sliceOffsets, sliceSizes, inputStrides));
@@ -3151,10 +3150,10 @@ LogicalResult WinogradOutputTransformOp::verify() {
   auto valueType = cast<ShapedType>(getValue().getType());
   ArrayRef<int64_t> valueShape = valueType.getShape();
-  int64_t valueH = valueShape[0];
-  int64_t valueW = valueShape[1];
-  int64_t valueTileH = valueShape[2];
-  int64_t valueTileW = valueShape[3];
+  int64_t valueH = valueShape[getValueAlphaHDim()];
+  int64_t valueW = valueShape[getValueAlphaWDim()];
+  int64_t valueTileH = valueShape[getValueTileHDim()];
+  int64_t valueTileW = valueShape[getValueTileWDim()];
   int m = getM();
   int r = getR();
   bool leftTransform = valueH != 1;
@@ -3176,8 +3175,8 @@ LogicalResult WinogradOutputTransformOp::verify() {
       return emitOpError("expect input width equals to input tile size");
     expectedOutputShape[2] = (rightTransform ? m : 1) * valueTileW;
   }
-  expectedOutputShape[0] = valueShape[4];
-  expectedOutputShape[3] = valueShape[5];
+  expectedOutputShape[0] = valueShape[getValueNDim()];
+  expectedOutputShape[3] = valueShape[getValueFDim()];
 
   auto outputType = cast<ShapedType>(getOutput().getType());
   ArrayRef<int64_t> outputShape = outputType.getShape();
@@ -3194,11 +3193,12 @@ WinogradOutputTransformOp::getIterationDomain(OpBuilder &builder) {
   IntegerAttr oneAttr = builder.getIndexAttr(1);
   Value value = getValue();
   int64_t valueRank = getValueOperandRank();
-  SmallVector<Range> loopBounds(valueRank - 2);
-  for (unsigned dim = 2; dim < valueRank; ++dim) {
-    loopBounds[dim - 2].offset = zeroAttr;
-    loopBounds[dim - 2].size = getDimValue(builder, loc, value, dim);
-    loopBounds[dim - 2].stride = oneAttr;
+  SmallVector<Range> loopBounds(valueRank);
+  for (unsigned dim = 0; dim < valueRank; ++dim) {
+    loopBounds[dim].offset = zeroAttr;
+    // alphaH, alphaW, tileH, tileW, N, F
+    loopBounds[dim].size = getDimValue(builder, loc, value, dim);
+    loopBounds[dim].stride = oneAttr;
   }
   return loopBounds;
 }
@@ -3206,7 +3206,7 @@ WinogradOutputTransformOp::getIterationDomain(OpBuilder &builder) {
 SmallVector<utils::IteratorType>
 WinogradOutputTransformOp::getLoopIteratorTypes() {
   int64_t valueRank = getValueOperandRank();
-  SmallVector<utils::IteratorType> iteratorTypes(valueRank - 2,
+  SmallVector<utils::IteratorType> iteratorTypes(valueRank,
                                                  utils::IteratorType::parallel);
   return iteratorTypes;
 }
@@ -3221,16 +3221,15 @@ LogicalResult WinogradOutputTransformOp::getResultTilePosition(
   MLIRContext *context = builder.getContext();
   auto affineMap =
       AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context);
-  Value mappedOffsetH = builder.create<affine::AffineApplyOp>(
-      loc, affineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[0]));
-  Value mappedOffsetW = builder.create<affine::AffineApplyOp>(
-      loc, affineMap,
-      getValueOrCreateConstantIndexOp(builder, loc, offsets[1]));
-  Value mappedSizeH = builder.create<affine::AffineApplyOp>(
-      loc, affineMap, getValueOrCreateConstantIndexOp(builder, loc, sizes[0]));
-  Value mappedSizeW = builder.create<affine::AffineApplyOp>(
-      loc, affineMap, getValueOrCreateConstantIndexOp(builder, loc, sizes[1]));
+
+  Value mappedOffsetH = affine::makeComposedAffineApply(
+      builder, loc, affineMap, offsets[getValueTileHDim()]);
+  Value mappedOffsetW = affine::makeComposedAffineApply(
+      builder, loc, affineMap, offsets[getValueTileWDim()]);
+  Value mappedSizeH = affine::makeComposedAffineApply(
+      builder, loc, affineMap, sizes[getValueTileHDim()]);
+  Value mappedSizeW = affine::makeComposedAffineApply(
+      builder, loc, affineMap, sizes[getValueTileWDim()]);
 
   ShapedType valueType = getValueOperandType();
   ArrayRef<int64_t> valueShape = valueType.getShape();
@@ -3247,8 +3246,10 @@ LogicalResult WinogradOutputTransformOp::getResultTilePosition(
   OpFoldResult sizeW =
       valueW != 1 ? OpFoldResult(mappedSizeW) : OpFoldResult(oneAttr);
-  resultOffsets.append({offsets[2], offsetH, offsetW, offsets[3]});
-  resultSizes.append({sizes[2], sizeH, sizeW, sizes[3]});
+  resultOffsets.append(
+      {offsets[getValueNDim()], offsetH, offsetW, offsets[getValueFDim()]});
+  resultSizes.append(
+      {sizes[getValueNDim()], sizeH, sizeW, sizes[getValueFDim()]});
 
   return success();
 }
@@ -3269,15 +3270,17 @@ FailureOr<TilingResult> WinogradOutputTransformOp::getTiledImplementation(
   ShapedType valueType = getValueOperandType();
   ArrayRef<int64_t> valueShape = valueType.getShape();
-  int64_t alphaH = valueShape[0];
-  int64_t alphaW = valueShape[1];
+  int64_t alphaH = valueShape[getValueAlphaHDim()];
+  int64_t alphaW = valueShape[getValueAlphaWDim()];
   IntegerAttr alphaHAttr = builder.getI64IntegerAttr(alphaH);
   IntegerAttr alphaWAttr = builder.getI64IntegerAttr(alphaW);
-  sliceOffsets.append(
-      {zeroAttr, zeroAttr, offsets[0], offsets[1], offsets[2], offsets[3]});
-  sliceSizes.append(
-      {alphaHAttr, alphaWAttr, sizes[0], sizes[1], sizes[2], sizes[3]});
+  sliceOffsets.append({zeroAttr, zeroAttr, offsets[getValueTileHDim()],
+                       offsets[getValueTileWDim()], offsets[getValueNDim()],
+                       offsets[getValueFDim()]});
+  sliceSizes.append({alphaHAttr, alphaWAttr, sizes[getValueTileHDim()],
+                     sizes[getValueTileWDim()], sizes[getValueNDim()],
+                     sizes[getValueFDim()]});
   int64_t valueRank = getValueOperandRank();
   SmallVector<OpFoldResult> sliceStrides(valueRank, oneAttr);
   tiledOperands.emplace_back(builder.create<tensor::ExtractSliceOp>(
diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
index 54eb755047f18..c86b33ba88ee3 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
@@ -18,9 +18,9 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
     %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %3, %loop3:4 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+    %5, %loop5:4 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op)
     %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op
     %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op)
@@ -36,6 +36,7 @@ module attributes {transform.with_named_sequence} {
 // CHECK-LABEL: func.func @conv2d
 // CHECK-SAME:  (%[[ARG0:.*]]:
tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[C6:.*]] = arith.constant 6 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index @@ -51,53 +52,49 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> // CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) -// CHECK: %[[S13:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] -// CHECK: %[[S15:.*]] = linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S17]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE_9]] -// CHECK: scf.yield %[[S13]] -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: scf.yield %[[S9]] -// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] -// CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S12]], %[[S13]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x6x6x5xf32> +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<6x6x1x1x2x5xf32>) { +// CHECK: %[[S15:.*]] = scf.for 
%[[ARG13:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<6x6x1x1x2x5xf32>) { +// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x6x6x5xf32> to tensor<6x6xf32> +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S19]] into %[[ARG14]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1x2x5xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32> +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> +// CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into tensor<36x8x5xf32> // CHECK: %[[S6:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> // CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x8x8x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] -// CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) -// CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[S20:.*]] = tensor.empty() -// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 -// CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S22]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE_9]] -// CHECK: scf.yield %[[S15]] -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: 
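Note: the CHECK lines above verify the decomposed form of the input transform. For every (N, C) pair the rewrite extracts one alpha x alpha tile and applies the transform as two small matmuls. A minimal standalone sketch of that inner body, assuming m = 4 and r = 3 (so alpha = 6), with %BT/%B standing for the Winograd B^T/B constant matrices and %input, %n, %c, %init0, %init1 as hypothetical values:

    %tile = tensor.extract_slice %input[%n, 0, 0, %c] [1, 6, 6, 1] [1, 1, 1, 1]
        : tensor<2x6x6x5xf32> to tensor<6x6xf32>
    %t0 = linalg.matmul ins(%BT, %tile : tensor<6x6xf32>, tensor<6x6xf32>)
                        outs(%init0 : tensor<6x6xf32>) -> tensor<6x6xf32>
    %t1 = linalg.matmul ins(%t0, %B : tensor<6x6xf32>, tensor<6x6xf32>)
                        outs(%init1 : tensor<6x6xf32>) -> tensor<6x6xf32>

The result is then written back with tensor.insert_slice, matching the INSERTED_SLICE checks above.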
scf.yield %[[S9]] +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -> (tensor<2x8x8x2xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x8x8x2xf32>) { +// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x8x8x2xf32>) { +// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x8x8x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S12]], %[[S13]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x4x4x2xf32> +// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: %[[S17:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x2xf32> to tensor<6x6xf32> +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S21:.*]] = linalg.matmul +// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x4xf32> +// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S24]] into %[[ARG14]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x4x4x2xf32> +// CHECK: %[[S15:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) +// CHECK: %[[S16:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, %[[S15]], %[[S16]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x8x8x2xf32> // ----- @@ -129,9 +126,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:4 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> 
!transform.any_op - %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:4 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) @@ -148,10 +145,12 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 // CHECK: %[[C3:.*]] = arith.constant 3 : index +// CHECK: %[[C6:.*]] = arith.constant 6 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() // CHECK: %[[S1:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) @@ -164,56 +163,55 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> // CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> -// CHECK: %[[S4:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S3]]) -// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) -// CHECK: %[[S13:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 6, 6, 1] [1, 1, 1, 1] -// CHECK: %[[S15:.*]] = linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S17]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE_12]] : tensor<6x6x1x1x2x5xf32> -// CHECK: scf.yield %[[S13]] : tensor<6x6x1x1x2x5xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: scf.yield %[[S9]] -// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] -// 
CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<6x6x3x3x2x5xf32>) { +// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][0, %[[S12]], %[[S13]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<2x6x6x5xf32> +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x5xf32> to tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<6x6x1x1x2x5xf32>) { +// CHECK: %[[S15:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<6x6x1x1x2x5xf32>) { +// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x6x6x5xf32> to tensor<6x6xf32> +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S19]] into %[[ARG14]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1x2x5xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x3x3x2x5xf32> +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> +// CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> // CHECK: %[[S6:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] -// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] { +// CHECK: ^bb0(%[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index): +// CHECK: tensor.yield %[[CST_6]] : f32 +// CHECK: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> // CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) -// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 
1, 1] -// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] -// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) -// CHECK: %[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 -// CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S22]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE_12]] -// CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: scf.yield %[[S9]] -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] -// CHECK: return %[[EXTRACTED_SLICE]] +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -> (tensor<2x12x12x2xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x12x12x2xf32>) { +// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { +// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S12]], %[[S13]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x4x4x2xf32> +// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: %[[S17:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<2x4x4x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x2xf32> to tensor<6x6xf32> +// CHECK: %[[S19:.*]] = 
linalg.matmul +// CHECK: %[[S21:.*]] = linalg.matmul +// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x4xf32> +// CHECK: %[[S24:.*]] = linalg.mul +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S24]] into %[[ARG14]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x4x4x2xf32> +// CHECK: %[[S15:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) +// CHECK: %[[S16:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, %[[S15]], %[[S16]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x12x12x2xf32> +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> +// CHECK: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> // ----- @@ -235,9 +233,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:4 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:4 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) @@ -251,42 +249,40 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func.func @conv2d_mx1_rx1 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { -// CHECK: %[[CST:.*]] = arith.constant 3.200000e+01 : f32 -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> -// CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) -// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = 
%[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: scf.yield %[[S7]] -// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> -// CHECK: %[[S3:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S2]]) -// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 6, 1, 1] [1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: scf.yield %[[S7]] -// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] -// CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] -// CHECK: %[[S5:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] -// CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) -// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[S10:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S11:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S10]] : tensor<4x1xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 -// CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S12:.*]] = linalg.mul ins(%[[S11]], %[[S9]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] -// CHECK: scf.yield %[[INSERTED_SLICE]] -// CHECK: scf.yield %[[S7]] -// CHECK: return %[[S6]] +// CHECK: %[[CST:.*]] = arith.constant 3.200000e+01 : f32 +// CHECK: %[[C6:.*]] = arith.constant 6 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> +// CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) -> (tensor<6x1x5x2xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x1x5x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<3x1xf32> +// CHECK: %[[S11:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] 
[1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x5x2xf32> +// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> +// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> +// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -> (tensor<6x1x1x1x2x5xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[S2]]) -> (tensor<6x1x1x1x2x5xf32>) { +// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x1x1x1x2x5xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG5]], 0, 0, %[[ARG7]]] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<2x6x1x5xf32> to tensor<6x1xf32> +// CHECK: %[[S12:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG8]][0, 0, 0, 0, %[[ARG5]], %[[ARG7]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x1x1x2x5xf32> +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> +// CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> +// CHECK: %[[S6:.*]] = linalg.batch_matmul +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> +// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x4x1x2xf32> +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -> (tensor<2x4x1x2xf32>) { +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG2]]) -> (tensor<2x4x1x2xf32>) { +// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x4x1x2xf32>) { +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG5]], %[[ARG7]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x2x2xf32> to tensor<6x1xf32> +// CHECK: %[[S12:.*]] = linalg.matmul +// CHECK: %[[S13:.*]] = tensor.empty() : tensor<4x1xf32> +// CHECK: %[[S14:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S13]] : tensor<4x1xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x1xf32> +// CHECK: %[[S15:.*]] = linalg.mul ins(%[[S14]], %[[S12]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S13]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG8]][%[[ARG5]], 0, 0, %[[ARG7]]] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<4x1xf32> into tensor<2x4x1x2xf32> diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir index 2cad4fa37530d..ea5d4e99c8731 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -8,24 +8,43 @@ func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> 
!transform.any_op - %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> -// CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> -// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C3_3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C3_3]] step %[[C1_5]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG8]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) outs(%[[EXTRACTED_SLICE_7]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S5]] into %[[ARG9]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : 
tensor<6x6x1x1xf32> into tensor<6x6x5x2xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<6x6x5x2xf32> // ----- @@ -37,27 +56,47 @@ func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> + // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[C3_3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C2_1]] -// CHECK: %[[C5_2:.*]] = arith.constant 5 : index -// CHECK: %[[S3:.*]] = affine.min #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, %[[S3]]] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32> -// CHECK: %[[EXTRACTED_SLICE_3:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, %[[S3]], 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x?x1xf32> -// CHECK: %[[S4:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_3]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> +// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C3_3]] step %[[C1_5]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x5x2xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C5]] step %[[C2_6]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x5x2xf32>) { +// 
CHECK-NEXT: %[[C5_7:.*]] = arith.constant 5 : index +// CHECK-NEXT: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG8]]] [1, 3, 3, %[[S5]]] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, %[[S5]], 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x?x1xf32> +// CHECK-NEXT: %[[S6:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_8]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S6]] into %[[ARG9]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, %[[S5]], 1] [1, 1, 1, 1] : tensor<6x6x?x1xf32> into tensor<6x6x5x2xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<6x6x5x2xf32> // ----- @@ -69,24 +108,43 @@ func.func @tile_winograd_filter(%arg0: tensor<2x3x1x5xf32>, %arg1: tensor<6x1x5x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x5xf32>, %[[ARG1:.*]]: tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> { +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> -// CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> -// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1_3:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index +// CHECK-DAG: 
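Note: in the tile_sizes [1, 1, 1, 2] case above, the C dimension (5) is not a multiple of the tile size 2, so the last tile is partial: the loop computes the slice size with affine.min and the sliced tensors carry a dynamic dimension. A minimal sketch, with %f and %c standing for the hypothetical F and C loop induction variables:

    %sz = affine.min affine_map<(d0) -> (-d0 + 5, 2)>(%c)
    %in = tensor.extract_slice %arg0[%f, 0, 0, %c] [1, 3, 3, %sz] [1, 1, 1, 1]
        : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32>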
%[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1_3]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x1x5x2xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x1x5x2xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C1]] step %[[C1_5]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x1x5x2xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x1x5x2xf32>) { +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG8]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_7]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S5]] into %[[ARG9]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x1x1xf32> into tensor<6x1x5x2xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x1x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x1x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x1x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x1x5x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<6x1x5x2xf32> // ----- @@ -98,32 +156,50 @@ func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK: #[[$MAP1:.+]] = affine_map<() -> (6)> + // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) -// CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[C1_3:.*]] = arith.constant 1 : index -// CHECK: 
%[[S5:.*]] = affine.apply #[[$MAP1]](%[[C1_3]]) -// CHECK: %[[C1_4:.*]] = arith.constant 1 : index -// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]](%[[C1_4]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x5xf32> -// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK: %[[S7:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_5]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C6_3:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_3]] step %[[C1_5]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_6]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_4]] step %[[C1_7]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S5]], %[[S6]], 0] [2, %[[S7]], %[[S8]], 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x5xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_8]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG9]][0, 0, %[[ARG6]], %[[ARG8]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<6x6x2x2x2x5xf32> // ----- @@ -135,40 +211,62 @@ func.func @tile_winograd_input(%arg0: 
tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK: #[[$MAP1:.+]] = affine_map<() -> (6)> + // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] -// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) -// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[C1_8]]) -// CHECK: %[[C1_9:.*]] = arith.constant 1 : index -// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]](%[[C1_9]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S5]], %[[S6]], %[[ARG8]]] [1, %[[S7]], %[[S8]], 1] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<1x?x?x1xf32> -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x1x1xf32> -// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C6_5:.*]] = arith.constant 6 : index +// CHECK-DAG: 
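Note: with the TilingInterface implemented, linalg.winograd_input_transform now reports a rank-6 iteration domain that follows its output shape, here (alphaH, alphaW, tileH, tileW, N, C) = (6, 6, 2, 2, 2, 5). That is where the %[[C6]] loop bounds come from, and why tiling all six dimensions yields six loop handles. For reference, the op being tiled in this test is:

    %r = linalg.winograd_input_transform m(4) r(3)
        ins(%arg0 : tensor<2x10x10x5xf32>)
        outs(%arg1 : tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32>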
%[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_10:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_11:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_12:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_5]] step %[[C1_8]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_9]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_6]] step %[[C1_10]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C2_7]] step %[[C1_11]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C1_12]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG10]], %[[S7]], %[[S8]], %[[ARG12]]] [1, %[[S9]], %[[S10]], 1] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<1x?x?x1xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_13:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x1x1xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_13]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG13]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x1x1xf32> into tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S6]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S5]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<6x6x2x2x2x5xf32> // ----- @@ -180,42 +278,65 @@ func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [2, 2, 2, 
2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4)> -// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK: #[[$MAP2:.+]] = affine_map<() -> (10)> + // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_7:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_8:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C2_5]] step %[[C2_6]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_7]] to %[[C5]] step %[[C2_8]] -// CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG8]]) -// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]](%[[ARG2]]) -// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[ARG4]]) -// CHECK: %[[C2_10:.*]] = arith.constant 2 : index -// CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[C2_10]]) -// CHECK: %[[C2_11:.*]] = arith.constant 2 : index -// CHECK: %[[S9:.*]] = affine.apply #[[$MAP2]](%[[C2_11]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S6]], %[[S7]], %[[ARG8]]] [2, %[[S8]], %[[S9]], %[[S5]]] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x?xf32> -// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, 2, %[[S5]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x2x2x2x?xf32> -// CHECK: %[[S10:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_12]] : tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C6_5:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index 
+// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2_9:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_10:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_11:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_12:.*]] = arith.constant 2 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_5]] step %[[C1_8]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C2_9]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_6]] step %[[C2_10]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C2_7]] step %[[C2_11]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C2_12]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<6x6x2x2x2x5xf32>) { +// CHECK-NEXT: %[[C5_13:.*]] = arith.constant 5 : index +// CHECK-NEXT: %[[S7:.*]] = affine.min #[[$MAP0]](%[[ARG12]]) +// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP1]](%[[ARG6]]) +// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]](%[[ARG8]]) +// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP2]]() +// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP2]]() +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG10]], %[[S8]], %[[S9]], %[[ARG12]]] [2, %[[S10]], %[[S11]], %[[S7]]] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x?xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_14:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 2, 2, 2, %[[S7]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x2x2x2x?xf32> +// CHECK-NEXT: %[[S12:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_14]] : tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG13]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 2, 2, 2, %[[S7]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x?xf32> into tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S6]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S5]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x2x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<6x6x2x2x2x5xf32> // ----- @@ -227,40 +348,62 @@ func.func @tile_winograd_input(%arg0: tensor<2x1x10x5xf32>, %arg1: tensor<1x6x1x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, 
!transform.any_op) + %1, %loop3:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> -// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4 + 2)> +// CHECK: #[[$MAP1:.+]] = affine_map<() -> (6)> + // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x1x10x5xf32>, %[[ARG1:.*]]: tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> { +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<1x6x1x2x2x5xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C1]] step %[[C1_0]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] -// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) -// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[C1_8]]) -// CHECK: %[[C1_9:.*]] = arith.constant 1 : index -// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]](%[[C1_9]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], 0, %[[S6]], %[[ARG8]]] [1, 1, %[[S8]], 1] [1, 1, 1, 1] : tensor<2x1x10x5xf32> to tensor<1x1x?x1xf32> -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x2x2x5xf32> to tensor<1x6x1x1x1x1xf32> -// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> +// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_10:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_11:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_12:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C1]] step %[[C1_7]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<1x6x1x2x2x5xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6]] step %[[C1_8]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<1x6x1x2x2x5xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] 
to %[[C1_5]] step %[[C1_9]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<1x6x1x2x2x5xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2]] step %[[C1_10]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<1x6x1x2x2x5xf32>) { +// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C2_6]] step %[[C1_11]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<1x6x1x2x2x5xf32>) { +// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C1_12]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<1x6x1x2x2x5xf32>) { +// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG10]], 0, %[[S8]], %[[ARG12]]] [1, 1, %[[S10]], 1] [1, 1, 1, 1] : tensor<2x1x10x5xf32> to tensor<1x1x?x1xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE_13:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x2x2x5xf32> to tensor<1x6x1x1x1x1xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_13]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG13]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x1x1x1xf32> into tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S6]] : tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S5]] : tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<1x6x1x2x2x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<1x6x1x2x2x5xf32> // ----- @@ -272,30 +415,54 @@ func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x2x2xf32>, %arg1: tensor<2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<() -> (4)> + // CHECK-LABEL: func.func @tile_winograd_output( // CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x2x2xf32>, %[[ARG1:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<2x8x8x2xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : 
index +// CHECK-DAG: %[[C6_3:.*]] = arith.constant 6 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> -// CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) -// CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[C1_3:.*]] = arith.constant 1 : index -// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[C1_3]]) -// CHECK: %[[C1_4:.*]] = arith.constant 1 : index -// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[C1_4]]) -// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_3]] step %[[C1_5]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_6]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_4]] step %[[C1_7]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<2x8x8x2xf32>) { +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG6]], %[[ARG8]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S5]], %[[S6]], 0] [2, %[[S7]], %[[S8]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x6x1x1x2x2xf32>) outs(%[[EXTRACTED_SLICE_8]] : tensor<2x?x?x2xf32>) -> tensor<2x?x?x2xf32> +// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S12:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S13:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG9]][0, %[[S10]], %[[S11]], 0] [2, %[[S12]], %[[S13]], 2] [1, 1, 1, 1] : tensor<2x?x?x2xf32> into tensor<2x8x8x2xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x8x8x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<2x8x8x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<2x8x8x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<2x8x8x2xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return 
%[[S1]] : tensor<2x8x8x2xf32>

// -----

@@ -307,7 +474,7 @@ func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x3x5xf32>, %arg1: tensor<3
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loop1:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
    transform.yield
  }
}
@@ -315,36 +482,65 @@ module attributes {transform.with_named_sequence} {
// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 3, 2)>
// CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 5, 2)>
// CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4)>
-// CHECK-LABEL: func.func @tile_winograd_output(
-// CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> {
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
-// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
-// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index
-// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]]
-// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]]
-// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]]
-// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]]
-// CHECK: %[[C3_8:.*]] = arith.constant 3 : index
-// CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]])
-// CHECK: %[[C5_9:.*]] = arith.constant 5 : index
-// CHECK: %[[S6:.*]] = affine.min #[[$MAP1]](%[[ARG8]])
-// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, %[[S5]], %[[S6]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x3x5xf32> to tensor<6x6x2x2x?x?xf32>
-// CHECK: %[[S7:.*]] = affine.apply #[[$MAP2]](%[[ARG2]])
-// CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[ARG4]])
-// CHECK: %[[C2_10:.*]] = arith.constant 2 : index
-// CHECK: %[[S9:.*]] = affine.apply #[[$MAP2]](%[[C2_10]])
-// CHECK: %[[C2_11:.*]] = arith.constant 2 : index
-// CHECK: %[[S10:.*]] = affine.apply #[[$MAP2]](%[[C2_11]])
-// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor<?x?x?x?xf32>
+// CHECK: #[[$MAP3:.+]] = affine_map<() -> (8)>
+
+// CHECK-LABEL: func.func @tile_winograd_output(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> {
+// CHECK: %[[S0:.*]] = tensor.empty() : tensor<3x8x8x5xf32>
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
+// CHECK-DAG: %[[C6_5:.*]] = arith.constant 6 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[C2_8:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_9:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_10:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_11:.*]] = arith.constant 2 : index
+// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<3x8x8x5xf32>) {
+// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_5]] step %[[C1_7]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<3x8x8x5xf32>) {
+// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C2_8]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<3x8x8x5xf32>) {
+// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_6]] step %[[C2_9]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<3x8x8x5xf32>) {
+// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C3]] step %[[C2_10]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<3x8x8x5xf32>) {
+// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C2_11]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<3x8x8x5xf32>) {
+// CHECK-NEXT: %[[C3_12:.*]] = arith.constant 3 : index
+// CHECK-NEXT: %[[S7:.*]] = affine.min #[[$MAP0]](%[[ARG10]])
+// CHECK-NEXT: %[[C5_13:.*]] = arith.constant 5 : index
+// CHECK-NEXT: %[[S8:.*]] = affine.min #[[$MAP1]](%[[ARG12]])
+// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 2, 2, %[[S7]], %[[S8]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x3x5xf32> to tensor<6x6x2x2x?x?xf32>
+// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP2]](%[[ARG6]])
+// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP2]](%[[ARG8]])
+// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP3]]()
+// CHECK-NEXT: %[[S12:.*]] = affine.apply #[[$MAP3]]()
+// CHECK-NEXT: %[[EXTRACTED_SLICE_14:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG10]], %[[S9]], %[[S10]], %[[ARG12]]] [%[[S7]], %[[S11]], %[[S12]], %[[S8]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor<?x?x?x?xf32>
+// CHECK-NEXT: %[[S13:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x6x2x2x?x?xf32>) outs(%[[EXTRACTED_SLICE_14]] : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+// CHECK-NEXT: %[[S14:.*]] = affine.apply #[[$MAP2]](%[[ARG6]])
+// CHECK-NEXT: %[[S15:.*]] = affine.apply #[[$MAP2]](%[[ARG8]])
+// CHECK-NEXT: %[[S16:.*]] = affine.apply #[[$MAP3]]()
+// CHECK-NEXT: %[[S17:.*]] = affine.apply #[[$MAP3]]()
+// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG13]][%[[ARG10]], %[[S14]], %[[S15]], %[[ARG12]]] [%[[S7]], %[[S16]], %[[S17]], %[[S8]]] [1, 1, 1, 1] : tensor<?x?x?x?xf32> into tensor<3x8x8x5xf32>
+// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<3x8x8x5xf32>
+// CHECK-NEXT: }
+// CHECK-NEXT: scf.yield %[[S6]] : tensor<3x8x8x5xf32>
+//
CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S5]] : tensor<3x8x8x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<3x8x8x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<3x8x8x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<3x8x8x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<3x8x8x5xf32> +// CHECK-NEXT: } // ----- @@ -356,36 +552,63 @@ func.func @tile_winograd_output(%arg0 : tensor<6x1x2x1x3x5xf32>, %arg1: tensor<3 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> +// CHECK: #[[$MAP1:.+]] = affine_map<() -> (4)> + // CHECK-LABEL: func.func @tile_winograd_output( // CHECK-SAME: %[[ARG0:.*]]: tensor<6x1x2x1x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x1x5xf32>) -> tensor<3x8x1x5xf32> { +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<3x8x1x5xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_5:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> -// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) -// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) -// CHECK: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[C1_7]]) -// CHECK: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[C1_8]]) -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : 
tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> -// CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> +// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_9:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_10:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_11:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1_6]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<3x8x1x5xf32>) { +// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1]] step %[[C1_7]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<3x8x1x5xf32>) { +// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_8]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<3x8x1x5xf32>) { +// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C1_5]] step %[[C1_9]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<3x8x1x5xf32>) { +// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_10]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<3x8x1x5xf32>) { +// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C1_11]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<3x8x1x5xf32>) { +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> +// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG10]], %[[S7]], 0, %[[ARG12]]] [1, %[[S9]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_12]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> +// CHECK-NEXT: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK-NEXT: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) +// CHECK-NEXT: %[[S14:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[S15:.*]] = affine.apply #[[$MAP1]]() +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG13]][%[[ARG10]], %[[S12]], 0, %[[ARG12]]] [1, %[[S14]], 1, 1] [1, 1, 1, 1] : tensor<1x?x1x1xf32> into tensor<3x8x1x5xf32> +// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<3x8x1x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S6]] : tensor<3x8x1x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S5]] : tensor<3x8x1x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S4]] : tensor<3x8x1x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S3]] : tensor<3x8x1x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: scf.yield %[[S2]] : tensor<3x8x1x5xf32> +// CHECK-NEXT: } +// CHECK-NEXT: return %[[S1]] : tensor<3x8x1x5xf32> From c786c4f1738ff13865ce1a9dce84cbe472cb7b75 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Fri, 16 Aug 2024 02:44:25 +0100 Subject: [PATCH 8/9] Add more extraClassDeclarations and update test cases --- .../mlir/Dialect/Linalg/IR/LinalgOps.td | 18 + mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 32 +- 
.../transform-tile-and-winograd-rewrite.mlir | 272 +++++----
 .../Linalg/transform-tile-winograd.mlir | 570 ++++++------
 4 files changed, 340 insertions(+), 552 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
index e82c654afe86a..5b6a90f806bed 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td
@@ -290,6 +290,12 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform",
    int64_t getInputCDim() {
      return 3;
    }
+    int64_t getOutputAlphaHDim() {
+      return 0;
+    }
+    int64_t getOutputAlphaWDim() {
+      return 1;
+    }
    int64_t getOutputTileHDim() {
      return 2;
    }
@@ -378,6 +384,18 @@ def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform",
    int64_t getValueFDim() {
      return 5;
    }
+    int64_t getOutputNDim() {
+      return 0;
+    }
+    int64_t getOutputHDim() {
+      return 1;
+    }
+    int64_t getOutputWDim() {
+      return 2;
+    }
+    int64_t getOutputFDim() {
+      return 3;
+    }
  }];
  let hasVerifier = 1;
}
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index b30e7e8db393e..67cd4272abdfd 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2995,21 +2995,21 @@ LogicalResult WinogradInputTransformOp::verify() {
  SmallVector<int64_t> expectedOutputShape(6, inputH);
  if (ShapedType::isDynamic(inputH)) {
-    expectedOutputShape[0] = tileSize;
-    expectedOutputShape[2] = ShapedType::kDynamic;
+    expectedOutputShape[getOutputAlphaHDim()] = tileSize;
+    expectedOutputShape[getOutputTileHDim()] = ShapedType::kDynamic;
  } else {
-    expectedOutputShape[0] = leftTransform ? tileSize : 1;
-    expectedOutputShape[2] = leftTransform ? (inputH - (r - 1)) / m : 1;
+    expectedOutputShape[getOutputAlphaHDim()] = leftTransform ? tileSize : 1;
+    expectedOutputShape[getOutputTileHDim()] = leftTransform ? (inputH - (r - 1)) / m : 1;
  }
  if (ShapedType::isDynamic(inputW)) {
-    expectedOutputShape[1] = tileSize;
-    expectedOutputShape[3] = ShapedType::kDynamic;
+    expectedOutputShape[getOutputAlphaWDim()] = tileSize;
+    expectedOutputShape[getOutputTileWDim()] = ShapedType::kDynamic;
  } else {
-    expectedOutputShape[1] = rightTransform ? tileSize : 1;
-    expectedOutputShape[3] = rightTransform ? (inputW - (r - 1)) / m : 1;
+    expectedOutputShape[getOutputAlphaWDim()] = rightTransform ? tileSize : 1;
+    expectedOutputShape[getOutputTileWDim()] = rightTransform ? (inputW - (r - 1)) / m : 1;
  }
-  expectedOutputShape[4] = inputShape[getInputNDim()];
-  expectedOutputShape[5] = inputShape[getInputCDim()];
+  expectedOutputShape[getOutputNDim()] = inputShape[getInputNDim()];
+  expectedOutputShape[getOutputCDim()] = inputShape[getInputCDim()];
  auto outputType = cast<ShapedType>(getOutput().getType());
  ArrayRef<int64_t> outputShape = outputType.getShape();
@@ -3162,21 +3162,21 @@ LogicalResult WinogradOutputTransformOp::verify() {
  int64_t outputRank = getOutputOperandRank();
  SmallVector<int64_t> expectedOutputShape(outputRank, valueH);
  if (ShapedType::isDynamic(valueH) || ShapedType::isDynamic(valueTileH)) {
-    expectedOutputShape[1] = ShapedType::kDynamic;
+    expectedOutputShape[getOutputHDim()] = ShapedType::kDynamic;
  } else {
    if (valueH != (leftTransform ? m + r - 1 : 1))
      return emitOpError("expect input height equals to input tile size");
-    expectedOutputShape[1] = (leftTransform ? m : 1) * valueTileH;
+    expectedOutputShape[getOutputHDim()] = (leftTransform ? m : 1) * valueTileH;
  }
  if (ShapedType::isDynamic(valueW) || ShapedType::isDynamic(valueTileW)) {
-    expectedOutputShape[2] = ShapedType::kDynamic;
+    expectedOutputShape[getOutputWDim()] = ShapedType::kDynamic;
  } else {
    if (valueW != (rightTransform ? m + r - 1 : 1))
      return emitOpError("expect input width equals to input tile size");
-    expectedOutputShape[2] = (rightTransform ? m : 1) * valueTileW;
+    expectedOutputShape[getOutputWDim()] = (rightTransform ? m : 1) * valueTileW;
  }
-  expectedOutputShape[0] = valueShape[getValueNDim()];
-  expectedOutputShape[3] = valueShape[getValueFDim()];
+  expectedOutputShape[getOutputNDim()] = valueShape[getValueNDim()];
+  expectedOutputShape[getOutputFDim()] = valueShape[getValueFDim()];
  auto outputType = cast<ShapedType>(getOutput().getType());
  ArrayRef<int64_t> outputShape = outputType.getShape();
diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
index c86b33ba88ee3..6bb3fb1423edc 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir
@@ -18,9 +18,9 @@ module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
    %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %3, %loop3:4 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
    %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %5, %loop5:4 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
    %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op)
    %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op
    %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op)
@@ -36,7 +36,6 @@ module attributes {transform.with_named_sequence} {
// CHECK-LABEL: func.func @conv2d
// CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> {
// CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32
-// CHECK: %[[C6:.*]] = arith.constant 6 : index
// CHECK: %[[C1:.*]] = arith.constant 1 : index
// CHECK: %[[C5:.*]] = arith.constant 5 : index
// CHECK: %[[C2:.*]] = arith.constant 2 : index
@@ -52,49 +51,53 @@ module attributes {transform.with_named_sequence} {
// CHECK: scf.yield %[[S9]]
// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32>
// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32>
-// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to 
%[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S12]], %[[S13]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x6x6x5xf32> -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: %[[S15:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x6x6x5xf32> to tensor<6x6xf32> -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S19]] into %[[ARG14]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1x2x5xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32> -// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> -// CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into tensor<36x8x5xf32> +// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) +// CHECK: %[[S13:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) +// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] +// CHECK: %[[S15:.*]] = linalg.matmul +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S17]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_9]] +// CHECK: scf.yield 
%[[S13]] +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] +// CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] // CHECK: %[[S6:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] // CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x8x8x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -> (tensor<2x8x8x2xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x8x8x2xf32>) { -// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x8x8x2xf32>) { -// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x8x8x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> -// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S12]], %[[S13]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x4x4x2xf32> -// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_7]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: %[[S17:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x2xf32> to tensor<6x6xf32> -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[S21:.*]] = linalg.matmul -// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 -// CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S24]] into %[[ARG14]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x4x4x2xf32> -// CHECK: %[[S15:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) -// CHECK: %[[S16:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, %[[S15]], %[[S16]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x8x8x2xf32> +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) +// CHECK: 
%[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) +// CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) +// CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S20:.*]] = tensor.empty() +// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S22]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_9]] +// CHECK: scf.yield %[[S15]] +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] // ----- @@ -126,9 +129,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:4 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %5, %loop5:4 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op %9 = 
transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) @@ -145,12 +148,10 @@ module attributes {transform.with_named_sequence} { // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 // CHECK: %[[C3:.*]] = arith.constant 3 : index -// CHECK: %[[C6:.*]] = arith.constant 6 : index // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() // CHECK: %[[S1:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) @@ -163,55 +164,56 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> // CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x6x3x3x2x5xf32> -// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -> (tensor<6x6x3x3x2x5xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x3x3x2x5xf32>) { -// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x6x3x3x2x5xf32>) { -// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<6x6x3x3x2x5xf32>) { -// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][0, %[[S12]], %[[S13]], 0] [2, 6, 6, 5] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<2x6x6x5xf32> -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: %[[S15:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<6x6x1x1x2x5xf32>) { -// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x6x6x5xf32> to tensor<6x6xf32> -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S19]] into %[[ARG14]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x1x1x2x5xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x3x3x2x5xf32> -// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> -// CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] : 
tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> +// CHECK: %[[S4:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S3]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][0, %[[S10]], %[[S11]], 0] [2, 6, 6, 5] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[S2]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) +// CHECK: %[[S13:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 6, 6, 1] [1, 1, 1, 1] +// CHECK: %[[S15:.*]] = linalg.matmul +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S17]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE_12]] : tensor<6x6x1x1x2x5xf32> +// CHECK: scf.yield %[[S13]] : tensor<6x6x1x1x2x5xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] +// CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] // CHECK: %[[S6:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> -// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] { -// CHECK: ^bb0(%[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index): -// CHECK: tensor.yield %[[CST_6]] : f32 -// CHECK: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] +// CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] // CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -> (tensor<2x12x12x2xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<2x12x12x2xf32>) { -// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { -// CHECK: %[[S11:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG7]], %[[ARG9]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6x1x1x2x2xf32> -// CHECK: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) -// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, 
%[[S12]], %[[S13]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x4x4x2xf32> -// CHECK: %[[S14:.*]] = scf.for %[[ARG11:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG12:.*]] = %[[EXTRACTED_SLICE_10]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: %[[S17:.*]] = scf.for %[[ARG13:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG14:.*]] = %[[ARG12]]) -> (tensor<2x4x4x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG11]], %[[ARG13]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x2xf32> to tensor<6x6xf32> -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[S21:.*]] = linalg.matmul -// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 -// CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.mul -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S24]] into %[[ARG14]][%[[ARG11]], 0, 0, %[[ARG13]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x4x4x2xf32> -// CHECK: %[[S15:.*]] = affine.apply #[[$MAP0]](%[[ARG7]]) -// CHECK: %[[S16:.*]] = affine.apply #[[$MAP0]](%[[ARG9]]) -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG10]][0, %[[S15]], %[[S16]], 0] [2, 4, 4, 2] [1, 1, 1, 1] : tensor<2x4x4x2xf32> into tensor<2x12x12x2xf32> -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> -// CHECK: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> +// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) +// CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) +// CHECK: %[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) +// CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S17:.*]] = linalg.matmul +// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: } -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S22]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: scf.yield 
%[[INSERTED_SLICE_12]] +// CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> +// CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S14:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, %[[S13]], %[[S14]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: scf.yield %[[INSERTED_SLICE]] +// CHECK: scf.yield %[[S9]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S8]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] +// CHECK: return %[[EXTRACTED_SLICE]] // ----- @@ -233,9 +235,9 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op %2 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %3, %loop3:4 = transform.structured.tile_using_for %2 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %3, %loop3:2 = transform.structured.tile_using_for %2 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %4 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %5, %loop5:4 = transform.structured.tile_using_for %4 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %5, %loop5:2 = transform.structured.tile_using_for %4 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %7 = transform.structured.decompose_winograd_op %0 : (!transform.any_op) -> (!transform.any_op) %8 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %3 : (!transform.any_op) -> !transform.any_op %9 = transform.structured.decompose_winograd_op %8 : (!transform.any_op) -> (!transform.any_op) @@ -249,40 +251,42 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP1:.+]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func.func @conv2d_mx1_rx1 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { -// CHECK: %[[CST:.*]] = arith.constant 3.200000e+01 : f32 -// CHECK: %[[C6:.*]] = arith.constant 6 : index -// CHECK: %[[C1:.*]] = arith.constant 1 : index -// CHECK: %[[C5:.*]] = arith.constant 5 : index -// CHECK: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[C0:.*]] = arith.constant 0 : index -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> -// CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) -> (tensor<6x1x5x2xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x1x5x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<3x1xf32> -// CHECK: %[[S11:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x5x2xf32> -// CHECK: %[[S2:.*]] = tensor.empty() : 
tensor<6x1x1x1x2x5xf32> -// CHECK: %[[S3:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> -// CHECK: %[[S4:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S3]]) -> (tensor<6x1x1x1x2x5xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[S2]]) -> (tensor<6x1x1x1x2x5xf32>) { -// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<6x1x1x1x2x5xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG5]], 0, 0, %[[ARG7]]] [1, 6, 1, 1] [1, 1, 1, 1] : tensor<2x6x1x5xf32> to tensor<6x1xf32> -// CHECK: %[[S12:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG8]][0, 0, 0, 0, %[[ARG5]], %[[ARG7]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1xf32> into tensor<6x1x1x1x2x5xf32> -// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> -// CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> -// CHECK: %[[S6:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x4x1x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) -> (tensor<2x4x1x2xf32>) { -// CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG2]]) -> (tensor<2x4x1x2xf32>) { -// CHECK: %[[S10:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x4x1x2xf32>) { -// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG5]], %[[ARG7]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x1x1x2x2xf32> to tensor<6x1xf32> -// CHECK: %[[S12:.*]] = linalg.matmul -// CHECK: %[[S13:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S14:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S13]] : tensor<4x1xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 -// CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S15:.*]] = linalg.mul ins(%[[S14]], %[[S12]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S13]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG8]][%[[ARG5]], 0, 0, %[[ARG7]]] [1, 4, 1, 1] [1, 1, 1, 1] : tensor<4x1xf32> into tensor<2x4x1x2xf32> +// CHECK: %[[CST:.*]] = arith.constant 3.200000e+01 : f32 +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C5:.*]] = arith.constant 5 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> +// CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) +// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S9:.*]] = linalg.matmul +// CHECK: %[[INSERTED_SLICE:.*]] = 
tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1]
+// CHECK: scf.yield %[[INSERTED_SLICE]]
+// CHECK: scf.yield %[[S7]]
+// CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32>
+// CHECK: %[[S3:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S2]])
+// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]])
+// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 6, 1, 1] [1, 1, 1, 1]
+// CHECK: %[[S9:.*]] = linalg.matmul
+// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1]
+// CHECK: scf.yield %[[INSERTED_SLICE]]
+// CHECK: scf.yield %[[S7]]
+// CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]]
+// CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]]
+// CHECK: %[[S5:.*]] = linalg.batch_matmul
+// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2]
+// CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]])
+// CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]])
+// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1]
+// CHECK: %[[S9:.*]] = linalg.matmul
+// CHECK: %[[S10:.*]] = tensor.empty() : tensor<4x1xf32>
+// CHECK: %[[S11:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S10]] : tensor<4x1xf32>) {
+// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32):
+// CHECK: linalg.yield %[[IN]] : f32
+// CHECK: } -> tensor<4x1xf32>
+// CHECK: %[[S12:.*]] = linalg.mul ins(%[[S11]], %[[S9]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32>
+// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1]
+// CHECK: scf.yield %[[INSERTED_SLICE]]
+// CHECK: scf.yield %[[S7]]
+// CHECK: return %[[S6]]

diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir
index ea5d4e99c8731..21522a2083b46 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir
@@ -8,43 +8,24 @@ func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

// CHECK-LABEL: func.func @tile_winograd_filter(
// CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> {
-// CHECK: %[[S0:.*]] = 
tensor.empty() : tensor<6x6x5x2xf32> -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C3_3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C3_3]] step %[[C1_5]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG8]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) outs(%[[EXTRACTED_SLICE_7]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S5]] into %[[ARG9]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x1x1xf32> into tensor<6x6x5x2xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<6x6x5x2xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x1xf32> +// CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x1x1xf32> +// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x6x1x1xf32>) -> tensor<6x6x1x1xf32> // ----- @@ -56,47 +37,27 @@ func.func @tile_winograd_filter(%arg0: tensor<2x3x3x5xf32>, %arg1: tensor<6x6x5x module attributes {transform.with_named_sequence} { 
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> - // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x3x5xf32>, %[[ARG1:.*]]: tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C3_3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C3_3]] step %[[C1_5]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C5]] step %[[C2_6]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x5x2xf32>) { -// CHECK-NEXT: %[[C5_7:.*]] = arith.constant 5 : index -// CHECK-NEXT: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG8]]] [1, 3, 3, %[[S5]]] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, %[[S5]], 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x?x1xf32> -// CHECK-NEXT: %[[S6:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_8]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S6]] into %[[ARG9]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 6, %[[S5]], 1] [1, 1, 1, 1] : tensor<6x6x?x1xf32> into tensor<6x6x5x2xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<6x6x5x2xf32> +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step 
%[[C2_1]] +// CHECK: %[[C5_2:.*]] = arith.constant 5 : index +// CHECK: %[[S3:.*]] = affine.min #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 3, %[[S3]]] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<1x3x3x?xf32> +// CHECK: %[[EXTRACTED_SLICE_3:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 6, %[[S3]], 1] [1, 1, 1, 1] : tensor<6x6x5x2xf32> to tensor<6x6x?x1xf32> +// CHECK: %[[S4:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x3x?xf32>) outs(%[[EXTRACTED_SLICE_3]] : tensor<6x6x?x1xf32>) -> tensor<6x6x?x1xf32> // ----- @@ -108,43 +69,24 @@ func.func @tile_winograd_filter(%arg0: tensor<2x3x1x5xf32>, %arg1: tensor<6x1x5x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_filter_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [1, 0, 0, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK-LABEL: func.func @tile_winograd_filter( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x3x1x5xf32>, %[[ARG1:.*]]: tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1_3:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1_3]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x1x5x2xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x1x5x2xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C1]] step %[[C1_5]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x1x5x2xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x1x5x2xf32>) { -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG8]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_7]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S5]] into %[[ARG9]][0, 0, %[[ARG8]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 
1, 1] : tensor<6x1x1x1xf32> into tensor<6x1x5x2xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x1x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x1x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x1x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x1x5x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<6x1x5x2xf32> +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C5]] step %[[C1_1]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG2]], 0, 0, %[[ARG4]]] [1, 3, 1, 1] [1, 1, 1, 1] : tensor<2x3x1x5xf32> to tensor<1x3x1x1xf32> +// CHECK: %[[EXTRACTED_SLICE_2:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG4]], %[[ARG2]]] [6, 1, 1, 1] [1, 1, 1, 1] : tensor<6x1x5x2xf32> to tensor<6x1x1x1xf32> +// CHECK: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x3x1x1xf32>) outs(%[[EXTRACTED_SLICE_2]] : tensor<6x1x1x1xf32>) -> tensor<6x1x1x1xf32> // ----- @@ -156,50 +98,30 @@ func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:2 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP1:.+]] = affine_map<() -> (6)> - // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C6_3:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_3]] step %[[C1_5]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_6]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_4]] step %[[C1_7]] iter_args(%[[ARG9:.*]] = 
%[[ARG7]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S5]], %[[S6]], 0] [2, %[[S7]], %[[S8]], 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x5xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_8]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG9]][0, 0, %[[ARG6]], %[[ARG8]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x2x5xf32> into tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<6x6x2x2x2x5xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 5] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x5xf32> +// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x2x5xf32> +// CHECK: %[[S7:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x5xf32>) outs(%[[EXTRACTED_SLICE_5]] : tensor<6x6x1x1x2x5xf32>) -> tensor<6x6x1x1x2x5xf32> // ----- @@ -211,62 +133,38 @@ func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, 
!transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP1:.+]] = affine_map<() -> (6)> - // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C6_5:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_9:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_10:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_11:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_12:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_5]] step %[[C1_8]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_9]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_6]] step %[[C1_10]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C2_7]] step %[[C1_11]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C1_12]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG10]], %[[S7]], %[[S8]], %[[ARG12]]] [1, %[[S9]], %[[S10]], 1] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<1x?x?x1xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_13:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x1x1xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_13]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG13]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x1x1x1x1xf32> into tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S6]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } 
-// CHECK-NEXT: scf.yield %[[S5]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<6x6x2x2x2x5xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S5]], %[[S6]], %[[ARG8]]] [1, %[[S7]], %[[S8]], 1] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<1x?x?x1xf32> +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x1x1x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x?x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<6x6x1x1x1x1xf32>) -> tensor<6x6x1x1x1x1xf32> // ----- @@ -278,7 +176,7 @@ func.func @tile_winograd_input(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<6x6x2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } @@ -286,57 +184,32 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP2:.+]] = affine_map<() -> (10)> - // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<6x6x2x2x2x5xf32>) -> tensor<6x6x2x2x2x5xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// 
CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C6_5:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2_9:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_10:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_11:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_12:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_5]] step %[[C1_8]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C2_9]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_6]] step %[[C2_10]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C2_7]] step %[[C2_11]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C2_12]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<6x6x2x2x2x5xf32>) { -// CHECK-NEXT: %[[C5_13:.*]] = arith.constant 5 : index -// CHECK-NEXT: %[[S7:.*]] = affine.min #[[$MAP0]](%[[ARG12]]) -// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP1]](%[[ARG6]]) -// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]](%[[ARG8]]) -// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP2]]() -// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP2]]() -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG10]], %[[S8]], %[[S9]], %[[ARG12]]] [2, %[[S10]], %[[S11]], %[[S7]]] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x?xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_14:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 2, 2, 2, %[[S7]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x2x2x2x?xf32> -// CHECK-NEXT: %[[S12:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_14]] : tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG13]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 2, 2, 2, %[[S7]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x?xf32> into tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S6]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S5]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<6x6x2x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<6x6x2x2x2x5xf32> 
-// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<6x6x2x2x2x5xf32> +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_7:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_8:.*]] = arith.constant 2 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C2_5]] step %[[C2_6]] +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_7]] to %[[C5]] step %[[C2_8]] +// CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG8]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]](%[[ARG2]]) +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]](%[[ARG4]]) +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]]() +// CHECK: %[[S9:.*]] = affine.apply #[[$MAP2]]() +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], %[[S6]], %[[S7]], %[[ARG8]]] [2, %[[S8]], %[[S9]], %[[S5]]] [1, 1, 1, 1] : tensor<2x10x10x5xf32> to tensor<2x?x?x?xf32> +// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, 2, %[[S5]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x5xf32> to tensor<6x6x2x2x2x?xf32> +// CHECK: %[[S10:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<2x?x?x?xf32>) outs(%[[EXTRACTED_SLICE_12]] : tensor<6x6x2x2x2x?xf32>) -> tensor<6x6x2x2x2x?xf32> // ----- @@ -348,62 +221,38 @@ func.func @tile_winograd_input(%arg0: tensor<2x1x10x5xf32>, %arg1: tensor<1x6x1x module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_input_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop3:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop3:4 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP1:.+]] = affine_map<() -> (6)> - // CHECK-LABEL: func.func @tile_winograd_input( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x1x10x5xf32>, %[[ARG1:.*]]: tensor<1x6x1x2x2x5xf32>) -> tensor<1x6x1x2x2x5xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<1x6x1x2x2x5xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = 
arith.constant 1 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1_0:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_9:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_10:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_11:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_12:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C1]] step %[[C1_7]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<1x6x1x2x2x5xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6]] step %[[C1_8]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<1x6x1x2x2x5xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C1_5]] step %[[C1_9]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<1x6x1x2x2x5xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2]] step %[[C1_10]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<1x6x1x2x2x5xf32>) { -// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C2_6]] step %[[C1_11]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<1x6x1x2x2x5xf32>) { -// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C1_12]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<1x6x1x2x2x5xf32>) { -// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG10]], 0, %[[S8]], %[[ARG12]]] [1, 1, %[[S10]], 1] [1, 1, 1, 1] : tensor<2x1x10x5xf32> to tensor<1x1x?x1xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE_13:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x2x2x5xf32> to tensor<1x6x1x1x1x1xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_13]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG13]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x1x1x1xf32> into tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S6]] : tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S5]] : tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<1x6x1x2x2x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<1x6x1x2x2x5xf32> +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C1]] step %[[C1_0]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2]] step 
%[[C1_2]] +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C2_4]] step %[[C1_5]] +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C1_7]] +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG6]], 0, %[[S6]], %[[ARG8]]] [1, 1, %[[S8]], 1] [1, 1, 1, 1] : tensor<2x1x10x5xf32> to tensor<1x1x?x1xf32> +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [1, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<1x6x1x2x2x5xf32> to tensor<1x6x1x1x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<1x1x?x1xf32>) outs(%[[EXTRACTED_SLICE_10]] : tensor<1x6x1x1x1x1xf32>) -> tensor<1x6x1x1x1x1xf32> // ----- @@ -415,54 +264,29 @@ func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x2x2xf32>, %arg1: tensor<2 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:2 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1, 1, 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } // CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP1:.+]] = affine_map<() -> (4)> - // CHECK-LABEL: func.func @tile_winograd_output( // CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x2x2xf32>, %[[ARG1:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<2x8x8x2xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C6_3:.*]] = arith.constant 6 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C2_4:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_3]] step %[[C1_5]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_6]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_4]] step %[[C1_7]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<2x8x8x2xf32>) { -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG6]], %[[ARG8]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : 
tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S5]], %[[S6]], 0] [2, %[[S7]], %[[S8]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x6x1x1x2x2xf32>) outs(%[[EXTRACTED_SLICE_8]] : tensor<2x?x?x2xf32>) -> tensor<2x?x?x2xf32> -// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S12:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S13:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG9]][0, %[[S10]], %[[S11]], 0] [2, %[[S12]], %[[S13]], 2] [1, 1, 1, 1] : tensor<2x?x?x2xf32> into tensor<2x8x8x2xf32> -// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x8x8x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<2x8x8x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<2x8x8x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<2x8x8x2xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<2x8x8x2xf32> +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> +// CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> // ----- @@ -474,7 +298,7 @@ func.func @tile_winograd_output(%arg0 : tensor<6x6x2x2x3x5xf32>, %arg1: tensor<3 module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) { %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %1, %loop1:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) + %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 2, 2, 2, 2] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) transform.yield } } @@ -483,64 +307,34 @@ module attributes {transform.with_named_sequence} { // CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (-d0 + 5, 2)> // CHECK: #[[$MAP2:.+]] = affine_map<(d0) -> (d0 * 4)> // CHECK: #[[$MAP3:.+]] = affine_map<() -> (8)> - -// CHECK-LABEL: func.func @tile_winograd_output( -// CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> { -// CHECK: %[[S0:.*]] = 
tensor.empty() : tensor<3x8x8x5xf32>
-// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index
-// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
-// CHECK-DAG: %[[C6_5:.*]] = arith.constant 6 : index
-// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_6:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
-// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
-// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index
-// CHECK-DAG: %[[C2_8:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_9:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_10:.*]] = arith.constant 2 : index
-// CHECK-DAG: %[[C2_11:.*]] = arith.constant 2 : index
-// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<3x8x8x5xf32>) {
-// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C6_5]] step %[[C1_7]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<3x8x8x5xf32>) {
-// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C2_8]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<3x8x8x5xf32>) {
-// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C2_6]] step %[[C2_9]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<3x8x8x5xf32>) {
-// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C3]] step %[[C2_10]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<3x8x8x5xf32>) {
-// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C2_11]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<3x8x8x5xf32>) {
-// CHECK-NEXT: %[[C3_12:.*]] = arith.constant 3 : index
-// CHECK-NEXT: %[[S7:.*]] = affine.min #[[$MAP0]](%[[ARG10]])
-// CHECK-NEXT: %[[C5_13:.*]] = arith.constant 5 : index
-// CHECK-NEXT: %[[S8:.*]] = affine.min #[[$MAP1]](%[[ARG12]])
-// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 6, 2, 2, %[[S7]], %[[S8]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x3x5xf32> to tensor<6x6x2x2x?x?xf32>
-// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP2]](%[[ARG6]])
-// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP2]](%[[ARG8]])
-// CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP3]]()
-// CHECK-NEXT: %[[S12:.*]] = affine.apply #[[$MAP3]]()
-// CHECK-NEXT: %[[EXTRACTED_SLICE_14:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG10]], %[[S9]], %[[S10]], %[[ARG12]]] [%[[S7]], %[[S11]], %[[S12]], %[[S8]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor<?x?x?x?xf32>
-// CHECK-NEXT: %[[S13:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x6x2x2x?x?xf32>) outs(%[[EXTRACTED_SLICE_14]] : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
-// CHECK-NEXT: %[[S14:.*]] = affine.apply #[[$MAP2]](%[[ARG6]])
-// CHECK-NEXT: %[[S15:.*]] = affine.apply #[[$MAP2]](%[[ARG8]])
-// CHECK-NEXT: %[[S16:.*]] = affine.apply #[[$MAP3]]()
-// CHECK-NEXT: %[[S17:.*]] = affine.apply #[[$MAP3]]()
-// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG13]][%[[ARG10]], %[[S14]], %[[S15]], %[[ARG12]]] [%[[S7]], %[[S16]], %[[S17]], %[[S8]]] [1, 1, 1, 1] : tensor<?x?x?x?xf32> into tensor<3x8x8x5xf32>
-// CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: scf.yield %[[S6]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: scf.yield %[[S5]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: scf.yield %[[S4]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: scf.yield %[[S3]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: scf.yield %[[S2]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
-// CHECK-NEXT: return %[[S1]] : tensor<3x8x8x5xf32>
-// CHECK-NEXT: }
+// CHECK-LABEL: func.func @tile_winograd_output(
+// CHECK-SAME: %[[ARG0:.*]]: tensor<6x6x2x2x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x8x5xf32>) -> tensor<3x8x8x5xf32> {
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C0_6:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index
+// CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index
+// CHECK-DAG: %[[C2_0:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index
+// CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index
+// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]]
+// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]]
+// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]]
+// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]]
+// CHECK: %[[C3_8:.*]] = arith.constant 3 : index
+// CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]])
+// CHECK: %[[C5_9:.*]] = arith.constant 5 : index
+// CHECK: %[[S6:.*]] = affine.min #[[$MAP1]](%[[ARG8]])
+// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 6, 2, 2, %[[S5]], %[[S6]]] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x3x5xf32> to tensor<6x6x2x2x?x?xf32>
+// CHECK: %[[S7:.*]] = affine.apply #[[$MAP2]](%[[ARG2]])
+// CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[ARG4]])
+// CHECK: %[[S9:.*]] = affine.apply #[[$MAP3]]()
+// CHECK: %[[S10:.*]] = affine.apply #[[$MAP3]]()
+// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor<?x?x?x?xf32>

// -----

@@ -552,63 +346,35 @@ func.func @tile_winograd_output(%arg0 : tensor<6x1x2x1x3x5xf32>, %arg1: tensor<3
 module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
     %0 = transform.structured.match ops{["linalg.winograd_output_transform"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    %1, %loop1:6 = transform.structured.tile_using_for %0 tile_sizes [1, 1, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+    %1, %loop1:4 = transform.structured.tile_using_for %0 tile_sizes [0, 0, 1, 1, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
     transform.yield
   }
 }

// CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 4)>
// CHECK: #[[$MAP1:.+]] = affine_map<() -> (4)>
-
// CHECK-LABEL: func.func @tile_winograd_output(
// CHECK-SAME: %[[ARG0:.*]]: tensor<6x1x2x1x3x5xf32>, %[[ARG1:.*]]: tensor<3x8x1x5xf32>) -> 
tensor<3x8x1x5xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<3x8x1x5xf32> // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_2:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C0_3:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C0_4:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0_5:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C1_5:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_7:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_8:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_9:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_10:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C1_11:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C6]] step %[[C1_6]] iter_args(%[[ARG3:.*]] = %[[S0]]) -> (tensor<3x8x1x5xf32>) { -// CHECK-NEXT: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1]] step %[[C1_7]] iter_args(%[[ARG5:.*]] = %[[ARG3]]) -> (tensor<3x8x1x5xf32>) { -// CHECK-NEXT: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_1]] to %[[C2]] step %[[C1_8]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) -> (tensor<3x8x1x5xf32>) { -// CHECK-NEXT: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_2]] to %[[C1_5]] step %[[C1_9]] iter_args(%[[ARG9:.*]] = %[[ARG7]]) -> (tensor<3x8x1x5xf32>) { -// CHECK-NEXT: %[[S5:.*]] = scf.for %[[ARG10:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_10]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) -> (tensor<3x8x1x5xf32>) { -// CHECK-NEXT: %[[S6:.*]] = scf.for %[[ARG12:.*]] = %[[C0_4]] to %[[C5]] step %[[C1_11]] iter_args(%[[ARG13:.*]] = %[[ARG11]]) -> (tensor<3x8x1x5xf32>) { -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG6]], %[[ARG8]], %[[ARG10]], %[[ARG12]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> -// CHECK-NEXT: %[[S7:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S8:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S9:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG10]], %[[S7]], 0, %[[ARG12]]] [1, %[[S9]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_12]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> -// CHECK-NEXT: %[[S12:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK-NEXT: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG8]]) -// CHECK-NEXT: %[[S14:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[S15:.*]] = affine.apply #[[$MAP1]]() -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG13]][%[[ARG10]], %[[S12]], 0, %[[ARG12]]] [1, %[[S14]], 1, 1] [1, 1, 1, 1] : tensor<1x?x1x1xf32> into tensor<3x8x1x5xf32> -// CHECK-NEXT: scf.yield 
%[[INSERTED_SLICE]] : tensor<3x8x1x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S6]] : tensor<3x8x1x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S5]] : tensor<3x8x1x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S4]] : tensor<3x8x1x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S3]] : tensor<3x8x1x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: scf.yield %[[S2]] : tensor<3x8x1x5xf32> -// CHECK-NEXT: } -// CHECK-NEXT: return %[[S1]] : tensor<3x8x1x5xf32> +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] +// CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> +// CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) +// CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) +// CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() +// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> +// CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> From 4b1c9ddbe542e901fdcd453acd255eb175dd6f27 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Fri, 16 Aug 2024 09:15:39 +0100 Subject: [PATCH 9/9] clang-format --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 67cd4272abdfd..775ed8f37344e 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -2999,14 +2999,16 @@ LogicalResult WinogradInputTransformOp::verify() { expectedOutputShape[getOutputTileHDim()] = ShapedType::kDynamic; } else { expectedOutputShape[getOutputAlphaHDim()] = leftTransform ? tileSize : 1; - expectedOutputShape[getOutputTileHDim()] = leftTransform ? (inputH - (r - 1)) / m : 1; + expectedOutputShape[getOutputTileHDim()] = + leftTransform ? (inputH - (r - 1)) / m : 1; } if (ShapedType::isDynamic(inputW)) { expectedOutputShape[getOutputAlphaWDim()] = tileSize; expectedOutputShape[getOutputTileWDim()] = ShapedType::kDynamic; } else { expectedOutputShape[getOutputAlphaWDim()] = rightTransform ? tileSize : 1; - expectedOutputShape[getOutputTileWDim()] = rightTransform ? (inputW - (r - 1)) / m : 1; + expectedOutputShape[getOutputTileWDim()] = + rightTransform ? (inputW - (r - 1)) / m : 1; } expectedOutputShape[getOutputNDim()] = inputShape[getInputNDim()]; expectedOutputShape[getOutputCDim()] = inputShape[getInputCDim()]; @@ -3173,7 +3175,8 @@ LogicalResult WinogradOutputTransformOp::verify() { } else { if (valueW != (rightTransform ? m + r - 1 : 1)) return emitOpError("expect input width equals to input tile size"); - expectedOutputShape[getOutputWDim()] = (rightTransform ? m : 1) * valueTileW; + expectedOutputShape[getOutputWDim()] = + (rightTransform ? 
m : 1) * valueTileW; } expectedOutputShape[getOutputNDim()] = valueShape[getValueNDim()]; expectedOutputShape[getOutputFDim()] = valueShape[getValueFDim()];