From 058ee4cdbad77d4fbd217bebffbe7f2d40048daa Mon Sep 17 00:00:00 2001
From: Sebastian Larsson <sebastian.larsson@arm.com>
Date: Tue, 7 Jan 2025 14:28:39 +0100
Subject: [PATCH 1/3] Arm backend: Update size_adjust_conv2d_pass docs

Explain more thorougly what the pass does.

Change-Id: I9260512fb0b8afe23505edcce47dd3c9f59e9690
---
 .../arm/_passes/size_adjust_conv2d_pass.py    | 53 +++++++++++++++++--
 1 file changed, 48 insertions(+), 5 deletions(-)

diff --git a/backends/arm/_passes/size_adjust_conv2d_pass.py b/backends/arm/_passes/size_adjust_conv2d_pass.py
index 08da9a74c91..c7572bb97d9 100644
--- a/backends/arm/_passes/size_adjust_conv2d_pass.py
+++ b/backends/arm/_passes/size_adjust_conv2d_pass.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -16,7 +16,8 @@
 
 def conv_remainder(input_length, pad, dilation, weight, stride):
     """
-    Returns the size
+    Returns the remainder of input_length; given the padding, dilation, stride,
+    and kernel size.
     """
     return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride
 
@@ -65,9 +66,51 @@ def create_node(
 
 class SizeAdjustConv2DPass(ExportPass):
     """
-    Adjust the convolution input size to match perfectly with the
-    weight size, padding, stride and dilation parameters.
-    This is done by inserting a slice op to remove the uneven end of the input.
+    Adjust the convolution input size to match the kernel size, padding, stride,
+    and dilation parameters. Pytorch allows the input and kernel shape to not
+    "match", in which case the remaining rows/columns are truncated. However,
+    matching the size is a requirement in the TOSA specification. In case the
+    input and kernel shape do not match, the following is done to meet the
+    specification:
+
+      1) The padding is truncated (done in the node visitor)
+      2) (if neccessary) The input is truncated (done in this pass)."
+
+    A simple example would be a 2x2 kernel (no padding, stride=2) and a 5x5
+    input:
+
+    ┌───┬───┬───┬───┬───┐    ┌───┬───┬───┬───┬───┐    ┌───┬───┬───┬───┬───┐
+    │ X │ X │   │   │   │    │   │   │ X │ X │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │ X │ X │   │   │   │    │   │   │ X │ X │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │ -> │   │   │   │   │   │ -> │ X │ X │   │   │   │ ->
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │   │   │   │   │   │    │ X │ X │   │   │   │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │   │   │   │   │   │    │   │   │   │   │   │
+    └───┴───┴───┴───┴───┘    └───┴───┴───┴───┴───┘    └───┴───┴───┴───┴───┘
+         First pass               second pass              third pass
+
+    ┌───┬───┬───┬───┬───┐    ┌───┬───┬───┬───┬───┐
+    │   │   │   │   │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │ X │ X │   │ -> │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │ X │ X │   │    │   │   │   │   │ - │
+    ├───┼───┼───┼───┼───┤    ├───┼───┼───┼───┼───┤
+    │   │   │   │   │   │    │ - │ - │ - │ - │ - │
+    └───┴───┴───┴───┴───┘    └───┴───┴───┴───┴───┘
+         Fourth pass            Unvisited cells
+
+    Cells that are never visited are marked with `-` and are never considered
+    when the kernel traverses over the input, hence they can be removed.
+
+    To match the shape of the kernel (and all parameters) with the input, a
+    slice op is inserted to remove the remaining edges (rows and columns) of the
+    input.
     """
 
     conv2d_op = exir_ops.edge.aten.convolution.default

From ed40f7a887b3dd8e237a0a8da5f036d3d142240a Mon Sep 17 00:00:00 2001
From: Sebastian Larsson <sebastian.larsson@arm.com>
Date: Tue, 7 Jan 2025 10:27:02 +0100
Subject: [PATCH 2/3] Arm backend: Remove code duplication from
 size_adjust_conv2d_pass

The functions insert_q_dq_pair and create_node were duplicates of
functions with the same name in arm_pass_utils.py.

Import the functions from arm_pass_utils and remove the duplicate
functions.

Change-Id: I8f16af56f8438d8cd33d087c4ebac419cb5b2596
---
 .../arm/_passes/size_adjust_conv2d_pass.py    | 50 ++-----------------
 1 file changed, 3 insertions(+), 47 deletions(-)

diff --git a/backends/arm/_passes/size_adjust_conv2d_pass.py b/backends/arm/_passes/size_adjust_conv2d_pass.py
index c7572bb97d9..ee811273438 100644
--- a/backends/arm/_passes/size_adjust_conv2d_pass.py
+++ b/backends/arm/_passes/size_adjust_conv2d_pass.py
@@ -6,12 +6,12 @@
 
 # pyre-unsafe
 
-from typing import cast, Optional
+from typing import cast
 
 import torch.fx
+from executorch.backends.arm._passes.arm_pass_utils import create_node
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
-from torch._ops import OpOverload
 
 
 def conv_remainder(input_length, pad, dilation, weight, stride):
@@ -22,48 +22,6 @@ def conv_remainder(input_length, pad, dilation, weight, stride):
     return (input_length + 2 * pad - dilation * (weight - 1) - 1) % stride
 
 
-def insert_q_dq_pair(
-    graph: torch.fx.Graph,
-    anchor: torch.fx.Node,
-    q_params: tuple,
-):
-    with graph.inserting_after(anchor):
-        q = create_node(
-            graph=graph,
-            op_target=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            args=(),  # We add the argument last
-        )
-        q.meta = anchor.meta
-
-    with graph.inserting_after(q):
-        dq = create_node(
-            graph=graph,
-            op_target=exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-            args=(q,) + q_params,
-        )
-        dq.meta = q.meta
-
-    anchor.replace_all_uses_with(dq)
-    # We add this last so the replace all uses above does not replace the quantized
-    # node's first use
-    q.args = (anchor,) + q_params
-    return dq
-
-
-def create_node(
-    graph: torch.fx.Graph,
-    op_target: OpOverload,
-    args: tuple = (),
-    kwargs: Optional[dict] = None,
-):
-    return graph.create_node(
-        "call_function",
-        op_target,
-        args=args,
-        kwargs=kwargs or {},
-    )
-
-
 class SizeAdjustConv2DPass(ExportPass):
     """
     Adjust the convolution input size to match the kernel size, padding, stride,
@@ -152,9 +110,7 @@ def call(self, graph_module: torch.fx.GraphModule):
             with graph_module.graph.inserting_before(node):
                 last_node = cast(torch.fx.Node, input_node)
                 for args in slice_args:
-                    slice_node = graph.create_node(
-                        "call_function", self.slice_op, (last_node,) + args
-                    )
+                    slice_node = create_node(graph, self.slice_op, (last_node,) + args)
                     last_node = slice_node
                 conv_node.replace_input_with(cast(torch.fx.Node, input_node), last_node)
                 modified_graph = True

From eff64e602638706e905bf2feb5e5d8f8c94792ef Mon Sep 17 00:00:00 2001
From: Sebastian Larsson <sebastian.larsson@arm.com>
Date: Tue, 7 Jan 2025 09:32:59 +0100
Subject: [PATCH 3/3] Arm backend: Add tests that require size_adjust pass

Change-Id: I84e9cb02f18cb1c81a27156b6e9d946c11291adf
---
 backends/arm/test/ops/test_conv1d.py |  47 +++++++++++-
 backends/arm/test/ops/test_conv2d.py | 107 ++++++++++++++++++++++++++-
 2 files changed, 151 insertions(+), 3 deletions(-)

diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py
index 593260ac56f..41138442db5 100644
--- a/backends/arm/test/ops/test_conv1d.py
+++ b/backends/arm/test/ops/test_conv1d.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -180,6 +180,47 @@ def forward(self, x):
     batches=1,
 )
 
+conv1d_7_1x3x16_st2_pd1_dl2 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=7,
+    stride=2,
+    padding=1,
+    dilation=2,
+    length=16,
+    batches=1,
+)
+conv1d_7_1x3x15_st1_pd0_dl1 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=7,
+    stride=1,
+    padding=0,
+    dilation=1,
+    length=15,
+    batches=1,
+)
+conv1d_5_1x3x14_st5_pd0_dl1 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=5,
+    stride=5,
+    padding=0,
+    dilation=1,
+    length=14,
+    batches=1,
+)
+conv1d_5_1x3x9_st5_pd0_dl1 = Conv1d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=5,
+    stride=5,
+    padding=0,
+    dilation=1,
+    length=9,
+    batches=1,
+)
+
 two_conv1d_nobias = Conv1d(
     nbr_conv=2,
     length=256,
@@ -214,6 +255,10 @@ def forward(self, x):
     ("2_1x2x14_st2", conv1d_2_1x2x14_st2),
     ("5_3x2x128_st1", conv1d_5_3x2x128_st1),
     ("3_1x3x224_st2_pd1", conv1d_3_1x3x224_st2_pd1),
+    ("7_1x3x16_st2_pd1_dl2_needs_adjust_pass", conv1d_7_1x3x16_st2_pd1_dl2),
+    ("7_1x3x15_st1_pd0_dl1_needs_adjust_pass", conv1d_7_1x3x15_st1_pd0_dl1),
+    ("5_1x3x14_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x14_st5_pd0_dl1),
+    ("5_1x3x9_st5_pd0_dl1_needs_adjust_pass", conv1d_5_1x3x9_st5_pd0_dl1),
     ("two_conv1d_nobias", two_conv1d_nobias),
     ("two_conv1d", two_conv1d),
 ]
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
index 9ccac539408..6312a4dd2cb 100644
--- a/backends/arm/test/ops/test_conv2d.py
+++ b/backends/arm/test/ops/test_conv2d.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -201,6 +201,101 @@ def forward(self, x):
     batches=1,
 )
 
+conv2d_7x7_1x3x16x16_st2_pd1_dl2 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(7, 7),
+    stride=2,
+    padding=1,
+    dilation=2,
+    width=16,
+    height=16,
+    batches=1,
+)
+
+conv2d_7x7_1x3x15x15_st1_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(7, 7),
+    stride=1,
+    padding=0,
+    dilation=1,
+    width=15,
+    height=15,
+    batches=1,
+)
+
+conv2d_5x5_1x3x14x14_st5_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(5, 5),
+    stride=5,
+    padding=0,
+    dilation=1,
+    width=14,
+    height=14,
+    batches=1,
+)
+
+conv2d_5x5_1x3x9x9_st5_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(5, 5),
+    stride=5,
+    padding=0,
+    dilation=1,
+    width=9,
+    height=9,
+    batches=1,
+)
+
+conv2d_3x3_1x3x8x9_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=8,
+    height=9,
+    batches=1,
+)
+
+conv2d_3x3_1x3x9x8_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=8,
+    height=9,
+    batches=1,
+)
+
+conv2d_3x4_1x3x7x7_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(3, 4),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+)
+
+conv2d_4x3_1x3x7x7_st3_pd0_dl1 = Conv2d(
+    in_channels=3,
+    out_channels=3,
+    kernel_size=(4, 3),
+    stride=3,
+    padding=0,
+    dilation=1,
+    width=7,
+    height=7,
+    batches=1,
+)
 
 two_conv2d_nobias = Conv2d(
     nbr_conv=2,
@@ -236,7 +331,15 @@ def forward(self, x):
     ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1),
     ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1),
     ("2x2_1x1x14x13_st2_needs_adjust_pass", conv2d_2x2_1x1x14x13_st2),
-    ("conv2d_5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1),
+    ("5x5_1x3x14x15_st3_pd1_needs_adjust_pass", conv2d_5x5_1x3x14x15_st3_pd1),
+    ("7x7_1x3x16x16_st2_pd1_dl2_needs_adjust_pass", conv2d_7x7_1x3x16x16_st2_pd1_dl2),
+    ("7x7_1x3x15x15_st1_pd0_dl1_needs_adjust_pass", conv2d_7x7_1x3x15x15_st1_pd0_dl1),
+    ("5x5_1x3x14x14_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x14x14_st5_pd0_dl1),
+    ("5x5_1x3x9x9_st5_pd0_dl1_needs_adjust_pass", conv2d_5x5_1x3x9x9_st5_pd0_dl1),
+    ("3x3_1x3x9x8_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x9x8_st3_pd0_dl1),
+    ("3x3_1x3x8x9_st3_pd0_dl1_needs_adjust_pass", conv2d_3x3_1x3x8x9_st3_pd0_dl1),
+    ("3x4_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_3x4_1x3x7x7_st3_pd0_dl1),
+    ("4x3_1x3x7x7_st3_pd0_dl1_needs_adjust_pass", conv2d_4x3_1x3x7x7_st3_pd0_dl1),
     ("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1),
     ("3x3_1x3x224x224_st2_pd1", conv2d_3x3_1x3x224x224_st2_pd1),
     ("two_conv2d_nobias", two_conv2d_nobias),