@@ -19,8 +19,9 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>,
 // CHECK-COUNT2: arith.addi {{.*}} : index
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8xindex>
-// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[SRC]] {{\[}}[0, 1, 2]{{\]}} : memref<8x16x32xf32> into memref<4096xf32>
-// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : memref<4096xf32>, vector<8xindex>, vector<8xi1> -> vector<8xf32>
+// CHECK: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index
+// CHECK: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>
 // CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS_THRU]] : vector<8xi1>, vector<8xf32>
 // CHECK: gpu.return %[[RES]] : vector<8xf32>
 }
@@ -45,8 +46,9 @@ gpu.func @load_2D_memref(%source: memref<8x32xf32>,
 // CHECK-COUNT1: arith.addi {{.*}} : index
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8xindex>
-// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[SRC]] {{\[}}[0, 1]{{\]}} : memref<8x32xf32> into memref<256xf32>
-// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : memref<256xf32>, vector<8xindex>, vector<8xi1> -> vector<8xf32>
+// CHECK: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x32xf32> -> index
+// CHECK: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : i64, vector<8xindex>, vector<8xi1> -> vector<8xf32>
 // CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS_THRU]] : vector<8xi1>, vector<8xf32>
 // CHECK: gpu.return %[[RES]] : vector<8xf32>
 }
@@ -71,8 +73,9 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>,
 // CHECK-COUNT2: arith.addi {{.*}} : index
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8x16xindex>
-// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[SRC]] {{\[}}[0, 1, 2]{{\]}} : memref<8x16x32xf32> into memref<4096xf32>
-// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : memref<4096xf32>, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
+// CHECK: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<8x16x32xf32> -> index
+// CHECK: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
 // CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS_THRU]] : vector<8x16xi1>, vector<8x16xf32>
 // CHECK: gpu.return %[[RES]] : vector<8x16xf32>
 }
@@ -98,8 +101,9 @@ gpu.func @load_dynamic_source(%source: memref<?x?x?xf32>,
 // CHECK-COUNT2: arith.addi {{.*}} : index
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8x16xindex>
-// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[SRC]] {{\[}}[0, 1, 2]{{\]}} : memref<?x?x?xf32> into memref<?xf32>
-// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : memref<?xf32>, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
+// CHECK: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<?x?x?xf32> -> index
+// CHECK: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
 // CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS_THRU]] : vector<8x16xi1>, vector<8x16xf32>
 // CHECK: gpu.return %[[RES]] : vector<8x16xf32>
 }
@@ -125,8 +129,9 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>,
 // CHECK-COUNT2: arith.addi {{.*}} : index
 // CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8x16xindex>
 // CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8x16xindex>
-// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[SRC]] {{\[}}[0, 1, 2]{{\]}} : memref<?x8x16xf32> into memref<?xf32>
-// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : memref<?xf32>, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
+// CHECK: %[[COLLAPSE:.+]] = memref.extract_aligned_pointer_as_index %[[SRC]] : memref<?x8x16xf32> -> index
+// CHECK: %[[COLLAPSE_I:.+]] = arith.index_cast %[[COLLAPSE]] : index to i64
+// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_I]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : i64, vector<8x16xindex>, vector<8x16xi1> -> vector<8x16xf32>
 // CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS_THRU]] : vector<8x16xi1>, vector<8x16xf32>
 // CHECK: gpu.return %[[RES]] : vector<8x16xf32>
 }
@@ -146,42 +151,37 @@ gpu.func @no_load_tensor(%source: tensor<32x64xf32>,
 
 // -----
 gpu.module @xevm_module {
-gpu.func @no_load_non_unit_inner_stride(
-    %source: memref<32xf32, strided<[?], offset: ?>>,
-    %off: index, %indices: vector<8xindex>, %mask: vector<8xi1>,
-    %pass_thru: vector<8xf32>) -> vector<8xf32> {
-  %0 = vector.gather %source[%off][%indices], %mask, %pass_thru
-    : memref<32xf32, strided<[?], offset: ?>>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
-  gpu.return %0 : vector<8xf32>
-}
-// CHECK-LABEL: @no_load_non_unit_inner_stride(
-// CHECK: vector.gather
+gpu.func @gather_from_subview(%source: memref<4096x4096xf16>,
+    %off1: index, %off2: index,
+    %indices: vector<8xindex>,
+    %mask: vector<8xi1>,
+    %pass_thru: vector<8xf16>) -> vector<8xf16> {
+  %subview = memref.subview %source[%off1, %off2] [256, 256] [1, 1]
+    : memref<4096x4096xf16>
+      to memref<256x256xf16, strided<[4096, 1], offset: ?>>
+  %0 = vector.gather %subview[%off1, %off2][%indices], %mask, %pass_thru
+    : memref<256x256xf16, strided<[4096, 1], offset: ?>>,
+      vector<8xindex>, vector<8xi1>, vector<8xf16>
+      into vector<8xf16>
+  gpu.return %0 : vector<8xf16>
+}
+// CHECK-LABEL: @gather_from_subview(
+// CHECK-SAME: %[[SRC:.+]]: memref<4096x4096xf16>,
+// CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index,
+// CHECK-SAME: %[[INDICES:.+]]: vector<8xindex>,
+// CHECK-SAME: %[[MASK:.+]]: vector<8xi1>,
+// CHECK-SAME: %[[PASS:.+]]: vector<8xf16>) -> vector<8xf16> {
+// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1]
+// CHECK: %[[BB:.+]], %[[OFFSET:.+]],{{.*}},{{.*}} = memref.extract_strided_metadata %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> memref<f16>, index, index, index, index, index
+// CHECK: arith.muli {{.*}} : index
+// CHECK: arith.addi %[[OFFSET]]{{.*}} : index
+// CHECK: %[[BASE_OFF:.+]] = arith.addi {{.*}} : index
+// CHECK: %[[SPLAT:.+]] = vector.broadcast %[[BASE_OFF]] : index to vector<8xindex>
+// CHECK: %[[LIN:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8xindex>
+// CHECK: %[[BASE_IDX:.+]] = memref.extract_aligned_pointer_as_index %[[SUBVIEW]] : memref<256x256xf16, strided<[4096, 1], offset: ?>> -> index
+// CHECK: %[[BASE_I64:.+]] = arith.index_cast %[[BASE_IDX]] : index to i64
+// CHECK: %[[VEC:.+]] = xegpu.load %[[BASE_I64]]{{\[}}%[[LIN]]{{\]}}, %[[MASK]]
+// CHECK-SAME: : i64, vector<8xindex>, vector<8xi1> -> vector<8xf16>
+// CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS]] : vector<8xi1>, vector<8xf16>
+// CHECK: gpu.return %[[RES]] : vector<8xf16>
 }
-
-// -----
-gpu.module @xevm_module {
-gpu.func @load_1D_aligned(%source: memref<8x16x32xf32>,
-    %off1: index, %off2: index, %off3: index,
-    %indices: vector<8xindex>, %mask: vector<8xi1>,
-    %pass_thru: vector<8xf32>) -> vector<8xf32> {
-  %0 = vector.gather %source[%off1, %off2, %off3][%indices], %mask,
-    %pass_thru {alignment = 256} : memref<8x16x32xf32>, vector<8xindex>, vector<8xi1>, vector<8xf32> into vector<8xf32>
-  gpu.return %0 : vector<8xf32>
-}
-// CHECK-LABEL: @load_1D_aligned(
-// CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>,
-// CHECK-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index, %[[OFF3:.+]]: index,
-// CHECK-SAME: %[[INDICES:.+]]: vector<8xindex>
-// CHECK-SAME: %[[MASK:.+]]: vector<8xi1>
-// CHECK-SAME: %[[PASS_THRU:.+]]: vector<8xf32>) -> vector<8xf32> {
-// CHECK-COUNT2: arith.muli {{.*}} : index
-// CHECK-COUNT2: arith.addi {{.*}} : index
-// CHECK: %[[SPLAT:.+]] = vector.broadcast {{.*}}: index to vector<8xindex>
-// CHECK: %[[LIN_IDX:.+]] = arith.addi %[[SPLAT]], %[[INDICES]] : vector<8xindex>
-// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %[[SRC]] {{\[}}[0, 1, 2]{{\]}} : memref<8x16x32xf32> into memref<4096xf32>
-// CHECK: %[[COLLAPSE_ALIGN:.+]] = memref.assume_alignment %[[COLLAPSE]], 256 : memref<4096xf32>
-// CHECK: %[[VEC:.+]] = xegpu.load %[[COLLAPSE_ALIGN]]{{\[}}%[[LIN_IDX]]{{\]}}, %[[MASK]] : memref<4096xf32>, vector<8xindex>, vector<8xi1> -> vector<8xf32>
-// CHECK: %[[RES:.+]] = arith.select %[[MASK]], %[[VEC]], %[[PASS_THRU]] : vector<8xi1>, vector<8xf32>
-// CHECK: gpu.return %[[RES]] : vector<8xf32>
-}
-