|
1 | | -; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s |
| 1 | +; RUN: not opt -S %s 2>&1 | FileCheck %s |
2 | 2 |
|
3 | 3 | define <4 x float> @transpose(<4 x float> %m, i32 %arg) { |
4 | | -; CHECK: assembly parsed, but does not verify as correct! |
5 | | -; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! |
| 4 | +; CHECK: Result of a matrix operation does not fit in the returned vector! |
6 | 5 | ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! |
7 | 6 | ; CHECK-NEXT: Result of a matrix operation does not fit in the returned vector! |
8 | 7 | ; CHECK-NEXT: immarg operand has non-immediate parameter |
@@ -118,16 +117,43 @@ define void @column.major_store_stride_too_small(ptr %m, i64 %arg) { |
118 | 117 | ret void |
119 | 118 | } |
120 | 119 |
|
| 120 | +; This test ensures that the verifier correctly handles very wide and very
| 121 | +; narrow strides.
| 122 | + |
| 123 | +define <4 x float> @column.major_load_stride_i8(ptr %m, i32 %arg) {
| 124 | + %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i128(ptr %m, i8 16, i1 false, i32 2, i32 2) ; narrow i8 stride; NOTE(review): suffix is .i128 but the stride argument is i8 — confirm the mangling is intended
| 125 | + ret <4 x float> %result.1
| 126 | +}
| 127 | + |
| 128 | +define <4 x float> @column.major_load_stride_i128(ptr %m, i32 %arg) {
| 129 | + %result.1 = call <4 x float> @llvm.matrix.column.major.load.v4f32.i128(ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2) ; very wide stride: 2^64, does not fit in i64
| 130 | + ret <4 x float> %result.1
| 131 | +}
| 132 | + |
| 133 | +define void @column.major_store_stride_i8(ptr %m, i64 %arg) {
| 134 | + call void @llvm.matrix.column.major.store.v4f32.i128(<4 x float> zeroinitializer, ptr %m, i8 16, i1 false, i32 2, i32 2) ; narrow i8 stride; NOTE(review): suffix is .i128 but the stride argument is i8 — confirm the mangling is intended
| 135 | + ret void
| 136 | +}
| 137 | + |
| 138 | +define void @column.major_store_stride_i128(ptr %m, i64 %arg) {
| 139 | + call void @llvm.matrix.column.major.store.v4f32.i128(<4 x float> zeroinitializer, ptr %m, i128 u0x10000000000000000, i1 false, i32 2, i32 2) ; very wide stride: 2^64, does not fit in i64
| 140 | + ret void
| 141 | +}
| 142 | + |
121 | 143 | declare <4 x i32> @llvm.matrix.column.major.load.v4i32.i64(ptr, i64, i1, i32, i32) |
122 | 144 | declare <4 x float> @llvm.matrix.column.major.load.v4f32.p0(ptr, i64, i1, i32, i32) |
123 | 145 | declare <4 x float> @llvm.matrix.column.major.load.v4f32.i64(ptr, i64, i1, i32, i32) |
124 | 146 | declare <6 x float> @llvm.matrix.column.major.load.v6f32.i64(ptr, i64, i1, i32, i32) |
| 147 | +declare <6 x float> @llvm.matrix.column.major.load.v6f32.i8(ptr, i8, i1, i32, i32)
| 148 | +declare <6 x float> @llvm.matrix.column.major.load.v6f32.i128(ptr, i64, i1, i32, i32) ; NOTE(review): name suffix .i128 but the stride parameter is i64 — likely should be i128 to match the mangling
125 | 149 |
|
126 | 150 | declare void @llvm.matrix.column.major.store.v4f32.i64(<4 x float>, ptr, i64, i1, i32, i32) |
127 | 151 | declare void @llvm.matrix.column.major.store.v6f32.i64(<6 x float>, ptr, i64, i1, i32, i32) |
128 | 152 | declare void @llvm.matrix.column.major.store.v4i32.vi32(<4 x i32>, ptr, i64, i1, i32, i32) |
129 | 153 | declare void @llvm.matrix.column.major.store.v4f32.p0(<4 x float>, ptr, i64, i1, i32, i32) |
130 | 154 | declare void @llvm.matrix.column.major.store.v4p0.i64(<4 x ptr>, ptr, i64, i1, i32, i32) |
| 155 | +declare void @llvm.matrix.column.major.store.v4p0.i8(<4 x ptr>, ptr, i8, i1, i32, i32)
| 156 | +declare void @llvm.matrix.column.major.store.v4p0.i128(<4 x ptr>, ptr, i64, i1, i32, i32) ; NOTE(review): name suffix .i128 but the stride parameter is i64 — likely should be i128 to match the mangling
131 | 157 |
|
132 | 158 | declare <4 x i32> @llvm.matrix.transpose.v4i32.v4f32(<4 x float>, i32, i32) |
133 | 159 | declare <4 x float> @llvm.matrix.transpose.v4f32(<4 x float>, i32, i32) |
|
0 commit comments