[msan] Handle AVX512 VCVTPS2PH #154460
Conversation
This extends maybeExtendVectorShadowWithZeros() from 556c846 (llvm#147377) to handle AVX512 VCVTPS2PH.
@llvm/pr-subscribers-compiler-rt-sanitizer

Author: Thurston Dang (thurstond)

Changes

This extends maybeExtendVectorShadowWithZeros() from 556c846 (#147377) to handle AVX512 VCVTPS2PH.

Patch is 22.33 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154460.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 948e2c6e06843..13262c2c8b36f 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -3429,8 +3429,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return ShadowType;
}
- /// Doubles the length of a vector shadow (filled with zeros) if necessary to
- /// match the length of the shadow for the instruction.
+ /// Doubles the length of a vector shadow (extending with zeros) if necessary
+ /// to match the length of the shadow for the instruction.
+ /// If scalar types of the vectors are different, it will use the type of the
+ /// input vector.
/// This is more type-safe than CreateShadowCast().
Value *maybeExtendVectorShadowWithZeros(Value *Shadow, IntrinsicInst &I) {
IRBuilder<> IRB(&I);
@@ -3440,10 +3442,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *FullShadow = getCleanShadow(&I);
assert(cast<FixedVectorType>(Shadow->getType())->getNumElements() <=
cast<FixedVectorType>(FullShadow->getType())->getNumElements());
- assert(cast<FixedVectorType>(Shadow->getType())->getScalarType() ==
- cast<FixedVectorType>(FullShadow->getType())->getScalarType());
- if (Shadow->getType() == FullShadow->getType()) {
+ if (cast<FixedVectorType>(Shadow->getType())->getNumElements() ==
+ cast<FixedVectorType>(FullShadow->getType())->getNumElements()) {
FullShadow = Shadow;
} else {
// TODO: generalize beyond 2x?
@@ -4528,55 +4529,93 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return isFixedFPVectorTy(V->getType());
}
- // e.g., call <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
- // (<16 x float> a, <16 x i32> writethru, i16 mask,
- // i32 rounding)
+ // e.g., <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
+ // (<16 x float> a, <16 x i32> writethru, i16 mask,
+ // i32 rounding)
+ //
+ // Inconveniently, some similar intrinsics have a different operand order:
+ // <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512
+ // (<16 x float> a, i32 rounding, <16 x i16> writethru,
+ // i16 mask)
+ //
+ // If the return type has more elements than A, the excess elements are
+ // zeroed (and the corresponding shadow is initialized).
+ // <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128
+ // (<4 x float> a, i32 rounding, <8 x i16> writethru,
+ // i8 mask)
//
// dst[i] = mask[i] ? convert(a[i]) : writethru[i]
// dst_shadow[i] = mask[i] ? all_or_nothing(a_shadow[i]) : writethru_shadow[i]
// where all_or_nothing(x) is fully uninitialized if x has any
// uninitialized bits
- void handleAVX512VectorConvertFPToInt(IntrinsicInst &I) {
+ void handleAVX512VectorConvertFPToInt(IntrinsicInst &I, bool LastMask) {
IRBuilder<> IRB(&I);
assert(I.arg_size() == 4);
Value *A = I.getOperand(0);
- Value *WriteThrough = I.getOperand(1);
- Value *Mask = I.getOperand(2);
- Value *RoundingMode = I.getOperand(3);
+ Value *WriteThrough;
+ Value *Mask;
+ Value *RoundingMode;
+ if (LastMask) {
+ WriteThrough = I.getOperand(2);
+ Mask = I.getOperand(3);
+ RoundingMode = I.getOperand(1);
+ } else {
+ WriteThrough = I.getOperand(1);
+ Mask = I.getOperand(2);
+ RoundingMode = I.getOperand(3);
+ }
assert(isFixedFPVector(A));
assert(isFixedIntVector(WriteThrough));
unsigned ANumElements =
cast<FixedVectorType>(A->getType())->getNumElements();
- assert(ANumElements ==
- cast<FixedVectorType>(WriteThrough->getType())->getNumElements());
+ unsigned WriteThruNumElements =
+ cast<FixedVectorType>(WriteThrough->getType())->getNumElements();
+ assert(ANumElements == WriteThruNumElements ||
+ ANumElements * 2 == WriteThruNumElements);
assert(Mask->getType()->isIntegerTy());
- assert(Mask->getType()->getScalarSizeInBits() == ANumElements);
+ unsigned MaskNumElements = Mask->getType()->getScalarSizeInBits();
+ assert(ANumElements == MaskNumElements ||
+ ANumElements * 2 == MaskNumElements);
+
+ assert(WriteThruNumElements == MaskNumElements);
+
insertCheckShadowOf(Mask, &I);
assert(RoundingMode->getType()->isIntegerTy());
- // Only four bits of the rounding mode are used, though it's very
+ // Only some bits of the rounding mode are used, though it's very
// unusual to have uninitialized bits there (more commonly, it's a
// constant).
insertCheckShadowOf(RoundingMode, &I);
assert(I.getType() == WriteThrough->getType());
+ Value *AShadow = getShadow(A);
+ AShadow = maybeExtendVectorShadowWithZeros(AShadow, I);
+
+ if (ANumElements * 2 == MaskNumElements) {
+ // Ensure that the irrelevant bits of the mask are zero, hence selecting
+ // from the zeroed shadow instead of the writethrough's shadow.
+ Mask = IRB.CreateTrunc(Mask, IRB.getIntNTy(ANumElements));
+ Mask = IRB.CreateZExt(Mask, IRB.getIntNTy(MaskNumElements));
+ }
+
// Convert i16 mask to <16 x i1>
Mask = IRB.CreateBitCast(
- Mask, FixedVectorType::get(IRB.getInt1Ty(), ANumElements));
+ Mask, FixedVectorType::get(IRB.getInt1Ty(), MaskNumElements));
- Value *AShadow = getShadow(A);
- /// For scalars:
- /// Since they are converting from floating-point, the output is:
+ /// For floating-point to integer conversion, the output is:
/// - fully uninitialized if *any* bit of the input is uninitialized
/// - fully ininitialized if all bits of the input are ininitialized
/// We apply the same principle on a per-element basis for vectors.
- AShadow = IRB.CreateSExt(IRB.CreateICmpNE(AShadow, getCleanShadow(A)),
- getShadowTy(A));
+ ///
+ /// We use the scalar width of the return type instead of A's.
+ AShadow = IRB.CreateSExt(
+ IRB.CreateICmpNE(AShadow, getCleanShadow(AShadow->getType())),
+ getShadowTy(&I));
Value *WriteThroughShadow = getShadow(WriteThrough);
Value *Shadow = IRB.CreateSelect(Mask, AShadow, WriteThroughShadow);
@@ -5920,11 +5959,29 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/*trailingVerbatimArgs=*/1);
break;
+ // Convert Packed Single Precision Floating-Point Values
+ // to Packed SignedDoubleword Integer Values
+ //
+ // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
+ // (<16 x float>, <16 x i32>, i16, i32)
case Intrinsic::x86_avx512_mask_cvtps2dq_512: {
- handleAVX512VectorConvertFPToInt(I);
+ handleAVX512VectorConvertFPToInt(I, /*LastMask=*/false);
break;
}
+ // Convert Single-Precision FP Value to 16-bit FP Value
+ // <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512
+ // (<16 x float>, i32, <16 x i16>, i16)
+ // <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128
+ // (<4 x float>, i32, <8 x i16>, i8)
+ // <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256
+ // (<8 x float>, i32, <8 x i16>, i8)
+ case Intrinsic::x86_avx512_mask_vcvtps2ph_512:
+ case Intrinsic::x86_avx512_mask_vcvtps2ph_256:
+ case Intrinsic::x86_avx512_mask_vcvtps2ph_128:
+ handleAVX512VectorConvertFPToInt(I, /*LastMask=*/true);
+ break;
+
// AVX512 PMOV: Packed MOV, with truncation
// Precisely handled by applying the same intrinsic to the shadow
case Intrinsic::x86_avx512_mask_pmov_dw_512:
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
index 3c1af6781f0ed..eba0beb5bf6ac 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512-intrinsics.ll
@@ -1903,50 +1903,46 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8
; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 104) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP5]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP7:%.*]], !prof [[PROF1]]
-; CHECK: 6:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
-; CHECK-NEXT: unreachable
-; CHECK: 7:
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = sext <16 x i1> [[TMP6]] to <16 x i16>
+; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> splat (i1 true), <16 x i16> [[TMP7]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[RES1:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0:%.*]], i32 2, <16 x i16> zeroinitializer, i16 -1)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP8]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[MASK:%.*]] to <16 x i1>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = sext <16 x i1> [[TMP11]] to <16 x i16>
+; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> [[TMP12]], <16 x i16> zeroinitializer
; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i16 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label [[TMP9:%.*]], label [[TMP10:%.*]], !prof [[PROF1]]
-; CHECK: 9:
+; CHECK-NEXT: br i1 [[_MSCMP2]], label [[TMP14:%.*]], label [[TMP15:%.*]], !prof [[PROF1]]
+; CHECK: 12:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 10:
-; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK:%.*]])
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x i32> [[TMP1]] to i512
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i512 [[TMP11]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x i16> [[TMP3]] to i256
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i256 [[TMP12]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
-; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label [[TMP13:%.*]], label [[TMP14:%.*]], !prof [[PROF1]]
; CHECK: 13:
+; CHECK-NEXT: [[RES2:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 11, <16 x i16> zeroinitializer, i16 [[MASK]])
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i16 [[MASK]] to <16 x i1>
+; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <16 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP27:%.*]] = sext <16 x i1> [[TMP26]] to <16 x i16>
+; CHECK-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP25]], <16 x i16> [[TMP27]], <16 x i16> [[TMP3]]
+; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i16 [[TMP2]], 0
+; CHECK-NEXT: br i1 [[_MSCMP6]], label [[TMP22:%.*]], label [[TMP23:%.*]], !prof [[PROF1]]
+; CHECK: 18:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 14:
+; CHECK: 19:
; CHECK-NEXT: [[RES3:%.*]] = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> [[A0]], i32 12, <16 x i16> [[SRC:%.*]], i16 [[MASK]])
; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP15:%.*]], label [[TMP16:%.*]], !prof [[PROF1]]
-; CHECK: 15:
+; CHECK-NEXT: br i1 [[_MSCMP8]], label [[TMP24:%.*]], label [[TMP21:%.*]], !prof [[PROF1]]
+; CHECK: 20:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR10]]
; CHECK-NEXT: unreachable
-; CHECK: 16:
+; CHECK: 21:
; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[DST:%.*]] to i64
; CHECK-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], 87960930222080
; CHECK-NEXT: [[TMP19:%.*]] = inttoptr i64 [[TMP18]] to ptr
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr [[TMP19]], align 32
+; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP19]], align 32
; CHECK-NEXT: store <16 x i16> [[RES1]], ptr [[DST]], align 32
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i16> [[TMP13]], [[TMP20]]
; CHECK-NEXT: [[RES:%.*]] = add <16 x i16> [[RES2]], [[RES3]]
-; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <16 x i16> [[_MSPROP]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <16 x i16> [[RES]]
;
%res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
index db0c2e7ae9ed6..e22301174a0ca 100644
--- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl-intrinsics.ll
@@ -7893,40 +7893,44 @@ define <8 x i16> @test_x86_vcvtps2ph_128(<4 x float> %a0, i8 %mask, <8 x i16> %s
; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 24) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB6]]:
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> bitcast (<1 x i8> splat (i8 15) to <8 x i1>), <8 x i16> [[TMP6]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP7]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = trunc i8 [[MASK]] to i4
+; CHECK-NEXT: [[TMP10:%.*]] = zext i4 [[TMP9]] to i8
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP10]] to <8 x i1>
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP13]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]]
-; CHECK: [[BB8]]:
+; CHECK-NEXT: br i1 [[_MSCMP2]], label %[[BB15:.*]], label %[[BB16:.*]], !prof [[PROF1]]
+; CHECK: [[BB15]]:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: [[BB9]]:
+; CHECK: [[BB16]]:
; CHECK-NEXT: [[RES2:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 10, <8 x i16> zeroinitializer, i8 [[MASK]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[TMP1]] to i128
-; CHECK-NEXT: [[_MSCMP3:%.*]] = icmp ne i128 [[TMP10]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x i16> [[TMP3]] to i128
-; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i128 [[TMP11]], 0
-; CHECK-NEXT: [[_MSOR5:%.*]] = or i1 [[_MSCMP3]], [[_MSCMP4]]
+; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP18:%.*]] = trunc i8 [[MASK]] to i4
+; CHECK-NEXT: [[TMP19:%.*]] = zext i4 [[TMP18]] to i8
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP19]] to <8 x i1>
+; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP17]], zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i16>
+; CHECK-NEXT: [[TMP23:%.*]] = select <8 x i1> [[TMP20]], <8 x i16> [[TMP22]], <8 x i16> [[TMP3]]
; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i8 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR7:%.*]] = or i1 [[_MSOR5]], [[_MSCMP6]]
-; CHECK-NEXT: br i1 [[_MSOR7]], label %[[BB12:.*]], label %[[BB13:.*]], !prof [[PROF1]]
-; CHECK: [[BB12]]:
+; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]]
+; CHECK: [[BB24]]:
; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
; CHECK-NEXT: unreachable
-; CHECK: [[BB13]]:
+; CHECK: [[BB25]]:
; CHECK-NEXT: [[RES3:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> [[A0]], i32 11, <8 x i16> [[SRC]], i8 [[MASK]])
+; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i16> [[TMP7]], [[TMP14]]
; CHECK-NEXT: [[RES0:%.*]] = add <8 x i16> [[RES1]], [[RES2]]
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i16> [[TMP23]], [[_MSPROP]]
; CHECK-NEXT: [[RES:%.*]] = add <8 x i16> [[RES3]], [[RES0]]
-; CHECK-NEXT: store <8 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8
+; CHECK-NEXT: store <8 x i16> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i16> [[RES]]
;
%res1 = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.128(<4 x float> %a0, i32 2, <8 x i16> zeroinitializer, i8 -1)
@@ -7947,40 +7951,37 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0, i8 %mask, <8 x i16> %s
; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8
; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 40) to ptr), align 8
; CHECK-NEXT: call void @llvm.donothing()
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]]
-; CHECK: [[BB5]]:
-; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]]
-; CHECK-NEXT: unreachable
-; CHECK: [[BB6]]:
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i16>
+; CHECK-NEXT: [[TMP7:%.*]] = select <8 x i1> splat (i1 true), <8 x i16> [[TMP6]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[RES1:%.*]] = call <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float> [[A0]], i32 2, <8 x i16> zeroinitializer, i8 -1)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP1]] to i256
-; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP7]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[MASK]] to <8 x i1>
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16>
+; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP11]], <8 x i16> zeroinitializer
; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i8 [[TMP2]], 0
-; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP1]], [[_MSCMP2]]
-; CHECK-NEXT: br i1 [[_MSOR]], label ...
[truncated]
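Distilled from the CHECK lines above, the following is a minimal sketch of the per-element shadow computation the new handler produces for the 128-bit variant. The function and value names are illustrative (they do not appear in the test); only the sequence of operations is taken from the expected output.

; Sketch only: %a0_shadow is the <4 x i32> shadow of the <4 x float> input,
; %mask is the i8 mask operand, %wt_shadow is the writethrough's shadow.
define <8 x i16> @sketch_vcvtps2ph128_shadow(<4 x i32> %a0_shadow, i8 %mask, <8 x i16> %wt_shadow) {
  ; Widen the 4-element shadow to 8 elements, padding with zeros (initialized).
  %s.wide = shufflevector <4 x i32> %a0_shadow, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ; Clear the irrelevant upper mask bits so the padded lanes select the zeroed
  ; shadow rather than the writethrough's shadow.
  %m.lo  = trunc i8 %mask to i4
  %m.ext = zext i4 %m.lo to i8
  %m.vec = bitcast i8 %m.ext to <8 x i1>
  ; Per-element all-or-nothing: any uninitialized bit poisons the whole element,
  ; expressed at the scalar width of the return type (i16).
  %ne     = icmp ne <8 x i32> %s.wide, zeroinitializer
  %s.elem = sext <8 x i1> %ne to <8 x i16>
  ; Masked lanes take the converted element's shadow; the rest take the writethrough's.
  %dst_sh = select <8 x i1> %m.vec, <8 x i16> %s.elem, <8 x i16> %wt_shadow
  ret <8 x i16> %dst_sh
}

The trunc/zext of the mask is what keeps the padded upper lanes from picking up the writethrough's shadow, which is the behavior the handler's comment describes.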
    if (cast<FixedVectorType>(Shadow->getType())->getNumElements() ==
        cast<FixedVectorType>(FullShadow->getType())->getNumElements()) {
      FullShadow = Shadow;
    } else {
driveby: should we assert fail for not the same or double?
Added
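For concreteness, a hypothetical sketch of the requested check (the exact form and message in the landed change may differ):

  // Hypothetical sketch: restrict widening to the 1x (no-op) and 2x cases
  // the helper currently supports.
  unsigned ShadowElems =
      cast<FixedVectorType>(Shadow->getType())->getNumElements();
  unsigned FullElems =
      cast<FixedVectorType>(FullShadow->getType())->getNumElements();
  assert((ShadowElems == FullElems || ShadowElems * 2 == FullElems) &&
         "Expected shadow vector to be the same length or half as long");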
    //
    // <16 x i32> @llvm.x86.avx512.mask.cvtps2dq.512
    //            (<16 x float>, <16 x i32>, i16, i32)
    case Intrinsic::x86_avx512_mask_cvtps2dq_512: {
Remove braces
Removed
                               /*trailingVerbatimArgs=*/1);
      break;

    case Intrinsic::x86_avx512_mask_cvtps2dq_512: {
you moved this around to group "convert" stuff, right?
Yes, that was the intent. For simplicity, I've undone the change. (I'll move it around in a future NFC patch.)
and undo NFC movement of case
This extends handleAVX512VectorConvertFPToInt() from 556c846 (#147377) to handle AVX512 VCVTPS2PH.