Skip to content

Commit 35b35a3

Browse files
committed
[X86] Prevent shuffle combining from creating an identical X86ISD::SHUF128.
This can cause an infinite loop if SimplifyDemandedVectorElts asks for the node to replace itself. A similar protection exists in other places in shuffle combining. Fixes ISPC issue ispc/ispc#1864.
1 parent 35c6d56 commit 35b35a3

File tree

2 files changed

+46
-0
lines changed

2 files changed

+46
-0
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34909,6 +34909,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
3490934909
(Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
3491034910

3491134911
if (!isAnyZero(Mask) && !PreferPERMQ) {
34912+
if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
34913+
return SDValue(); // Nothing to do!
3491234914
if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
3491334915
return DAG.getBitcast(RootVT, V);
3491434916
}

llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -764,3 +764,47 @@ define <16 x float> @mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_
764764
%res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
765765
ret <16 x float> %res
766766
}
767+
768+
%struct.foo = type { [4 x double], [3 x [4 x double]], [4 x double] }
769+
770+
; This test previously hung (infinite loop) in shuffle combining. See https://github.com/ispc/ispc/issues/1864
771+
define void @ispc_1864(<16 x float>* %arg) {
772+
; ALL-LABEL: ispc_1864:
773+
; ALL: # %bb.0: # %bb
774+
; ALL-NEXT: pushq %rbp
775+
; ALL-NEXT: .cfi_def_cfa_offset 16
776+
; ALL-NEXT: .cfi_offset %rbp, -16
777+
; ALL-NEXT: movq %rsp, %rbp
778+
; ALL-NEXT: .cfi_def_cfa_register %rbp
779+
; ALL-NEXT: andq $-64, %rsp
780+
; ALL-NEXT: subq $4864, %rsp # imm = 0x1300
781+
; ALL-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0,-5.0E+0]
782+
; ALL-NEXT: vmulps 32(%rdi), %ymm0, %ymm0
783+
; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
784+
; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,0,1,0,1]
785+
; ALL-NEXT: vmovapd %ymm0, {{[0-9]+}}(%rsp)
786+
; ALL-NEXT: movq %rbp, %rsp
787+
; ALL-NEXT: popq %rbp
788+
; ALL-NEXT: .cfi_def_cfa %rsp, 8
789+
; ALL-NEXT: vzeroupper
790+
; ALL-NEXT: retq
791+
bb:
792+
%tmp = alloca [30 x %struct.foo], align 64
793+
%tmp1 = load <16 x float>, <16 x float>* %arg, align 4
794+
%tmp2 = fmul <16 x float> %tmp1, <float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00, float -5.000000e+00>
795+
%tmp3 = fpext <16 x float> %tmp2 to <16 x double>
796+
%tmp4 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 0
797+
%tmp5 = extractelement <16 x double> %tmp3, i32 10
798+
store double %tmp5, double* %tmp4, align 32
799+
%tmp6 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 1
800+
%tmp7 = extractelement <16 x double> %tmp3, i32 11
801+
store double %tmp7, double* %tmp6, align 8
802+
%tmp8 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 2
803+
%tmp9 = extractelement <16 x double> %tmp3, i32 12
804+
store double %tmp9, double* %tmp8, align 16
805+
%tmp10 = getelementptr inbounds [30 x %struct.foo], [30 x %struct.foo]* %tmp, i64 0, i64 3, i32 2, i64 3
806+
%tmp11 = extractelement <16 x double> %tmp3, i32 13
807+
store double %tmp11, double* %tmp10, align 8
808+
ret void
809+
}
810+

0 commit comments

Comments
 (0)