diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 5b3a1df9dfd7c..b02186ef65fbc 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3895,8 +3895,9 @@ InstructionCost AArch64TTIImpl::getVectorInstrCost(
     unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
     Value *Scalar,
     ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) const {
-  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, false, nullptr,
-                                  Scalar, ScalarUserAndIdx);
+  return getVectorInstrCostHelper(Opcode, Val, CostKind, Index,
+                                  /*HasRealUse=*/true, nullptr, Scalar,
+                                  ScalarUserAndIdx);
 }
 
 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
index 2b5ee59aeb163..9b998ba0c9d35 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/external-use-icmp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
-; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-20 -slp-vectorize-hor=0 < %s | FileCheck %s
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 -slp-threshold=-40 -slp-vectorize-hor=0 < %s | FileCheck %s
 
 define i16 @foo(i16 %in1, i16 %in2) {
 ; CHECK-LABEL: define i16 @foo(
@@ -7,23 +7,22 @@ define i16 @foo(i16 %in1, i16 %in2) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> poison, i16 [[IN1]], i32 0
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i64>
-; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i64> [[TMP9]], splat (i64 65535)
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP12]], splat (i64 65533)
+; CHECK-NEXT: [[TMP2:%.*]] = zext <2 x i16> [[TMP1]] to <2 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i16> poison, i16 [[IN2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <2 x i64> [[TMP5]], [[TMP2]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[TMP6]], splat (i64 196605)
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
 ; CHECK-NEXT: [[ZEXT3_1:%.*]] = zext i1 [[TMP8]] to i16
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
-; CHECK-NEXT: [[CMP2_1:%.*]] = icmp ne i64 [[TMP10]], 196605
+; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
+; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i64> [[TMP9]], splat (i64 65533)
+; CHECK-NEXT: [[CMP2_1:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
 ; CHECK-NEXT: [[ZEXT4_1:%.*]] = zext i1 [[CMP2_1]] to i16
-; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT3_1]], [[ZEXT4_1]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw i16 [[ZEXT4_1]], [[ZEXT3_1]]
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
 ; CHECK-NEXT: [[ZEXT3_2:%.*]] = zext i1 [[TMP11]] to i16
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
-; CHECK-NEXT: [[CMP2_2:%.*]] = icmp ne i64 [[TMP13]], 196605
+; CHECK-NEXT: [[CMP2_2:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
 ; CHECK-NEXT: [[ZEXT4_2:%.*]] = zext i1 [[CMP2_2]] to i16
 ; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i16 [[ADD1]], [[ZEXT4_2]]
 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD2]], [[ZEXT3_2]]
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
index c43b79e138a30..6fc41ff1477fc 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes=slp-vectorizer,dce,instcombine -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
-; RUN: cat %t | FileCheck -check-prefix=YAML %s
-; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-7 -pass-remarks-output=%t < %s | FileCheck %s
+; RUN: opt -S -mtriple=aarch64--linux-gnu -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-9 -pass-remarks-output=%t < %s | FileCheck %s
 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
 
 ; These tests check that we remove from consideration pairs of seed
@@ -20,13 +18,23 @@
 ;
 ; YAML-LABEL: Function: getelementptr_4x32
+; YAML: --- !Passed
+; YAML-NEXT: Pass: slp-vectorizer
+; YAML-NEXT: Name: VectorizedHorizontalReduction
+; YAML-NEXT: Function: getelementptr_4x32
+; YAML-NEXT: Args:
+; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
+; YAML-NEXT: - Cost: '7'
+; YAML-NEXT: - String: ' and with tree size '
+; YAML-NEXT: - TreeSize: '1'
+
 ; YAML: --- !Passed
 ; YAML-NEXT: Pass: slp-vectorizer
 ; YAML-NEXT: Name: VectorizedList
 ; YAML-NEXT: Function: getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'SLP vectorized with cost '
-; YAML-NEXT: - Cost: '4'
+; YAML-NEXT: - Cost: '6'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '3'
 
@@ -36,7 +44,7 @@
 ; YAML-NEXT: Function: getelementptr_4x32
 ; YAML-NEXT: Args:
 ; YAML-NEXT: - String: 'SLP vectorized with cost '
-; YAML-NEXT: - Cost: '6'
+; YAML-NEXT: - Cost: '8'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '3'
 
@@ -66,23 +74,25 @@ define i32 @getelementptr_4x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y
 ; CHECK-NEXT: [[TMP7:%.*]] = zext nneg i32 [[TMP6]] to i64
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[G:%.*]], i64 [[TMP7]]
 ; CHECK-NEXT: [[T6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[SUM_032]]
 ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i64 1
 ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]]
 ; CHECK-NEXT: [[T8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
 ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i64 0
 ; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP12]]
 ; CHECK-NEXT: [[T10:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4
-; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]]
 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i64 1
 ; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP14]]
 ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4
-; CHECK-NEXT: [[ADD16]] = add nsw i32 [[ADD11]], [[T12]]
+; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> poison, i32 [[T6]], i64 0
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[T8]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[T10]], i64 2
+; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[T12]], i64 3
+; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP18]])
+; CHECK-NEXT: [[ADD16]] = add i32 [[TMP19]], [[SUM_032]]
 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
@@ -133,7 +143,7 @@ for.body:
 ; YAML: Function: getelementptr_2x32
 ; YAML: Args:
 ; YAML: - String: 'SLP vectorized with cost '
-; YAML: - Cost: '4'
+; YAML: - Cost: '6'
 ; YAML-NEXT: - String: ' and with tree size '
 ; YAML-NEXT: - TreeSize: '3'
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
index 9f5744b17cb79..22cc5b41873f9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/memory-runtime-checks.ll
@@ -597,18 +597,30 @@ bb15: ; preds = %bb15, %bb14
 
 ; Some points we collected as candidates for runtime checks have been removed
 ; before generating runtime checks. Make sure versioning is skipped.
-define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
+define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c, float %f, float %g) {
 ; CHECK-LABEL: @test_bounds_removed_before_runtime_checks(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: store <2 x i32> <i32 10, i32 300>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = fmul float [[F:%.*]], 2.000000e+01
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = fmul float [[G:%.*]], 2.000000e+01
+; CHECK-NEXT: [[TMP4:%.*]] = fptosi float [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 100, [[TMP2]]
+; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP2]], i32 10
+; CHECK-NEXT: [[TMP7:%.*]] = select i1 false, i32 0, i32 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i32 200, [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i32 [[TMP4]], i32 300
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 false, i32 0, i32 [[TMP9]]
+; CHECK-NEXT: store i32 [[TMP7]], ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 1
+; CHECK-NEXT: store i32 [[TMP10]], ptr [[TMP12]], align 4
 ; CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[B:%.*]], align 8
 ; CHECK-NEXT: br i1 [[C:%.*]], label [[BB23:%.*]], label [[BB14:%.*]]
 ; CHECK: bb14:
-; CHECK-NEXT: [[TMP15:%.*]] = sext i32 10 to i64
+; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP7]] to i64
 ; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 2, [[TMP15]]
 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i64 [[TMP16]]
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP17]], i64 3
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT:%.*]], ptr [[A]], i64 0, i32 2
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 2
 ; CHECK-NEXT: store float 0.000000e+00, ptr [[TMP20]], align 8
 ; CHECK-NEXT: [[TMP21:%.*]] = load i8, ptr [[TMP19]], align 1
 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT]], ptr [[A]], i64 0, i32 3
@@ -618,9 +630,9 @@ define void @test_bounds_removed_before_runtime_checks(ptr %A, ptr %B, i1 %c) {
 ; CHECK-NEXT: ret void
 ;
 entry:
-  %tmp1 = fmul float 10.0, 20.0
+  %tmp1 = fmul float %f, 20.0
   %tmp2 = fptosi float %tmp1 to i32
-  %tmp3 = fmul float 30.0, 20.0
+  %tmp3 = fmul float %g, 20.0
   %tmp4 = fptosi float %tmp3 to i32
   %tmp5 = icmp sgt i32 100, %tmp2
   %tmp6 = select i1 %tmp5, i32 %tmp2, i32 10
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
index 07411cacb3626..8561a00490bfa 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll
@@ -14,159 +14,387 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-LABEL: @straight(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]]
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT: [[ADD11_1:%.*]] = add nuw i32 [[MUL_1]], [[MUL]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 5
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 6
+; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ADD_PTR]], align 2
+; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]]
+; CHECK-NEXT: [[MUL_142:%.*]] = mul nuw nsw i32 [[CONV_140]], [[CONV_140]]
+; CHECK-NEXT: [[ADD11_143:%.*]] = add i32 [[MUL_142]], [[ADD11_7]]
+; CHECK-NEXT: [[ARRAYIDX_1_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_1_1]], align 2
+; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]]
+; CHECK-NEXT: [[MUL_1_1:%.*]] = mul nuw nsw i32 [[CONV_1_1]], [[CONV_1_1]]
+; CHECK-NEXT: [[ADD11_1_1:%.*]] = add i32 [[MUL_1_1]], [[ADD11_143]]
+; CHECK-NEXT: [[ARRAYIDX_2_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 2
+; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_2_1]], align 2
+; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]]
+; CHECK-NEXT: [[MUL_2_1:%.*]] = mul nuw nsw i32 [[CONV_2_1]], [[CONV_2_1]]
+; CHECK-NEXT: [[ADD11_2_1:%.*]] = add i32 [[MUL_2_1]], [[ADD11_1_1]]
+; CHECK-NEXT: [[ARRAYIDX_3_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 3
+; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_3_1]], align 2
+; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]]
+; CHECK-NEXT: [[MUL_3_1:%.*]] = mul nuw nsw i32 [[CONV_3_1]], [[CONV_3_1]]
+; CHECK-NEXT: [[ADD11_3_1:%.*]] = add i32 [[MUL_3_1]], [[ADD11_2_1]]
+; CHECK-NEXT: [[ARRAYIDX_4_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 4
+; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_4_1]], align 2
+; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]]
+; CHECK-NEXT: [[MUL_4_1:%.*]] = mul nuw nsw i32 [[CONV_4_1]], [[CONV_4_1]]
+; CHECK-NEXT: [[ADD11_4_1:%.*]] = add i32 [[MUL_4_1]], [[ADD11_3_1]]
+; CHECK-NEXT: [[ARRAYIDX_5_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 5
+; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_5_1]], align 2
+; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]]
+; CHECK-NEXT: [[MUL_5_1:%.*]] = mul nuw nsw i32 [[CONV_5_1]], [[CONV_5_1]]
+; CHECK-NEXT: [[ADD11_5_1:%.*]] = add i32 [[MUL_5_1]], [[ADD11_4_1]]
+; CHECK-NEXT: [[ARRAYIDX_6_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 6
+; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_6_1]], align 2
+; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]]
+; CHECK-NEXT: [[MUL_6_1:%.*]] = mul nuw nsw i32 [[CONV_6_1]], [[CONV_6_1]]
+; CHECK-NEXT: [[ADD11_6_1:%.*]] = add i32 [[MUL_6_1]], [[ADD11_5_1]]
+; CHECK-NEXT: [[ARRAYIDX_7_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 7
+; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_7_1]], align 2
+; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]]
+; CHECK-NEXT: [[MUL_7_1:%.*]] = mul nuw nsw i32 [[CONV_7_1]], [[CONV_7_1]]
+; CHECK-NEXT: [[ADD11_7_1:%.*]] = add i32 [[MUL_7_1]], [[ADD11_6_1]]
 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i16, ptr [[ADD_PTR_1]], align 2
+; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]]
+; CHECK-NEXT: [[MUL_246:%.*]] = mul nuw nsw i32 [[CONV_244]], [[CONV_244]]
+; CHECK-NEXT: [[ADD11_247:%.*]] = add i32 [[MUL_246]], [[ADD11_7_1]]
+; CHECK-NEXT: [[ARRAYIDX_1_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 1
+; CHECK-NEXT: [[TMP17:%.*]] = load i16, ptr [[ARRAYIDX_1_2]], align 2
+; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP17]] to i32
+; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]]
+; CHECK-NEXT: [[MUL_1_2:%.*]] = mul nuw nsw i32 [[CONV_1_2]], [[CONV_1_2]]
+; CHECK-NEXT: [[ADD11_1_2:%.*]] = add i32 [[MUL_1_2]], [[ADD11_247]]
+; CHECK-NEXT: [[ARRAYIDX_2_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 2
+; CHECK-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_2_2]], align 2
+; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP18]] to i32
+; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]]
+; CHECK-NEXT: [[MUL_2_2:%.*]] = mul nuw nsw i32 [[CONV_2_2]], [[CONV_2_2]]
+; CHECK-NEXT: [[ADD11_2_2:%.*]] = add i32 [[MUL_2_2]], [[ADD11_1_2]]
+; CHECK-NEXT: [[ARRAYIDX_3_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 3
+; CHECK-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_3_2]], align 2
+; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP19]] to i32
+; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]]
+; CHECK-NEXT: [[MUL_3_2:%.*]] = mul nuw nsw i32 [[CONV_3_2]], [[CONV_3_2]]
+; CHECK-NEXT: [[ADD11_3_2:%.*]] = add i32 [[MUL_3_2]], [[ADD11_2_2]]
+; CHECK-NEXT: [[ARRAYIDX_4_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 4
+; CHECK-NEXT: [[TMP20:%.*]] = load i16, ptr [[ARRAYIDX_4_2]], align 2
+; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP20]] to i32
+; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]]
+; CHECK-NEXT: [[MUL_4_2:%.*]] = mul nuw nsw i32 [[CONV_4_2]], [[CONV_4_2]]
+; CHECK-NEXT: [[ADD11_4_2:%.*]] = add i32 [[MUL_4_2]], [[ADD11_3_2]]
+; CHECK-NEXT: [[ARRAYIDX_5_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 5
+; CHECK-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_5_2]], align 2
+; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP21]] to i32
+; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]]
+; CHECK-NEXT: [[MUL_5_2:%.*]] = mul nuw nsw i32 [[CONV_5_2]], [[CONV_5_2]]
+; CHECK-NEXT: [[ADD11_5_2:%.*]] = add i32 [[MUL_5_2]], [[ADD11_4_2]]
+; CHECK-NEXT: [[ARRAYIDX_6_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 6
+; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_6_2]], align 2
+; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP22]] to i32
+; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]]
+; CHECK-NEXT: [[MUL_6_2:%.*]] = mul nuw nsw i32 [[CONV_6_2]], [[CONV_6_2]]
+; CHECK-NEXT: [[ADD11_6_2:%.*]] = add i32 [[MUL_6_2]], [[ADD11_5_2]]
+; CHECK-NEXT: [[ARRAYIDX_7_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 7
+; CHECK-NEXT: [[TMP23:%.*]] = load i16, ptr [[ARRAYIDX_7_2]], align 2
+; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP23]] to i32
+; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]]
+; CHECK-NEXT: [[MUL_7_2:%.*]] = mul nuw nsw i32 [[CONV_7_2]], [[CONV_7_2]]
+; CHECK-NEXT: [[ADD11_7_2:%.*]] = add i32 [[MUL_7_2]], [[ADD11_6_2]]
 ; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP24:%.*]] = load i16, ptr [[ADD_PTR_2]], align 2
+; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP24]] to i32
+; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]]
+; CHECK-NEXT: [[MUL_350:%.*]] = mul nuw nsw i32 [[CONV_348]], [[CONV_348]]
+; CHECK-NEXT: [[ADD11_351:%.*]] = add i32 [[MUL_350]], [[ADD11_7_2]]
+; CHECK-NEXT: [[ARRAYIDX_1_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 1
+; CHECK-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX_1_3]], align 2
+; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP25]] to i32
+; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]]
+; CHECK-NEXT: [[MUL_1_3:%.*]] = mul nuw nsw i32 [[CONV_1_3]], [[CONV_1_3]]
+; CHECK-NEXT: [[ADD11_1_3:%.*]] = add i32 [[MUL_1_3]], [[ADD11_351]]
+; CHECK-NEXT: [[ARRAYIDX_2_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 2
+; CHECK-NEXT: [[TMP26:%.*]] = load i16, ptr [[ARRAYIDX_2_3]], align 2
+; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP26]] to i32
+; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]]
+; CHECK-NEXT: [[MUL_2_3:%.*]] = mul nuw nsw i32 [[CONV_2_3]], [[CONV_2_3]]
+; CHECK-NEXT: [[ADD11_2_3:%.*]] = add i32 [[MUL_2_3]], [[ADD11_1_3]]
+; CHECK-NEXT: [[ARRAYIDX_3_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 3
+; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX_3_3]], align 2
+; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP27]] to i32
+; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]]
+; CHECK-NEXT: [[MUL_3_3:%.*]] = mul nuw nsw i32 [[CONV_3_3]], [[CONV_3_3]]
+; CHECK-NEXT: [[ADD11_3_3:%.*]] = add i32 [[MUL_3_3]], [[ADD11_2_3]]
+; CHECK-NEXT: [[ARRAYIDX_4_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 4
+; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX_4_3]], align 2
+; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP28]] to i32
+; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]]
+; CHECK-NEXT: [[MUL_4_3:%.*]] = mul nuw nsw i32 [[CONV_4_3]], [[CONV_4_3]]
+; CHECK-NEXT: [[ADD11_4_3:%.*]] = add i32 [[MUL_4_3]], [[ADD11_3_3]]
+; CHECK-NEXT: [[ARRAYIDX_5_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 5
+; CHECK-NEXT: [[TMP29:%.*]] = load i16, ptr [[ARRAYIDX_5_3]], align 2
+; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP29]] to i32
+; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]]
+; CHECK-NEXT: [[MUL_5_3:%.*]] = mul nuw nsw i32 [[CONV_5_3]], [[CONV_5_3]]
+; CHECK-NEXT: [[ADD11_5_3:%.*]] = add i32 [[MUL_5_3]], [[ADD11_4_3]]
+; CHECK-NEXT: [[ARRAYIDX_6_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 6
+; CHECK-NEXT: [[TMP30:%.*]] = load i16, ptr [[ARRAYIDX_6_3]], align 2
+; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP30]] to i32
+; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]]
+; CHECK-NEXT: [[MUL_6_3:%.*]] = mul nuw nsw i32 [[CONV_6_3]], [[CONV_6_3]]
+; CHECK-NEXT: [[ADD11_6_3:%.*]] = add i32 [[MUL_6_3]], [[ADD11_5_3]]
+; CHECK-NEXT: [[ARRAYIDX_7_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 7
+; CHECK-NEXT: [[TMP31:%.*]] = load i16, ptr [[ARRAYIDX_7_3]], align 2
+; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP31]] to i32
+; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]]
+; CHECK-NEXT: [[MUL_7_3:%.*]] = mul nuw nsw i32 [[CONV_7_3]], [[CONV_7_3]]
+; CHECK-NEXT: [[ADD11_7_3:%.*]] = add i32 [[MUL_7_3]], [[ADD11_6_3]]
 ; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP32:%.*]] = load i16, ptr [[ADD_PTR_3]], align 2
+; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP32]] to i32
+; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]]
+; CHECK-NEXT: [[MUL_454:%.*]] = mul nuw nsw i32 [[CONV_452]], [[CONV_452]]
+; CHECK-NEXT: [[ADD11_455:%.*]] = add i32 [[MUL_454]], [[ADD11_7_3]]
+; CHECK-NEXT: [[ARRAYIDX_1_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 1
+; CHECK-NEXT: [[TMP33:%.*]] = load i16, ptr [[ARRAYIDX_1_4]], align 2
+; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP33]] to i32
+; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]]
+; CHECK-NEXT: [[MUL_1_4:%.*]] = mul nuw nsw i32 [[CONV_1_4]], [[CONV_1_4]]
+; CHECK-NEXT: [[ADD11_1_4:%.*]] = add i32 [[MUL_1_4]], [[ADD11_455]]
+; CHECK-NEXT: [[ARRAYIDX_2_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 2
+; CHECK-NEXT: [[TMP34:%.*]] = load i16, ptr [[ARRAYIDX_2_4]], align 2
+; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP34]] to i32
+; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]]
+; CHECK-NEXT: [[MUL_2_4:%.*]] = mul nuw nsw i32 [[CONV_2_4]], [[CONV_2_4]]
+; CHECK-NEXT: [[ADD11_2_4:%.*]] = add i32 [[MUL_2_4]], [[ADD11_1_4]]
+; CHECK-NEXT: [[ARRAYIDX_3_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 3
+; CHECK-NEXT: [[TMP35:%.*]] = load i16, ptr [[ARRAYIDX_3_4]], align 2
+; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP35]] to i32
+; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]]
+; CHECK-NEXT: [[MUL_3_4:%.*]] = mul nuw nsw i32 [[CONV_3_4]], [[CONV_3_4]]
+; CHECK-NEXT: [[ADD11_3_4:%.*]] = add i32 [[MUL_3_4]], [[ADD11_2_4]]
+; CHECK-NEXT: [[ARRAYIDX_4_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 4
+; CHECK-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_4_4]], align 2
+; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP36]] to i32
+; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]]
+; CHECK-NEXT: [[MUL_4_4:%.*]] = mul nuw nsw i32 [[CONV_4_4]], [[CONV_4_4]]
+; CHECK-NEXT: [[ADD11_4_4:%.*]] = add i32 [[MUL_4_4]], [[ADD11_3_4]]
+; CHECK-NEXT: [[ARRAYIDX_5_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 5
+; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX_5_4]], align 2
+; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP37]] to i32
+; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]]
+; CHECK-NEXT: [[MUL_5_4:%.*]] = mul nuw nsw i32 [[CONV_5_4]], [[CONV_5_4]]
+; CHECK-NEXT: [[ADD11_5_4:%.*]] = add i32 [[MUL_5_4]], [[ADD11_4_4]]
+; CHECK-NEXT: [[ARRAYIDX_6_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 6
+; CHECK-NEXT: [[TMP38:%.*]] = load i16, ptr [[ARRAYIDX_6_4]], align 2
+; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP38]] to i32
+; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]]
+; CHECK-NEXT: [[MUL_6_4:%.*]] = mul nuw nsw i32 [[CONV_6_4]], [[CONV_6_4]]
+; CHECK-NEXT: [[ADD11_6_4:%.*]] = add i32 [[MUL_6_4]], [[ADD11_5_4]]
+; CHECK-NEXT: [[ARRAYIDX_7_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 7
+; CHECK-NEXT: [[TMP39:%.*]] = load i16, ptr [[ARRAYIDX_7_4]], align 2
+; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP39]] to i32
+; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]]
+; CHECK-NEXT: [[MUL_7_4:%.*]] = mul nuw nsw i32 [[CONV_7_4]], [[CONV_7_4]]
+; CHECK-NEXT: [[ADD11_7_4:%.*]] = add i32 [[MUL_7_4]], [[ADD11_6_4]]
 ; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP40:%.*]] = load i16, ptr [[ADD_PTR_4]], align 2
+; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP40]] to i32
+; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]]
+; CHECK-NEXT: [[MUL_558:%.*]] = mul nuw nsw i32 [[CONV_556]], [[CONV_556]]
+; CHECK-NEXT: [[ADD11_559:%.*]] = add i32 [[MUL_558]], [[ADD11_7_4]]
+; CHECK-NEXT: [[ARRAYIDX_1_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 1
+; CHECK-NEXT: [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX_1_5]], align 2
+; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP41]] to i32
+; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]]
+; CHECK-NEXT: [[MUL_1_5:%.*]] = mul nuw nsw i32 [[CONV_1_5]], [[CONV_1_5]]
+; CHECK-NEXT: [[ADD11_1_5:%.*]] = add i32 [[MUL_1_5]], [[ADD11_559]]
+; CHECK-NEXT: [[ARRAYIDX_2_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 2
+; CHECK-NEXT: [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX_2_5]], align 2
+; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP42]] to i32
+; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]]
+; CHECK-NEXT: [[MUL_2_5:%.*]] = mul nuw nsw i32 [[CONV_2_5]], [[CONV_2_5]]
+; CHECK-NEXT: [[ADD11_2_5:%.*]] = add i32 [[MUL_2_5]], [[ADD11_1_5]]
+; CHECK-NEXT: [[ARRAYIDX_3_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 3
+; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[ARRAYIDX_3_5]], align 2
+; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP43]] to i32
+; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]]
+; CHECK-NEXT: [[MUL_3_5:%.*]] = mul nuw nsw i32 [[CONV_3_5]], [[CONV_3_5]]
+; CHECK-NEXT: [[ADD11_3_5:%.*]] = add i32 [[MUL_3_5]], [[ADD11_2_5]]
+; CHECK-NEXT: [[ARRAYIDX_4_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 4
+; CHECK-NEXT: [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_4_5]], align 2
+; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP44]] to i32
+; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]]
+; CHECK-NEXT: [[MUL_4_5:%.*]] = mul nuw nsw i32 [[CONV_4_5]], [[CONV_4_5]]
+; CHECK-NEXT: [[ADD11_4_5:%.*]] = add i32 [[MUL_4_5]], [[ADD11_3_5]]
+; CHECK-NEXT: [[ARRAYIDX_5_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 5
+; CHECK-NEXT: [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX_5_5]], align 2
+; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP45]] to i32
+; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]]
+; CHECK-NEXT: [[MUL_5_5:%.*]] = mul nuw nsw i32 [[CONV_5_5]], [[CONV_5_5]]
+; CHECK-NEXT: [[ADD11_5_5:%.*]] = add i32 [[MUL_5_5]], [[ADD11_4_5]]
+; CHECK-NEXT: [[ARRAYIDX_6_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 6
+; CHECK-NEXT: [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX_6_5]], align 2
+; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP46]] to i32
+; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]]
+; CHECK-NEXT: [[MUL_6_5:%.*]] = mul nuw nsw i32 [[CONV_6_5]], [[CONV_6_5]]
+; CHECK-NEXT: [[ADD11_6_5:%.*]] = add i32 [[MUL_6_5]], [[ADD11_5_5]]
+; CHECK-NEXT: [[ARRAYIDX_7_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 7
+; CHECK-NEXT: [[TMP47:%.*]] = load i16, ptr [[ARRAYIDX_7_5]], align 2
+; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP47]] to i32
+; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]]
+; CHECK-NEXT: [[MUL_7_5:%.*]] = mul nuw nsw i32 [[CONV_7_5]], [[CONV_7_5]]
+; CHECK-NEXT: [[ADD11_7_5:%.*]] = add i32 [[MUL_7_5]], [[ADD11_6_5]]
 ; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]]
+; CHECK-NEXT: [[TMP48:%.*]] = load i16, ptr [[ADD_PTR_5]], align 2
+; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP48]] to i32
+; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]]
+; CHECK-NEXT: [[MUL_662:%.*]] = mul nuw nsw i32 [[CONV_660]], [[CONV_660]]
+; CHECK-NEXT: [[ADD11_663:%.*]] = add i32 [[MUL_662]], [[ADD11_7_5]]
+; CHECK-NEXT: [[ARRAYIDX_1_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 1
+; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX_1_6]], align 2
+; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP49]] to i32
+; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[CONV_1_6]]
+; CHECK-NEXT: [[MUL_1_6:%.*]] = mul nuw nsw i32 [[CONV_1_6]], [[CONV_1_6]]
+; CHECK-NEXT: [[ADD11_1_6:%.*]] = add i32 [[MUL_1_6]], [[ADD11_663]]
+; CHECK-NEXT: [[ARRAYIDX_2_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 2
+; CHECK-NEXT: [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX_2_6]], align 2
+; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP50]] to i32
+; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]]
+; CHECK-NEXT: [[MUL_2_6:%.*]] = mul nuw nsw i32 [[CONV_2_6]], [[CONV_2_6]]
+; CHECK-NEXT: [[ADD11_2_6:%.*]] = add i32 [[MUL_2_6]], [[ADD11_1_6]]
+; CHECK-NEXT: [[ARRAYIDX_3_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 3
+; CHECK-NEXT: [[TMP51:%.*]] = load i16, ptr [[ARRAYIDX_3_6]], align 2
+; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP51]] to i32
+; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]]
+; CHECK-NEXT: [[MUL_3_6:%.*]] = mul nuw nsw i32 [[CONV_3_6]], [[CONV_3_6]]
+; CHECK-NEXT: [[ADD11_3_6:%.*]] = add i32 [[MUL_3_6]], [[ADD11_2_6]]
+; CHECK-NEXT: [[ARRAYIDX_4_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 4
+; CHECK-NEXT: [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_4_6]], align 2
+; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP52]] to i32
+; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]]
+; CHECK-NEXT: [[MUL_4_6:%.*]] = mul nuw nsw i32 [[CONV_4_6]], [[CONV_4_6]]
+; CHECK-NEXT: [[ADD11_4_6:%.*]] = add i32 [[MUL_4_6]], [[ADD11_3_6]]
+; CHECK-NEXT: [[ARRAYIDX_5_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 5
+; CHECK-NEXT: [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX_5_6]], align 2
+; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP53]] to i32
+; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]]
+; CHECK-NEXT: [[MUL_5_6:%.*]] = mul nuw nsw i32 [[CONV_5_6]], [[CONV_5_6]]
+; CHECK-NEXT: [[ADD11_5_6:%.*]] = add i32 [[MUL_5_6]], [[ADD11_4_6]]
+; CHECK-NEXT: [[ARRAYIDX_6_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 6
+; CHECK-NEXT: [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX_6_6]], align 2
+; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP54]] to i32
+; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]]
+; CHECK-NEXT: [[MUL_6_6:%.*]] = mul nuw nsw i32 [[CONV_6_6]], [[CONV_6_6]]
+; CHECK-NEXT: [[ADD11_6_6:%.*]] = add i32 [[MUL_6_6]], [[ADD11_5_6]]
+; CHECK-NEXT: [[ARRAYIDX_7_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 7
+; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr [[ARRAYIDX_7_6]], align 2
+; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP55]] to i32
+; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]]
+; CHECK-NEXT: [[MUL_7_6:%.*]] = mul nuw nsw i32 [[CONV_7_6]], [[CONV_7_6]]
+; CHECK-NEXT: [[ADD11_7_6:%.*]] = add i32 [[MUL_7_6]], [[ADD11_6_6]]
 ; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2
-; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2
-; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2
-; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2
-; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0)
-; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8)
-; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16)
-; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24)
-; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32)
-; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40)
-; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48)
-; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56)
-; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32>
-; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1
-; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]]
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2
-; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3
-; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4
-; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5
-; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6
-; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]]
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7
-; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]]
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8
-; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]]
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9
-; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]]
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10
-; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]]
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11
-; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12
-; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]]
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13
-; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]]
-; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14
-; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15
-; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]]
-; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16
-; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]]
-; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17
-; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]]
-; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18
-; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]]
-; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19
-; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]]
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20
-; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]]
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21
-; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]]
-; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22
-; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]]
-; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23
-; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]]
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24
-; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]]
-; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25
-; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]]
-; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26
-; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]]
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27
-; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]]
-; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28
-; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]]
-; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29
-; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]]
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30
-; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]]
-; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31
-; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]]
-; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32
-; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]]
-; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33
-; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]]
-; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34
-; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]]
-; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35
-; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]]
-; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36
-; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]]
-; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37
-; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]]
-; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38
-; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]]
-; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39
-; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]]
-; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40
-; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]]
-; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41
-; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]]
-; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42
-; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]]
-; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43
-; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]]
-; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44
-; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]]
-; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45
-; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]]
-; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46
-; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]]
-; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47
-; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]]
-; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48
-; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]]
-; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49
-; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]]
-; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50
-; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]]
-; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51
-; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]]
-; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52
-; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]]
-; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53
-; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]]
-; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54
-; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]]
-; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55
-; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]]
-; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56
+; CHECK-NEXT: [[TMP56:%.*]] = load i16, ptr [[ADD_PTR_6]], align 2
+; CHECK-NEXT: [[TMP74:%.*]] = zext i16 [[TMP56]] to i32
 ; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]]
-; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57
+; CHECK-NEXT: [[MUL_766:%.*]] = mul nuw nsw i32 [[TMP74]], [[TMP74]]
+; CHECK-NEXT: [[ADD11_767:%.*]] = add i32 [[MUL_766]], [[ADD11_7_6]]
+; CHECK-NEXT: [[ARRAYIDX_1_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 1
+; CHECK-NEXT: [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX_1_7]], align 2
+; CHECK-NEXT: [[TMP75:%.*]] = zext i16 [[TMP57]] to i32
 ; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]]
-; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58
+; CHECK-NEXT: [[MUL_1_7:%.*]] = mul nuw nsw i32 [[TMP75]], [[TMP75]]
+; CHECK-NEXT: [[ADD11_1_7:%.*]] = add i32 [[MUL_1_7]], [[ADD11_767]]
+; CHECK-NEXT: [[ARRAYIDX_2_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 2
+; CHECK-NEXT: [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX_2_7]], align 2
+; CHECK-NEXT: [[TMP76:%.*]] = zext i16 [[TMP58]] to i32
 ; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]]
-; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59
+; CHECK-NEXT: [[MUL_2_7:%.*]] = mul nuw nsw i32 [[TMP76]], [[TMP76]]
+; CHECK-NEXT: [[ADD11_2_7:%.*]] = add i32 [[MUL_2_7]], [[ADD11_1_7]]
+; CHECK-NEXT: [[ARRAYIDX_3_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 3
+; CHECK-NEXT: [[TMP59:%.*]] = load i16, ptr [[ARRAYIDX_3_7]], align 2
+; CHECK-NEXT: [[TMP77:%.*]] = zext i16 [[TMP59]] to i32
 ; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]]
-; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60
+; CHECK-NEXT: [[MUL_3_7:%.*]] = mul nuw nsw i32 [[TMP77]], [[TMP77]]
+; CHECK-NEXT: [[ADD11_3_7:%.*]] = add i32 [[MUL_3_7]], [[ADD11_2_7]]
+; CHECK-NEXT: [[ARRAYIDX_4_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 4
+; CHECK-NEXT: [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_4_7]], align 2
+; CHECK-NEXT: [[TMP78:%.*]] = zext i16 [[TMP60]] to i32
 ; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[TMP78]]
-; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61
+; CHECK-NEXT: [[MUL_4_7:%.*]] = mul nuw nsw i32 [[TMP78]], [[TMP78]]
+; CHECK-NEXT: [[ADD11_4_7:%.*]] = add i32 [[MUL_4_7]], [[ADD11_3_7]]
+; CHECK-NEXT: [[ARRAYIDX_5_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 5
+; CHECK-NEXT: [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX_5_7]], align 2
+; CHECK-NEXT: [[TMP79:%.*]] = zext i16 [[TMP61]] to i32
 ; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]]
-; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62
+; CHECK-NEXT: [[MUL_5_7:%.*]] = mul nuw nsw i32 [[TMP79]], [[TMP79]]
+; CHECK-NEXT: [[ADD11_5_7:%.*]] = add i32 [[MUL_5_7]], [[ADD11_4_7]]
+; CHECK-NEXT: [[ARRAYIDX_6_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 6
+; CHECK-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX_6_7]], align 2
+; CHECK-NEXT: [[TMP80:%.*]] = zext i16 [[TMP62]] to i32
 ; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]]
-; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63
+; CHECK-NEXT: [[MUL_6_7:%.*]] = mul nuw nsw i32 [[TMP80]], [[TMP80]]
+; CHECK-NEXT: [[ADD11_6_7:%.*]] = add i32 [[MUL_6_7]], [[ADD11_5_7]]
+; CHECK-NEXT: [[ARRAYIDX_7_7:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_6]], i64 7
+; CHECK-NEXT: [[TMP63:%.*]] = load i16, ptr [[ARRAYIDX_7_7]], align 2
+; CHECK-NEXT: [[TMP81:%.*]] = zext i16 [[TMP63]] to i32
 ; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]]
-; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]])
+; CHECK-NEXT: [[MUL_7_7:%.*]] = mul nuw nsw i32 [[TMP81]], [[TMP81]]
+; CHECK-NEXT: [[TMP82:%.*]] = add i32 [[MUL_7_7]], [[ADD11_6_7]]
 ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64
 ; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64
 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32
@@ -573,13 +801,101 @@ define i64 @looped(ptr nocapture noundef readonly %p, i32 noundef %st) {
 ; CHECK-NEXT: [[SQ_037:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT: [[SM_036:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[OP_RDX1:%.*]], [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT: [[P_ADDR_035:%.*]] = phi ptr [ [[P:%.*]], [[ENTRY]] ], [ [[ADD_PTR:%.*]], [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i16>, ptr [[P_ADDR_035]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = zext <16 x i16> [[TMP0]] to <16 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <16 x i32> [[TMP1]], [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
-; CHECK-NEXT: [[OP_RDX1]] = add i32 [[TMP3]], [[SM_036]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP2]])
-; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP4]], [[SQ_037]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P_ADDR_035]], align 2
+; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP0]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SM_036]], [[CONV]]
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV]], [[CONV]]
+; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[MUL]], [[SQ_037]]
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2
+; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[CONV_1]]
+; CHECK-NEXT: [[MUL_1:%.*]] = mul nuw nsw i32 [[CONV_1]], [[CONV_1]]
+; CHECK-NEXT: [[ADD11_1:%.*]] = add i32 [[MUL_1]], [[ADD11]]
+; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2
+; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[CONV_2]]
+; CHECK-NEXT: [[MUL_2:%.*]] = mul nuw nsw i32 [[CONV_2]], [[CONV_2]]
+; CHECK-NEXT: [[ADD11_2:%.*]] = add i32 [[MUL_2]], [[ADD11_1]]
+; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 3
+; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2
+; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[CONV_3]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul nuw nsw i32 [[CONV_3]], [[CONV_3]]
+; CHECK-NEXT: [[ADD11_3:%.*]] = add i32 [[MUL_3]], [[ADD11_2]]
+; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2
+; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP4]] to i32
+; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[CONV_4]]
+; CHECK-NEXT: [[MUL_4:%.*]] = mul nuw nsw i32 [[CONV_4]], [[CONV_4]]
+; CHECK-NEXT: [[ADD11_4:%.*]] = add i32 [[MUL_4]], [[ADD11_3]]
+; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 5
+; CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2
+; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP5]] to i32
+; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[ADD_4]], [[CONV_5]]
+; CHECK-NEXT: [[MUL_5:%.*]] = mul nuw nsw i32 [[CONV_5]], [[CONV_5]]
+; CHECK-NEXT: [[ADD11_5:%.*]] = add i32 [[MUL_5]], [[ADD11_4]]
+; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 6
+; CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2
+; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP6]] to i32
+; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[ADD_5]], [[CONV_6]]
+; CHECK-NEXT: [[MUL_6:%.*]] = mul nuw nsw i32 [[CONV_6]], [[CONV_6]]
+; CHECK-NEXT: [[ADD11_6:%.*]] = add i32 [[MUL_6]], [[ADD11_5]]
+; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2
+; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP7]] to i32
+; CHECK-NEXT: [[ADD_7:%.*]] = add i32 [[ADD_6]], [[CONV_7]]
+; CHECK-NEXT: [[MUL_7:%.*]] = mul nuw nsw i32 [[CONV_7]], [[CONV_7]]
+; CHECK-NEXT: [[ADD11_7:%.*]] = add i32 [[MUL_7]], [[ADD11_6]]
+; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 8
+; CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2
+; CHECK-NEXT: [[CONV_8:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT: [[ADD_8:%.*]] = add i32 [[ADD_7]], [[CONV_8]]
+; CHECK-NEXT: [[MUL_8:%.*]] = mul nuw nsw i32 [[CONV_8]], [[CONV_8]]
+; CHECK-NEXT: [[ADD11_8:%.*]] = add i32 [[MUL_8]], [[ADD11_7]]
+; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 9
+; CHECK-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2
+; CHECK-NEXT: [[CONV_9:%.*]] = zext i16 [[TMP9]] to i32
+; CHECK-NEXT: [[ADD_9:%.*]] = add i32 [[ADD_8]], [[CONV_9]]
+; CHECK-NEXT: [[MUL_9:%.*]] = mul nuw nsw i32 [[CONV_9]], [[CONV_9]]
+; CHECK-NEXT: [[ADD11_9:%.*]] = add i32 [[MUL_9]], [[ADD11_8]]
+; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 10
+; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2
+; CHECK-NEXT: [[CONV_10:%.*]] = zext i16 [[TMP10]] to i32
+; CHECK-NEXT: [[ADD_10:%.*]] = add i32 [[ADD_9]], [[CONV_10]]
+; CHECK-NEXT: [[MUL_10:%.*]] = mul nuw nsw i32 [[CONV_10]], [[CONV_10]]
+; CHECK-NEXT: [[ADD11_10:%.*]] = add i32 [[MUL_10]], [[ADD11_9]]
+; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 11
+; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2
+; CHECK-NEXT: [[CONV_11:%.*]] = zext i16 [[TMP11]] to i32
+; CHECK-NEXT: [[ADD_11:%.*]] = add i32 [[ADD_10]], [[CONV_11]]
+; CHECK-NEXT: [[MUL_11:%.*]] = mul nuw nsw i32 [[CONV_11]], [[CONV_11]]
+; CHECK-NEXT: [[ADD11_11:%.*]] = add i32 [[MUL_11]], [[ADD11_10]]
+; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 12
+; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2
+; CHECK-NEXT: [[CONV_12:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT: [[ADD_12:%.*]] = add i32 [[ADD_11]], [[CONV_12]]
+; CHECK-NEXT: [[MUL_12:%.*]] = mul nuw nsw i32 [[CONV_12]], [[CONV_12]]
+; CHECK-NEXT: [[ADD11_12:%.*]] = add i32 [[MUL_12]], [[ADD11_11]]
+; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 13
+; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2
+; CHECK-NEXT: [[CONV_13:%.*]] = zext i16 [[TMP13]] to i32
+; CHECK-NEXT: [[ADD_13:%.*]] = add i32 [[ADD_12]], [[CONV_13]]
+; CHECK-NEXT: [[MUL_13:%.*]] = mul nuw nsw i32 [[CONV_13]], [[CONV_13]]
+; CHECK-NEXT: [[ADD11_13:%.*]] = add i32 [[MUL_13]], [[ADD11_12]]
+; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 14
+; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2
+; CHECK-NEXT: [[CONV_14:%.*]] = zext i16 [[TMP14]] to i32
+; CHECK-NEXT: [[ADD_14:%.*]] = add i32 [[ADD_13]], [[CONV_14]]
+; CHECK-NEXT: [[MUL_14:%.*]] = mul nuw nsw i32 [[CONV_14]], [[CONV_14]]
+; CHECK-NEXT: [[ADD11_14:%.*]] = add i32 [[MUL_14]], [[ADD11_13]]
+; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 15
+; CHECK-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2
+; CHECK-NEXT: [[CONV_15:%.*]] = zext i16 [[TMP15]] to i32
+; CHECK-NEXT: [[OP_RDX1]] = add i32 [[ADD_14]], [[CONV_15]]
+; CHECK-NEXT: [[MUL_15:%.*]] = mul nuw nsw i32 [[CONV_15]], [[CONV_15]]
+; CHECK-NEXT: [[OP_RDX]] = add i32 [[MUL_15]], [[ADD11_14]]
 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[P_ADDR_035]], i64 [[IDX_EXT]]
 ; CHECK-NEXT: [[INC13]] = add nuw nsw i32 [[Y_038]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC13]], 16
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
index aeb82d800a2f7..3c2f9e4d0ab5d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll
@@ -4,17 +4,17 @@
 define void @test() {
 ; CHECK-LABEL: define void @test() {
 ; CHECK-NEXT: [[ENTRY:.*]]:
-; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1
+; CHECK-NEXT: [[G_2197_REAL32_PRE:%.*]] = load i32, ptr null, align 1
+; CHECK-NEXT: [[G_2197_IMAG33_PRE:%.*]] = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1
 ; CHECK-NEXT: br label %[[IF_END:.*]]
 ; CHECK: [[IF_THEN:.*]]:
 ; CHECK-NEXT: br label %[[IF_END]]
 ; CHECK: [[IF_END]]:
-; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[G_2197_IMAG33_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[G_2197_REAL32_PRE]], %[[ENTRY]] ], [ 0, %[[IF_THEN]] ]
 ; CHECK-NEXT: store i32 [[TMP2]], ptr null, align 1
 ; CHECK-NEXT: br label %[[TRAP:.*]]
 ; CHECK: [[BB3:.*:]]
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
 ; CHECK-NEXT: store i32 [[TMP4]], ptr null, align 1
 ; CHECK-NEXT: ret void
 ; CHECK: [[TRAP]]:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
index 3cb81b72d26a1..858f638e98730 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vector-getelementptr.ll
@@ -1,24 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
 ; RUN: opt -mtriple=arm64-apple-ios -S -passes=slp-vectorizer < %s | FileCheck %s
 
-; vectorization requires a vector GEP + extracts, but the cost is offset by being able to efficiently vectorize the rest of the tree
-define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
-; CHECK-LABEL: define void @should_vectorize_gep
+; vectorization requires a vector GEP + extracts, making the cost too high to vectorize.
+
+define void @shouldnt_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
+; CHECK-LABEL: define void @shouldnt_vectorize_gep
 ; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[BASE1]], align 2
-; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i32> [[TMP0]] to <4 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[BASE2]], align 2
-; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i32> [[TMP2]] to <4 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i64> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[TMP4]], i32 0
-; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i64> [[TMP4]], i32 1
-; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP4]], i32 2
-; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i64> [[TMP4]], i32 3
-; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[TMP8]]
+; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2
+; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64
+; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2
+; CHECK-NEXT: [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64
+; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]]
+; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1
+; CHECK-NEXT: [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1
+; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2
+; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64
+; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2
+; CHECK-NEXT: [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64
+; CHECK-NEXT: [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]]
+; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]]
+; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2
+; CHECK-NEXT: [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2
+; CHECK-NEXT: [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2
+; CHECK-NEXT: [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64
+; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2
+; CHECK-NEXT: [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64
+; CHECK-NEXT: [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]]
+; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]]
+; CHECK-NEXT: [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3
+; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3
+; CHECK-NEXT: [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2
+; CHECK-NEXT: [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64
+; CHECK-NEXT: [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2
+; CHECK-NEXT: [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64
+; CHECK-NEXT: [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]]
+; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]]
 ; CHECK-NEXT: call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
 ; CHECK-NEXT: ret void
 ;
diff --git 
a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index acfd4581f98f7..e91bd25034475 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -249,7 +249,7 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll index 9b6511d0d8284..d880c6b1783c8 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-opcode-sindle-bv.ll @@ -17,12 +17,12 @@ define <2 x i32> @test(i32 %arg) { ; AARCH64-LABEL: define <2 x i32> @test( ; AARCH64-SAME: i32 [[ARG:%.*]]) { ; AARCH64-NEXT: bb: -; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[ARG]], i32 0 -; AARCH64-NEXT: [[TMP1:%.*]] = mul <2 x i32> [[TMP0]], zeroinitializer -; AARCH64-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; AARCH64-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; AARCH64-NEXT: [[TMP2:%.*]] = or i32 [[ARG]], 0 +; AARCH64-NEXT: [[TMP3:%.*]] = mul i32 0, 1 ; AARCH64-NEXT: [[MUL1:%.*]] = mul i32 [[TMP2]], [[TMP3]] ; AARCH64-NEXT: [[CMP:%.*]] = icmp ugt i32 0, [[MUL1]] +; AARCH64-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[TMP2]], i32 0 +; AARCH64-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[TMP3]], i32 1 ; AARCH64-NEXT: ret <2 x i32> [[TMP1]] ; bb: