@@ -12,9 +12,9 @@ target triple = "aarch64-unknown-linux-gnu"
1212; DEBUG: LV: Found maximum trip count: 19
1313; DEBUG: LV: IC is 1
1414; DEBUG-VS1: LV: VF is vscale x 16
15- ; DEBUG-VS1: Main Loop VF:vscale x 16, Main Loop UF:1, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1
15+ ; DEBUG-VS1: Main Loop VF:vscale x 16, Main Loop UF:1, Epilogue Loop VF:8, Epilogue Loop UF:1
1616; DEBUG-VS2: LV: VF is vscale x 8
17- ; DEBUG-VS2: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:vscale x 4 , Epilogue Loop UF:1
17+ ; DEBUG-VS2: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop VF:8 , Epilogue Loop UF:1
1818
1919; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
2020; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
@@ -48,9 +48,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
4848; CHECK-VS1-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
4949; CHECK-VS1-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
5050; CHECK-VS1-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
51- ; CHECK-VS1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
52- ; CHECK-VS1-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3
53- ; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
51+ ; CHECK-VS1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
5452; CHECK-VS1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
5553; CHECK-VS1: [[VECTOR_SCEVCHECK]]:
5654; CHECK-VS1-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
@@ -91,28 +89,24 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
9189; CHECK-VS1: [[VEC_EPILOG_ITER_CHECK]]:
9290; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
9391; CHECK-VS1-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
94- ; CHECK-VS1-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
95- ; CHECK-VS1-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 3
96- ; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
92+ ; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
9793; CHECK-VS1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
9894; CHECK-VS1: [[VEC_EPILOG_PH]]:
9995; CHECK-VS1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
100- ; CHECK-VS1-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
101- ; CHECK-VS1-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 8
102- ; CHECK-VS1-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
96+ ; CHECK-VS1-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 8
10397; CHECK-VS1-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
10498; CHECK-VS1-NEXT: [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
105- ; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT7 :%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
106- ; CHECK-VS1-NEXT: [[BROADCAST_SPLAT8 :%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT7 ]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
99+ ; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT4 :%.*]] = insertelement <8 x i8> poison, i8 [[CONV]], i64 0
100+ ; CHECK-VS1-NEXT: [[BROADCAST_SPLAT5 :%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT4 ]], <8 x i8> poison, <8 x i32> zeroinitializer
107101; CHECK-VS1-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
108102; CHECK-VS1: [[VEC_EPILOG_VECTOR_BODY]]:
109103; CHECK-VS1-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
110104; CHECK-VS1-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
111105; CHECK-VS1-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
112- ; CHECK-VS1-NEXT: [[WIDE_LOAD6 :%.*]] = load <vscale x 8 x i8>, ptr [[TMP33]], align 1
113- ; CHECK-VS1-NEXT: [[TMP35 :%.*]] = add <vscale x 8 x i8> [[WIDE_LOAD6 ]], [[BROADCAST_SPLAT8 ]]
114- ; CHECK-VS1-NEXT: store <vscale x 8 x i8> [[TMP35 ]], ptr [[TMP33]], align 1
115- ; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP29]]
106+ ; CHECK-VS1-NEXT: [[WIDE_LOAD7 :%.*]] = load <8 x i8>, ptr [[TMP33]], align 1
107+ ; CHECK-VS1-NEXT: [[TMP23 :%.*]] = add <8 x i8> [[WIDE_LOAD7 ]], [[BROADCAST_SPLAT5 ]]
108+ ; CHECK-VS1-NEXT: store <8 x i8> [[TMP23 ]], ptr [[TMP33]], align 1
109+ ; CHECK-VS1-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 8
116110; CHECK-VS1-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
117111; CHECK-VS1-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
118112; CHECK-VS1: [[VEC_EPILOG_MIDDLE_BLOCK]]:
@@ -148,9 +142,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
148142; CHECK-VS2-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1
149143; CHECK-VS2-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
150144; CHECK-VS2-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]]
151- ; CHECK-VS2-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
152- ; CHECK-VS2-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
153- ; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]]
145+ ; CHECK-VS2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8
154146; CHECK-VS2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
155147; CHECK-VS2: [[VECTOR_SCEVCHECK]]:
156148; CHECK-VS2-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1
@@ -191,28 +183,24 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
191183; CHECK-VS2: [[VEC_EPILOG_ITER_CHECK]]:
192184; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
193185; CHECK-VS2-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]]
194- ; CHECK-VS2-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64()
195- ; CHECK-VS2-NEXT: [[TMP27:%.*]] = shl nuw i64 [[TMP26]], 2
196- ; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP27]]
186+ ; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
197187; CHECK-VS2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
198188; CHECK-VS2: [[VEC_EPILOG_PH]]:
199189; CHECK-VS2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
200- ; CHECK-VS2-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
201- ; CHECK-VS2-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP28]], 4
202- ; CHECK-VS2-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], [[TMP29]]
190+ ; CHECK-VS2-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TMP3]], 8
203191; CHECK-VS2-NEXT: [[N_VEC3:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF2]]
204192; CHECK-VS2-NEXT: [[TMP39:%.*]] = add i64 [[TMP0]], [[N_VEC3]]
205- ; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT7 :%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[CONV]], i64 0
206- ; CHECK-VS2-NEXT: [[BROADCAST_SPLAT8 :%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT7 ]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
193+ ; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT4 :%.*]] = insertelement <8 x i8> poison, i8 [[CONV]], i64 0
194+ ; CHECK-VS2-NEXT: [[BROADCAST_SPLAT5 :%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT4 ]], <8 x i8> poison, <8 x i32> zeroinitializer
207195; CHECK-VS2-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
208196; CHECK-VS2: [[VEC_EPILOG_VECTOR_BODY]]:
209197; CHECK-VS2-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
210198; CHECK-VS2-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX5]]
211199; CHECK-VS2-NEXT: [[TMP33:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[OFFSET_IDX]]
212- ; CHECK-VS2-NEXT: [[WIDE_LOAD6 :%.*]] = load <vscale x 4 x i8>, ptr [[TMP33]], align 1
213- ; CHECK-VS2-NEXT: [[TMP35 :%.*]] = add <vscale x 4 x i8> [[WIDE_LOAD6 ]], [[BROADCAST_SPLAT8 ]]
214- ; CHECK-VS2-NEXT: store <vscale x 4 x i8> [[TMP35 ]], ptr [[TMP33]], align 1
215- ; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], [[TMP29]]
200+ ; CHECK-VS2-NEXT: [[WIDE_LOAD7 :%.*]] = load <8 x i8>, ptr [[TMP33]], align 1
201+ ; CHECK-VS2-NEXT: [[TMP23 :%.*]] = add <8 x i8> [[WIDE_LOAD7 ]], [[BROADCAST_SPLAT5 ]]
202+ ; CHECK-VS2-NEXT: store <8 x i8> [[TMP23 ]], ptr [[TMP33]], align 1
203+ ; CHECK-VS2-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 8
216204; CHECK-VS2-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC3]]
217205; CHECK-VS2-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
218206; CHECK-VS2: [[VEC_EPILOG_MIDDLE_BLOCK]]:
0 commit comments