@@ -580,6 +580,201 @@ exit:
580580 ret double %accum
581581}
582582
; Test input: a loop whose second load reads through a pointer chosen by a phi.
; Each iteration loads a float from %src at byte offset %iv + 1. If that value
; compares oeq to 0.0, the address used next is %src.2 + (%iv + 1) * %start
; (computed in %then); otherwise it is %src.2 itself. The phi in %loop.latch
; merges the two candidate addresses, a float is loaded through the merged
; pointer, and the result is stored to %dst at byte offset %iv.
; The loop repeats while %iv.2 (starts at %start, decremented by 1 per
; iteration) is signed-greater-than 100.
; The check lines with the I64 prefix match the loop in its original scalar
; shape (no vector blocks). The check lines with the I32 prefix match an
; 8-lane vector body in which the address phi shows up as a select (PREDPHI)
; and the loads and stores are emitted as per-lane scalar operations.
; NOTE(review): the RUN lines that define the two check prefixes are outside
; this view - confirm what configuration each prefix corresponds to.
583+ define void @loaded_address_used_by_load_through_blend (i64 %start , ptr noalias %src , ptr noalias %src.2 , ptr noalias %dst ) #0 {
584+ ; I64-LABEL: define void @loaded_address_used_by_load_through_blend(
585+ ; I64-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
586+ ; I64-NEXT: [[ENTRY:.*]]:
587+ ; I64-NEXT: br label %[[LOOP_HEADER:.*]]
588+ ; I64: [[LOOP_HEADER]]:
589+ ; I64-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
590+ ; I64-NEXT: [[IV_2:%.*]] = phi i64 [ [[START]], %[[ENTRY]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP_LATCH]] ]
591+ ; I64-NEXT: [[IV_1:%.*]] = add i64 [[IV]], 1
592+ ; I64-NEXT: [[GEP_SRC:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
593+ ; I64-NEXT: [[L_SRC:%.*]] = load float, ptr [[GEP_SRC]], align 4
594+ ; I64-NEXT: [[C:%.*]] = fcmp oeq float [[L_SRC]], 0.000000e+00
595+ ; I64-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]]
596+ ; I64: [[THEN]]:
597+ ; I64-NEXT: [[IV_MUL:%.*]] = mul i64 [[IV_1]], [[START]]
598+ ; I64-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[IV_MUL]]
599+ ; I64-NEXT: br label %[[LOOP_LATCH]]
600+ ; I64: [[LOOP_LATCH]]:
601+ ; I64-NEXT: [[MERGE_GEP:%.*]] = phi ptr [ [[GEP_SRC_2]], %[[THEN]] ], [ [[SRC_2]], %[[LOOP_HEADER]] ]
602+ ; I64-NEXT: [[L_2:%.*]] = load float, ptr [[MERGE_GEP]], align 4
603+ ; I64-NEXT: [[GEP_DST:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
604+ ; I64-NEXT: store float [[L_2]], ptr [[GEP_DST]], align 4
605+ ; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
606+ ; I64-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1
607+ ; I64-NEXT: [[EC:%.*]] = icmp sgt i64 [[IV_2]], 100
608+ ; I64-NEXT: br i1 [[EC]], label %[[LOOP_HEADER]], label %[[EXIT:.*]]
609+ ; I64: [[EXIT]]:
610+ ; I64-NEXT: ret void
611+ ;
612+ ; I32-LABEL: define void @loaded_address_used_by_load_through_blend(
613+ ; I32-SAME: i64 [[START:%.*]], ptr noalias [[SRC:%.*]], ptr noalias [[SRC_2:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0]] {
614+ ; I32-NEXT: [[ENTRY:.*:]]
615+ ; I32-NEXT: [[TMP0:%.*]] = add i64 [[START]], 1
616+ ; I32-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[START]], i64 100)
617+ ; I32-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN]]
618+ ; I32-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 8
619+ ; I32-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
620+ ; I32: [[VECTOR_PH]]:
621+ ; I32-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
622+ ; I32-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
623+ ; I32-NEXT: [[TMP2:%.*]] = sub i64 [[START]], [[N_VEC]]
624+ ; I32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[START]], i64 0
625+ ; I32-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
626+ ; I32-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x ptr> poison, ptr [[SRC_2]], i64 0
627+ ; I32-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x ptr> [[BROADCAST_SPLATINSERT1]], <8 x ptr> poison, <8 x i32> zeroinitializer
628+ ; I32-NEXT: br label %[[VECTOR_BODY:.*]]
629+ ; I32: [[VECTOR_BODY]]:
630+ ; I32-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
631+ ; I32-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
632+ ; I32-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
633+ ; I32-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
634+ ; I32-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
635+ ; I32-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 4
636+ ; I32-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 5
637+ ; I32-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 6
638+ ; I32-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 7
639+ ; I32-NEXT: [[TMP11:%.*]] = add i64 [[TMP3]], 1
640+ ; I32-NEXT: [[TMP12:%.*]] = add i64 [[TMP4]], 1
641+ ; I32-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], 1
642+ ; I32-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], 1
643+ ; I32-NEXT: [[TMP15:%.*]] = add i64 [[TMP7]], 1
644+ ; I32-NEXT: [[TMP16:%.*]] = add i64 [[TMP8]], 1
645+ ; I32-NEXT: [[TMP17:%.*]] = add i64 [[TMP9]], 1
646+ ; I32-NEXT: [[TMP18:%.*]] = add i64 [[TMP10]], 1
647+ ; I32-NEXT: [[TMP19:%.*]] = insertelement <8 x i64> poison, i64 [[TMP11]], i32 0
648+ ; I32-NEXT: [[TMP20:%.*]] = insertelement <8 x i64> [[TMP19]], i64 [[TMP12]], i32 1
649+ ; I32-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> [[TMP20]], i64 [[TMP13]], i32 2
650+ ; I32-NEXT: [[TMP22:%.*]] = insertelement <8 x i64> [[TMP21]], i64 [[TMP14]], i32 3
651+ ; I32-NEXT: [[TMP23:%.*]] = insertelement <8 x i64> [[TMP22]], i64 [[TMP15]], i32 4
652+ ; I32-NEXT: [[TMP24:%.*]] = insertelement <8 x i64> [[TMP23]], i64 [[TMP16]], i32 5
653+ ; I32-NEXT: [[TMP25:%.*]] = insertelement <8 x i64> [[TMP24]], i64 [[TMP17]], i32 6
654+ ; I32-NEXT: [[TMP26:%.*]] = insertelement <8 x i64> [[TMP25]], i64 [[TMP18]], i32 7
655+ ; I32-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
656+ ; I32-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
657+ ; I32-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
658+ ; I32-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP14]]
659+ ; I32-NEXT: [[TMP31:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP15]]
660+ ; I32-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP16]]
661+ ; I32-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP17]]
662+ ; I32-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP18]]
663+ ; I32-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP27]], align 4
664+ ; I32-NEXT: [[TMP36:%.*]] = load float, ptr [[TMP28]], align 4
665+ ; I32-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP29]], align 4
666+ ; I32-NEXT: [[TMP38:%.*]] = load float, ptr [[TMP30]], align 4
667+ ; I32-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP31]], align 4
668+ ; I32-NEXT: [[TMP40:%.*]] = load float, ptr [[TMP32]], align 4
669+ ; I32-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP33]], align 4
670+ ; I32-NEXT: [[TMP42:%.*]] = load float, ptr [[TMP34]], align 4
671+ ; I32-NEXT: [[TMP43:%.*]] = insertelement <8 x float> poison, float [[TMP35]], i32 0
672+ ; I32-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP36]], i32 1
673+ ; I32-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP37]], i32 2
674+ ; I32-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP38]], i32 3
675+ ; I32-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP39]], i32 4
676+ ; I32-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP40]], i32 5
677+ ; I32-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP41]], i32 6
678+ ; I32-NEXT: [[TMP50:%.*]] = insertelement <8 x float> [[TMP49]], float [[TMP42]], i32 7
679+ ; I32-NEXT: [[TMP51:%.*]] = fcmp oeq <8 x float> [[TMP50]], zeroinitializer
680+ ; I32-NEXT: [[TMP52:%.*]] = mul <8 x i64> [[TMP26]], [[BROADCAST_SPLAT]]
681+ ; I32-NEXT: [[TMP53:%.*]] = extractelement <8 x i64> [[TMP52]], i32 0
682+ ; I32-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP53]]
683+ ; I32-NEXT: [[TMP55:%.*]] = extractelement <8 x i64> [[TMP52]], i32 1
684+ ; I32-NEXT: [[TMP56:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP55]]
685+ ; I32-NEXT: [[TMP57:%.*]] = extractelement <8 x i64> [[TMP52]], i32 2
686+ ; I32-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP57]]
687+ ; I32-NEXT: [[TMP59:%.*]] = extractelement <8 x i64> [[TMP52]], i32 3
688+ ; I32-NEXT: [[TMP60:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP59]]
689+ ; I32-NEXT: [[TMP61:%.*]] = extractelement <8 x i64> [[TMP52]], i32 4
690+ ; I32-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP61]]
691+ ; I32-NEXT: [[TMP63:%.*]] = extractelement <8 x i64> [[TMP52]], i32 5
692+ ; I32-NEXT: [[TMP64:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP63]]
693+ ; I32-NEXT: [[TMP65:%.*]] = extractelement <8 x i64> [[TMP52]], i32 6
694+ ; I32-NEXT: [[TMP66:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP65]]
695+ ; I32-NEXT: [[TMP67:%.*]] = extractelement <8 x i64> [[TMP52]], i32 7
696+ ; I32-NEXT: [[TMP68:%.*]] = getelementptr i8, ptr [[SRC_2]], i64 [[TMP67]]
697+ ; I32-NEXT: [[TMP69:%.*]] = insertelement <8 x ptr> poison, ptr [[TMP54]], i32 0
698+ ; I32-NEXT: [[TMP70:%.*]] = insertelement <8 x ptr> [[TMP69]], ptr [[TMP56]], i32 1
699+ ; I32-NEXT: [[TMP71:%.*]] = insertelement <8 x ptr> [[TMP70]], ptr [[TMP58]], i32 2
700+ ; I32-NEXT: [[TMP72:%.*]] = insertelement <8 x ptr> [[TMP71]], ptr [[TMP60]], i32 3
701+ ; I32-NEXT: [[TMP73:%.*]] = insertelement <8 x ptr> [[TMP72]], ptr [[TMP62]], i32 4
702+ ; I32-NEXT: [[TMP74:%.*]] = insertelement <8 x ptr> [[TMP73]], ptr [[TMP64]], i32 5
703+ ; I32-NEXT: [[TMP75:%.*]] = insertelement <8 x ptr> [[TMP74]], ptr [[TMP66]], i32 6
704+ ; I32-NEXT: [[TMP76:%.*]] = insertelement <8 x ptr> [[TMP75]], ptr [[TMP68]], i32 7
705+ ; I32-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP51]], <8 x ptr> [[TMP76]], <8 x ptr> [[BROADCAST_SPLAT2]]
706+ ; I32-NEXT: [[TMP77:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 0
707+ ; I32-NEXT: [[TMP78:%.*]] = load float, ptr [[TMP77]], align 4
708+ ; I32-NEXT: [[TMP79:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 1
709+ ; I32-NEXT: [[TMP80:%.*]] = load float, ptr [[TMP79]], align 4
710+ ; I32-NEXT: [[TMP81:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 2
711+ ; I32-NEXT: [[TMP82:%.*]] = load float, ptr [[TMP81]], align 4
712+ ; I32-NEXT: [[TMP83:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 3
713+ ; I32-NEXT: [[TMP84:%.*]] = load float, ptr [[TMP83]], align 4
714+ ; I32-NEXT: [[TMP85:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 4
715+ ; I32-NEXT: [[TMP86:%.*]] = load float, ptr [[TMP85]], align 4
716+ ; I32-NEXT: [[TMP87:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 5
717+ ; I32-NEXT: [[TMP88:%.*]] = load float, ptr [[TMP87]], align 4
718+ ; I32-NEXT: [[TMP89:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 6
719+ ; I32-NEXT: [[TMP90:%.*]] = load float, ptr [[TMP89]], align 4
720+ ; I32-NEXT: [[TMP91:%.*]] = extractelement <8 x ptr> [[PREDPHI]], i32 7
721+ ; I32-NEXT: [[TMP92:%.*]] = load float, ptr [[TMP91]], align 4
722+ ; I32-NEXT: [[TMP93:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]]
723+ ; I32-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
724+ ; I32-NEXT: [[TMP95:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
725+ ; I32-NEXT: [[TMP96:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
726+ ; I32-NEXT: [[TMP97:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
727+ ; I32-NEXT: [[TMP98:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
728+ ; I32-NEXT: [[TMP99:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP9]]
729+ ; I32-NEXT: [[TMP100:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
730+ ; I32-NEXT: store float [[TMP78]], ptr [[TMP93]], align 4
731+ ; I32-NEXT: store float [[TMP80]], ptr [[TMP94]], align 4
732+ ; I32-NEXT: store float [[TMP82]], ptr [[TMP95]], align 4
733+ ; I32-NEXT: store float [[TMP84]], ptr [[TMP96]], align 4
734+ ; I32-NEXT: store float [[TMP86]], ptr [[TMP97]], align 4
735+ ; I32-NEXT: store float [[TMP88]], ptr [[TMP98]], align 4
736+ ; I32-NEXT: store float [[TMP90]], ptr [[TMP99]], align 4
737+ ; I32-NEXT: store float [[TMP92]], ptr [[TMP100]], align 4
738+ ; I32-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
739+ ; I32-NEXT: [[TMP101:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
740+ ; I32-NEXT: br i1 [[TMP101]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
741+ ; I32: [[MIDDLE_BLOCK]]:
742+ ; I32-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
743+ ; I32-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
744+ ; I32: [[SCALAR_PH]]:
745+ ;
746+ entry:
747+ br label %loop.header
748+
; Loop header: %iv counts up from 0 and %iv.2 counts down from %start. The
; value loaded from %src decides which address the latch will load from.
749+ loop.header:
750+ %iv = phi i64 [ 0 , %entry ], [ %iv.next , %loop.latch ]
751+ %iv.2 = phi i64 [ %start , %entry ], [ %iv.2.next , %loop.latch ]
752+ %iv.1 = add i64 %iv , 1
753+ %gep.src = getelementptr i8 , ptr %src , i64 %iv.1
754+ %l.src = load float , ptr %gep.src , align 4
755+ %c = fcmp oeq float %l.src , 0 .000000e+00
756+ br i1 %c , label %then , label %loop.latch
757+
; Reached only when the loaded value compared equal to 0.0: form the address
; %src.2 + (%iv + 1) * %start. No memory access happens in this block.
758+ then:
759+ %iv.mul = mul i64 %iv.1 , %start
760+ %gep.src.2 = getelementptr i8 , ptr %src.2 , i64 %iv.mul
761+ br label %loop.latch
762+
; %merge.gep is the computed address when coming from %then, or %src.2 itself
; when coming straight from %loop.header; the load below executes on both paths.
; The exit compare reads %iv.2 (the value before this iteration's decrement).
763+ loop.latch:
764+ %merge.gep = phi ptr [ %gep.src.2 , %then ], [ %src.2 , %loop.header ]
765+ %l.2 = load float , ptr %merge.gep , align 4
766+ %gep.dst = getelementptr i8 , ptr %dst , i64 %iv
767+ store float %l.2 , ptr %gep.dst , align 4
768+ %iv.next = add i64 %iv , 1
769+ %iv.2.next = add i64 %iv.2 , -1
770+ %ec = icmp sgt i64 %iv.2 , 100
771+ br i1 %ec , label %loop.header , label %exit
772+
773+ exit:
774+ ret void
775+ }
776+
; NOTE(review): attribute group #0 is spelled out twice in this view with
; different "target-cpu" values (znver3 on the added line, znver2 on the
; pre-existing line). Confirm which CPU the functions tagged #0 are meant to
; use, and whether one of these should be a separate group number.
777+ attributes #0 = { "target-cpu" ="znver3" }
583778attributes #0 = { "target-cpu" ="znver2" }
584779
585780!0 = distinct !{!0 , !1 }
0 commit comments