@@ -626,26 +626,35 @@ std::vector<Chain> Vectorizer::splitChainByContiguity(Chain &C) {
626626 std::vector<Chain> Ret;
627627 Ret.push_back ({C.front ()});
628628
629+ unsigned ElemBytes = DL.getTypeStoreSize (getChainElemTy (C));
630+ APInt PrevReadEnd = C[0 ].OffsetFromLeader +
631+ DL.getTypeStoreSize (getLoadStoreType (&*C[0 ].Inst ));
629632 for (auto It = std::next (C.begin ()), End = C.end (); It != End; ++It) {
630633 // `prev` accesses offsets [PrevDistFromBase, PrevReadEnd).
631634 auto &CurChain = Ret.back ();
632- const ChainElem &Prev = CurChain.back ();
633- unsigned SzBits = DL.getTypeSizeInBits (getLoadStoreType (&*Prev.Inst ));
634- assert (SzBits % 8 == 0 && " Non-byte sizes should have been filtered out by "
635- " collectEquivalenceClass" );
636- APInt PrevReadEnd = Prev.OffsetFromLeader + SzBits / 8 ;
635+ unsigned SzBytes = DL.getTypeStoreSize (getLoadStoreType (&*It->Inst ));
637636
638637 // Add this instruction to the end of the current chain, or start a new one.
639- bool AreContiguous = It->OffsetFromLeader == PrevReadEnd;
640- LLVM_DEBUG (dbgs () << " LSV: Instructions are "
641- << (AreContiguous ? " " : " not " ) << " contiguous: "
642- << *Prev.Inst << " (ends at offset " << PrevReadEnd
643- << " ) -> " << *It->Inst << " (starts at offset "
638+ assert (SzBytes % ElemBytes == 0 );
639+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
640+ // Allow redundancy: partial or full overlap counts as contiguous.
641+ bool AreContiguous = false ;
642+ if (It->OffsetFromLeader .sle (PrevReadEnd)) {
643+ uint64_t Overlap = (PrevReadEnd - It->OffsetFromLeader ).getZExtValue ();
644+ if (Overlap % ElemBytes == 0 )
645+ AreContiguous = true ;
646+ }
647+
648+ LLVM_DEBUG (dbgs () << " LSV: Instruction is "
649+ << (AreContiguous ? " contiguous" : " chain-breaker" )
650+ << *It->Inst << " (starts at offset "
644651 << It->OffsetFromLeader << " )\n " );
652+
645653 if (AreContiguous)
646654 CurChain.push_back (*It);
647655 else
648656 Ret.push_back ({*It});
657+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd);
649658 }
650659
651660 // Filter out length-1 chains, these are uninteresting.
@@ -874,15 +883,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
874883 Type *VecElemTy = getChainElemTy (C);
875884 bool IsLoadChain = isa<LoadInst>(C[0 ].Inst );
876885 unsigned AS = getLoadStoreAddressSpace (C[0 ].Inst );
877- unsigned ChainBytes = std::accumulate (
878- C.begin (), C.end (), 0u , [&](unsigned Bytes, const ChainElem &E) {
879- return Bytes + DL.getTypeStoreSize (getLoadStoreType (E.Inst ));
880- });
886+ unsigned BytesAdded = DL.getTypeStoreSize (getLoadStoreType (&*C[0 ].Inst ));
887+ APInt PrevReadEnd = C[0 ].OffsetFromLeader + BytesAdded;
888+ unsigned ChainBytes = BytesAdded;
889+ for (auto It = std::next (C.begin ()), End = C.end (); It != End; ++It) {
890+ unsigned SzBytes = DL.getTypeStoreSize (getLoadStoreType (&*It->Inst ));
891+ APInt ReadEnd = It->OffsetFromLeader + SzBytes;
892+ // Update ChainBytes considering possible overlap.
893+ BytesAdded =
894+ PrevReadEnd.sle (ReadEnd) ? (ReadEnd - PrevReadEnd).getSExtValue () : 0 ;
895+ ChainBytes += BytesAdded;
896+ PrevReadEnd = APIntOps::smax (PrevReadEnd, ReadEnd);
897+ }
898+
881899 assert (ChainBytes % DL.getTypeStoreSize (VecElemTy) == 0 );
882900 // VecTy is a power of 2 and 1 byte at smallest, but VecElemTy may be smaller
883901 // than 1 byte (e.g. VecTy == <32 x i1>).
884- Type *VecTy = FixedVectorType::get (
885- VecElemTy, 8 * ChainBytes / DL. getTypeSizeInBits (VecElemTy) );
902+ unsigned NumElem = 8 * ChainBytes / DL. getTypeSizeInBits (VecElemTy);
903+ Type *VecTy = FixedVectorType::get (VecElemTy, NumElem );
886904
887905 Align Alignment = getLoadStoreAlignment (C[0 ].Inst );
888906 // If this is a load/store of an alloca, we might have upgraded the alloca's
@@ -909,27 +927,31 @@ bool Vectorizer::vectorizeChain(Chain &C) {
909927 llvm::min_element (C, [](const auto &A, const auto &B) {
910928 return A.Inst ->comesBefore (B.Inst );
911929 })->Inst );
912-
930+ // This can happen due to a chain of redundant loads.
931+ // In this case, just use the element-type, and avoid ExtractElement.
932+ if (NumElem == 1 )
933+ VecTy = VecElemTy;
913934 // Chain is in offset order, so C[0] is the instr with the lowest offset,
914935 // i.e. the root of the vector.
915936 VecInst = Builder.CreateAlignedLoad (VecTy,
916937 getLoadStorePointerOperand (C[0 ].Inst ),
917938 Alignment);
918939
919- unsigned VecIdx = 0 ;
920940 for (const ChainElem &E : C) {
921941 Instruction *I = E.Inst ;
922942 Value *V;
923943 Type *T = getLoadStoreType (I);
944+ int EOffset = (E.OffsetFromLeader - C[0 ].OffsetFromLeader ).getSExtValue ();
945+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits (VecElemTy);
924946 if (auto *VT = dyn_cast<FixedVectorType>(T)) {
925947 auto Mask = llvm::to_vector<8 >(
926948 llvm::seq<int >(VecIdx, VecIdx + VT->getNumElements ()));
927949 V = Builder.CreateShuffleVector (VecInst, Mask, I->getName ());
928- VecIdx += VT->getNumElements ();
929- } else {
950+ } else if (VecTy != VecElemTy) {
930951 V = Builder.CreateExtractElement (VecInst, Builder.getInt32 (VecIdx),
931952 I->getName ());
932- ++VecIdx;
953+ } else {
954+ V = VecInst;
933955 }
934956 if (V->getType () != I->getType ())
935957 V = Builder.CreateBitOrPointerCast (V, I->getType ());
@@ -964,22 +986,24 @@ bool Vectorizer::vectorizeChain(Chain &C) {
964986
965987 // Build the vector to store.
966988 Value *Vec = PoisonValue::get (VecTy);
967- unsigned VecIdx = 0 ;
968- auto InsertElem = [&](Value *V) {
989+ auto InsertElem = [&](Value *V, unsigned VecIdx) {
969990 if (V->getType () != VecElemTy)
970991 V = Builder.CreateBitOrPointerCast (V, VecElemTy);
971- Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (VecIdx++ ));
992+ Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (VecIdx));
972993 };
973994 for (const ChainElem &E : C) {
974995 auto *I = cast<StoreInst>(E.Inst );
996+ int EOffset = (E.OffsetFromLeader - C[0 ].OffsetFromLeader ).getSExtValue ();
997+ int VecIdx = 8 * EOffset / DL.getTypeSizeInBits (VecElemTy);
975998 if (FixedVectorType *VT =
976999 dyn_cast<FixedVectorType>(getLoadStoreType (I))) {
9771000 for (int J = 0 , JE = VT->getNumElements (); J < JE; ++J) {
9781001 InsertElem (Builder.CreateExtractElement (I->getValueOperand (),
979- Builder.getInt32 (J)));
1002+ Builder.getInt32 (J)),
1003+ VecIdx++);
9801004 }
9811005 } else {
982- InsertElem (I->getValueOperand ());
1006+ InsertElem (I->getValueOperand (), VecIdx );
9831007 }
9841008 }
9851009
0 commit comments