@@ -805,8 +805,8 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
805805 VPValue *Op,
806806 ScalarEvolution &SE) {
807807 VPValue *Incoming, *Mask;
808- if (!match (Op, m_VPInstruction<VPInstruction::ExtractLane>(
809- m_FirstActiveLane ( m_VPValue (Mask)), m_VPValue (Incoming))))
808+ if (!match (Op, m_ExtractLane ( m_FirstActiveLane ( m_VPValue (Mask)),
809+ m_VPValue (Incoming))))
810810 return nullptr ;
811811
812812 auto *WideIV = getOptimizableIVOf (Incoming, SE);
@@ -1274,8 +1274,7 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
12741274 }
12751275
12761276 // Look through ExtractPenultimateElement (BuildVector ....).
1277- if (match (Def, m_VPInstruction<VPInstruction::ExtractPenultimateElement>(
1278- m_BuildVector ()))) {
1277+ if (match (Def, m_ExtractPenultimateElement (m_BuildVector ()))) {
12791278 auto *BuildVector = cast<VPInstruction>(Def->getOperand (0 ));
12801279 Def->replaceAllUsesWith (
12811280 BuildVector->getOperand (BuildVector->getNumOperands () - 2 ));
@@ -2056,6 +2055,32 @@ bool VPlanTransforms::adjustFixedOrderRecurrences(VPlan &Plan,
20562055 // Set the first operand of RecurSplice to FOR again, after replacing
20572056 // all users.
20582057 RecurSplice->setOperand (0 , FOR);
2058+
2059+ // Check for users extracting at the penultimate active lane of the FOR.
2060+ // If only a single lane is active in the current iteration, we need to
2061+ // select the last element from the previous iteration (from the FOR phi
2062+ // directly).
2063+ for (VPUser *U : RecurSplice->users ()) {
2064+ if (!match (U, m_ExtractLane (m_LastActiveLane (m_VPValue ()),
2065+ m_Specific (RecurSplice))))
2066+ continue ;
2067+
2068+ VPBuilder B (cast<VPInstruction>(U));
2069+ VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand (0 );
2070+ Type *I64Ty = Type::getInt64Ty (Plan.getContext ());
2071+ VPValue *Zero = Plan.getOrAddLiveIn (ConstantInt::get (I64Ty, 0 ));
2072+ VPValue *One = Plan.getOrAddLiveIn (ConstantInt::get (I64Ty, 1 ));
2073+ VPValue *PenultimateIndex =
2074+ B.createNaryOp (Instruction::Sub, {LastActiveLane, One});
2075+ VPValue *PenultimateLastIter =
2076+ B.createNaryOp (VPInstruction::ExtractLane,
2077+ {PenultimateIndex, FOR->getBackedgeValue ()});
2078+ VPValue *LastPrevIter =
2079+ B.createNaryOp (VPInstruction::ExtractLastElement, FOR);
2080+ VPValue *Cmp = B.createICmp (CmpInst::ICMP_EQ, LastActiveLane, Zero);
2081+ VPValue *Sel = B.createSelect (Cmp, LastPrevIter, PenultimateLastIter);
2082+ cast<VPInstruction>(U)->replaceAllUsesWith (Sel);
2083+ }
20592084 }
20602085 return true ;
20612086}
@@ -3445,6 +3470,34 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan) {
34453470 ToRemove.push_back (Expr);
34463471 }
34473472
3473+ // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3474+ auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3475+ if (LastActiveL &&
3476+ LastActiveL->getOpcode () == VPInstruction::LastActiveLane) {
3477+ // Create Not(Mask) for all operands.
3478+ SmallVector<VPValue *, 2 > NotMasks;
3479+ for (VPValue *Op : LastActiveL->operands ()) {
3480+ VPValue *NotMask = Builder.createNot (Op, LastActiveL->getDebugLoc ());
3481+ NotMasks.push_back (NotMask);
3482+ }
3483+
3484+ // Create FirstActiveLane on the inverted masks.
3485+ VPValue *FirstInactiveLane = Builder.createNaryOp (
3486+ VPInstruction::FirstActiveLane, NotMasks,
3487+ LastActiveL->getDebugLoc (), " first.inactive.lane" );
3488+
3489+ // Subtract 1 to get the last active lane.
3490+ VPValue *One = Plan.getOrAddLiveIn (
3491+ ConstantInt::get (Type::getInt64Ty (Plan.getContext ()), 1 ));
3492+ VPValue *LastLane = Builder.createNaryOp (
3493+ Instruction::Sub, {FirstInactiveLane, One},
3494+ LastActiveL->getDebugLoc (), " last.active.lane" );
3495+
3496+ LastActiveL->replaceAllUsesWith (LastLane);
3497+ ToRemove.push_back (LastActiveL);
3498+ continue ;
3499+ }
3500+
34483501 VPValue *VectorStep;
34493502 VPValue *ScalarStep;
34503503 if (!match (&R, m_VPInstruction<VPInstruction::WideIVStep>(
0 commit comments