@@ -393,7 +393,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -409,34 +408,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
@@ -445,8 +438,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -477,29 +468,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -510,8 +497,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -521,10 +506,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -534,24 +519,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                  B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
@@ -591,24 +571,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
@@ -710,10 +684,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -768,13 +740,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
@@ -789,10 +756,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -931,30 +896,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
     // lane) to all other lanes in the wavefront. We use an intrinsic for this,
     // but have to handle 64-bit broadcasts with two calls to this intrinsic.
     Value *BroadcastI = nullptr;
-
-    if (TyBitWidth == 64) {
-      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-      Value *const ExtractHi =
-          B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-      CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-      CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-          Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-      Value *const PartialInsert = B.CreateInsertElement(
-          PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-      Value *const Insert =
-          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-      BroadcastI = B.CreateBitCast(Insert, Ty);
-    } else if (TyBitWidth == 32) {
-      Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-      BroadcastI =
-          B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-      BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
-    } else {
-      llvm_unreachable("Unhandled atomic bit width");
-    }
+    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
 
     // Now that we have the result of our single atomic operation, we need to
     // get our individual lane's slice into the result. We use the lane offset