@@ -395,7 +395,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -411,34 +410,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
@@ -447,8 +440,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -479,29 +470,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -512,8 +499,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                    Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -523,10 +508,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -536,24 +521,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                     B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
@@ -593,24 +573,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
@@ -709,10 +683,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -767,13 +739,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
@@ -788,10 +755,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -930,30 +895,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // lane) to all other lanes in the wavefront. We use an intrinsic for this,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-    CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-    CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    BroadcastI =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
 
   // Now that we have the result of our single atomic operation, we need to
   // get our individual lane's slice into the result. We use the lane offset
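For context: the cleanup works because the cross-lane intrinsics used here (llvm.amdgcn.readlane, writelane, readfirstlane, permlanex16, permlane64, set_inactive) are overloaded on the value type, so a floating-point atomic operand no longer has to round-trip through an i32 bitcast. As a rough sketch (not part of the commit, value names hypothetical) of the IR the optimizer emits for one readlane step on an f32 operand %v:

  ; before: cast to i32, cross-lane read, cast back
  %cast = bitcast float %v to i32
  %lane = call i32 @llvm.amdgcn.readlane(i32 %cast, i32 0)
  %res  = bitcast i32 %lane to float

  ; after: the overloaded intrinsic takes the value type directly
  %res = call float @llvm.amdgcn.readlane.f32(float %v, i32 0)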