@@ -386,7 +386,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *V,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -402,34 +401,28 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
 
   // Reduce within each pair of rows (i.e. 32 lanes).
   assert(ST->hasPermLaneX16());
-  V = B.CreateBitCast(V, IntNTy);
   Value *Permlanex16Call = B.CreateIntrinsic(
       V->getType(), Intrinsic::amdgcn_permlanex16,
       {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
-  V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                          B.CreateBitCast(Permlanex16Call, AtomicTy));
+  V = buildNonAtomicBinOp(B, Op, V, Permlanex16Call);
   if (ST->isWave32()) {
     return V;
   }
 
   if (ST->hasPermLane64()) {
     // Reduce across the upper and lower 32 lanes.
-    V = B.CreateBitCast(V, IntNTy);
     Value *Permlane64Call =
         B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
-    return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                               B.CreateBitCast(Permlane64Call, AtomicTy));
+    return buildNonAtomicBinOp(B, Op, V, Permlane64Call);
   }
 
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-  V = B.CreateBitCast(V, IntNTy);
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
-  return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
-                             B.CreateBitCast(Lane32, AtomicTy));
+  return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
 }
 
 // Use the builder to create an inclusive scan of V across the wavefront, with
@@ -438,8 +431,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
                                             AtomicRMWInst::BinOp Op, Value *V,
                                             Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -470,29 +461,25 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
     // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
     // 48..63).
     assert(ST->hasPermLaneX16());
-    V = B.CreateBitCast(V, IntNTy);
     Value *PermX = B.CreateIntrinsic(
         V->getType(), Intrinsic::amdgcn_permlanex16,
         {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
 
-    Value *UpdateDPPCall =
-        B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
-                                 B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
-                                 B.getInt32(0xf), B.getFalse()});
-    V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+    Value *UpdateDPPCall = B.CreateCall(
+        UpdateDPP, {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()});
+    V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
 
     if (!ST->isWave32()) {
       // Combine lane 31 into lanes 32..63.
-      V = B.CreateBitCast(V, IntNTy);
       Value *const Lane31 = B.CreateIntrinsic(
           V->getType(), Intrinsic::amdgcn_readlane, {V, B.getInt32(31)});
 
       Value *UpdateDPPCall = B.CreateCall(
           UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
                       B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
 
-      V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
-                              UpdateDPPCall);
+      V = buildNonAtomicBinOp(B, Op, V, UpdateDPPCall);
     }
   }
   return V;
@@ -503,8 +490,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
 Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                                                   Value *Identity) const {
   Type *AtomicTy = V->getType();
-  Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
-
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
@@ -514,10 +499,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
-    Function *WriteLane = Intrinsic::getDeclaration(
-        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy);
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy);
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
@@ -527,24 +512,19 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
 
     // Copy the old lane 15 to the new lane 16.
-    V = B.CreateCall(
-        WriteLane,
-        {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
-         B.getInt32(16), B.CreateBitCast(V, IntNTy)});
-    V = B.CreateBitCast(V, AtomicTy);
+    V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+                                 B.getInt32(16), V});
+
     if (!ST->isWave32()) {
       // Copy the old lane 31 to the new lane 32.
-      V = B.CreateBitCast(V, IntNTy);
-      V = B.CreateCall(WriteLane,
-                       {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
-                                                B.getInt32(31)}),
-                        B.getInt32(32), V});
+      V = B.CreateCall(
+          WriteLane,
+          {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
 
       // Copy the old lane 47 to the new lane 48.
       V = B.CreateCall(
           WriteLane,
           {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
-      V = B.CreateBitCast(V, AtomicTy);
     }
   }
 
@@ -584,24 +564,18 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
 
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
-  auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  V = B.CreateBitCast(V, IntNTy);
   Value *LaneValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                                        {V, LaneIdxInt});
-  LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_writelane,
-                          {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
-                           B.CreateBitCast(OldValuePhi, IntNTy)});
-    OldValue = B.CreateBitCast(OldValue, Ty);
+    OldValue = B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_writelane,
+                                 {Accumulator, LaneIdxInt, OldValuePhi});
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
 
@@ -700,10 +674,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
 
   Type *const Ty = I.getType();
   Type *Int32Ty = B.getInt32Ty();
-  Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
   bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
   const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
-  auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
 
   // This is the value in the atomic operation we need to combine in order to
   // reduce the number of atomic operations.
@@ -758,13 +730,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   if (ScanImpl == ScanOptions::DPP) {
     // First we need to set all inactive invocations to the identity value, so
     // that they can correctly contribute to the final result.
-    V = B.CreateBitCast(V, IntNTy);
-    Identity = B.CreateBitCast(Identity, IntNTy);
-    NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
-                             {V, Identity});
-    NewV = B.CreateBitCast(NewV, Ty);
-    V = B.CreateBitCast(V, Ty);
-    Identity = B.CreateBitCast(Identity, Ty);
+    NewV =
+        B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
     if (!NeedResult && ST->hasPermLaneX16()) {
       // On GFX10 the permlanex16 instruction helps us build a reduction
       // without too many readlanes and writelanes, which are generally bad
@@ -779,10 +746,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
       // which we will provide to the atomic operation.
       Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
       assert(TyBitWidth == 32);
-      NewV = B.CreateBitCast(NewV, IntNTy);
-      NewV = B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readlane,
+      NewV = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readlane,
                                {NewV, LastLaneIdx});
-      NewV = B.CreateBitCast(NewV, Ty);
     }
     // Finally mark the readlanes in the WWM section.
     NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -922,26 +887,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
   // but have to handle 64-bit broadcasts with two calls to this intrinsic.
   Value *BroadcastI = nullptr;
 
-  if (TyBitWidth == 64) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    Value *const ExtractLo = B.CreateTrunc(CastedPhi, Int32Ty);
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(CastedPhi, 32), Int32Ty);
-    CallInst *const ReadFirstLaneLo = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractLo);
-    CallInst *const ReadFirstLaneHi = B.CreateIntrinsic(
-        Int32Ty, Intrinsic::amdgcn_readfirstlane, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        PoisonValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
-    BroadcastI =
-        B.CreateIntrinsic(IntNTy, Intrinsic::amdgcn_readfirstlane, CastedPhi);
-    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
-
+  if (TyBitWidth == 32 || TyBitWidth == 64) {
+    BroadcastI = B.CreateIntrinsic(Ty, Intrinsic::amdgcn_readfirstlane, PHI);
   } else {
     llvm_unreachable("Unhandled atomic bit width");
   }
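
Note: the simplification above relies on the AMDGPU lane intrinsics (llvm.amdgcn.readlane, writelane, permlanex16, set_inactive, readfirstlane) being type-overloaded, so the builder can instantiate them directly on the atomic value's type instead of round-tripping through an iN bitcast. A minimal sketch of that builder pattern, using the same CreateIntrinsic overload as the diff (the helper name is illustrative and not part of the patch):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/IntrinsicsAMDGPU.h"
    using namespace llvm;

    // Read V back from a given lane without first bitcasting it to an
    // integer type; the overloaded intrinsic is instantiated on V's own
    // type (i32, float, ...), matching the calls in the updated pass.
    static Value *readLaneSameTy(IRBuilder<> &B, Value *V, unsigned Lane) {
      return B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_readlane,
                               {V, B.getInt32(Lane)});
    }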