diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 27b5a0d37b679..e733f680dc345 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -15137,7 +15137,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { return foldedExt; } else if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { + TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index bb4bb1195f78b..997c33f1f6a76 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -702,57 +702,66 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // intrinsics. setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - // Turn FP extload into load/fpextend - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand); - 
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand); - setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand); - // Turn FP truncstore into trunc + store. - // FIXME: vector types should also be expanded - setTruncStoreAction(MVT::f32, MVT::f16, Expand); - setTruncStoreAction(MVT::f64, MVT::f16, Expand); - setTruncStoreAction(MVT::f32, MVT::bf16, Expand); - setTruncStoreAction(MVT::f64, MVT::bf16, Expand); - setTruncStoreAction(MVT::f64, MVT::f32, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand); - setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand); + // FP extload/truncstore is not legal in PTX. We need to expand all these. + for (auto FloatVTs : + {MVT::fp_valuetypes(), MVT::fp_fixedlen_vector_valuetypes()}) { + for (MVT ValVT : FloatVTs) { + for (MVT MemVT : FloatVTs) { + setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand); + setTruncStoreAction(ValVT, MemVT, Expand); + } + } + } - // PTX does not support load / store predicate registers - setOperationAction(ISD::LOAD, MVT::i1, Custom); - setOperationAction(ISD::STORE, MVT::i1, Custom); + // To improve CodeGen we'll legalize any-extend loads to zext loads. This is + // how they'll be lowered in ISel anyway, and by doing this a little earlier + // we allow for more DAG combine opportunities. 
+ for (auto IntVTs : + {MVT::integer_valuetypes(), MVT::integer_fixedlen_vector_valuetypes()}) + for (MVT ValVT : IntVTs) + for (MVT MemVT : IntVTs) + if (isTypeLegal(ValVT)) + setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom); + // PTX does not support load / store predicate registers + setOperationAction({ISD::LOAD, ISD::STORE}, MVT::i1, Custom); for (MVT VT : MVT::integer_valuetypes()) { - setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote); - setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote); + setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MVT::i1, + Promote); setTruncStoreAction(VT, MVT::i1, Expand); } + // Disable generations of extload/truncstore for v2i16/v2i8. The generic + // expansion for these nodes when they are unaligned is incorrect if the + // type is a vector. + // + // TODO: Fix the generic expansion for these nodes found in + // TargetLowering::expandUnalignedLoad/Store. + setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, + MVT::v2i8, Expand); + setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); + + // Register custom handling for illegal type loads/stores. We'll try to custom + // lower almost all illegal types and logic in the lowering will discard cases + // we can't handle. + setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::f128}, Custom); + for (MVT VT : MVT::fixedlen_vector_valuetypes()) + if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256) + setOperationAction({ISD::STORE, ISD::LOAD}, VT, Custom); + + // Custom legalization for LDU intrinsics. + // TODO: The logic to lower these is not very robust and we should rewrite it. + // Perhaps LDU should not be represented as an intrinsic at all. 
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + for (MVT VT : MVT::fixedlen_vector_valuetypes()) + if (IsPTXVectorType(VT)) + setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom); + setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE, ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT, ISD::SETGE, ISD::SETLE}, MVT::i1, Expand); - // expand extload of vector of integers. - setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16, - MVT::v2i8, Expand); - setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); - // This is legal in NVPTX setOperationAction(ISD::ConstantFP, MVT::f64, Legal); setOperationAction(ISD::ConstantFP, MVT::f32, Legal); @@ -767,24 +776,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, // DEBUGTRAP can be lowered to PTX brkpt setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); - // Register custom handling for vector loads/stores - for (MVT VT : MVT::fixedlen_vector_valuetypes()) - if (IsPTXVectorType(VT)) - setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, VT, - Custom); - - setOperationAction({ISD::LOAD, ISD::STORE, ISD::INTRINSIC_W_CHAIN}, - {MVT::i128, MVT::f128}, Custom); - // Support varargs. 
setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); - // Custom handling for i8 intrinsics - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - setOperationAction({ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}, {MVT::i16, MVT::i32, MVT::i64}, Legal); @@ -3092,39 +3089,14 @@ static void replaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl &Results, const NVPTXSubtarget &STI); -SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType() == MVT::i1) - return LowerLOADi1(Op, DAG); - - EVT VT = Op.getValueType(); - - if (NVPTX::isPackedVectorTy(VT)) { - // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to - // handle unaligned loads and have to handle it here. - LoadSDNode *Load = cast(Op); - EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, SDLoc(Op)); - } - } - - return SDValue(); -} - // v = ld i1* addr // => // v1 = ld i8* addr (-> i16) // v = trunc i16 to i1 -SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { - SDNode *Node = Op.getNode(); - LoadSDNode *LD = cast(Node); - SDLoc dl(Node); +static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG) { + SDLoc dl(LD); assert(LD->getExtensionType() == ISD::NON_EXTLOAD); - assert(Node->getValueType(0) == MVT::i1 && - "Custom lowering for i1 load only"); + assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), MVT::i8, LD->getAlign(), @@ -3133,8 +3105,27 @@ SDValue 
NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { // The legalizer (the caller) is expecting two values from the legalized // load, so we build a MergeValues node for it. See ExpandUnalignedLoad() // in LegalizeDAG.cpp which also uses MergeValues. - SDValue Ops[] = { result, LD->getChain() }; - return DAG.getMergeValues(Ops, dl); + return DAG.getMergeValues({result, LD->getChain()}, dl); +} + +SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + LoadSDNode *LD = cast(Op); + + if (Op.getValueType() == MVT::i1) + return lowerLOADi1(LD, DAG); + + // To improve CodeGen we'll legalize any-extend loads to zext loads. This is + // how they'll be lowered in ISel anyway, and by doing this a little earlier + // we allow for more DAG combine opportunities. + if (LD->getExtensionType() == ISD::EXTLOAD) { + assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() && + "Unexpected fpext-load"); + return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(), + LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(), + LD->getMemOperand()); + } + + llvm_unreachable("Unexpected custom lowering for load"); } SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { @@ -3144,17 +3135,6 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if (VT == MVT::i1) return LowerSTOREi1(Op, DAG); - // v2f32/v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to - // handle unaligned stores and have to handle it here. - if (NVPTX::isPackedVectorTy(VT) && - !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) - return expandUnalignedStore(Store, DAG); - - // v2f16/v2bf16/v2i16 don't need special handling. - if (NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()) - return SDValue(); - // Lower store of any other vector type, including v2f32 as we want to break // it apart since this is not a widely-supported type. 
return LowerSTOREVector(Op, DAG); @@ -4010,14 +3990,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( case Intrinsic::nvvm_ldu_global_i: case Intrinsic::nvvm_ldu_global_f: case Intrinsic::nvvm_ldu_global_p: { - auto &DL = I.getDataLayout(); Info.opc = ISD::INTRINSIC_W_CHAIN; - if (Intrinsic == Intrinsic::nvvm_ldu_global_i) - Info.memVT = getValueType(DL, I.getType()); - else if(Intrinsic == Intrinsic::nvvm_ldu_global_p) - Info.memVT = getPointerTy(DL); - else - Info.memVT = getValueType(DL, I.getType()); + Info.memVT = getValueType(I.getDataLayout(), I.getType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 27f099e220976..e7f1a4b4c98c4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -309,8 +309,6 @@ class NVPTXTargetLowering : public TargetLowering { SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/Mips/implicit-sret.ll b/llvm/test/CodeGen/Mips/implicit-sret.ll index c8400abacaf8c..88e5119c5ed5b 100644 --- a/llvm/test/CodeGen/Mips/implicit-sret.ll +++ b/llvm/test/CodeGen/Mips/implicit-sret.ll @@ -19,9 +19,7 @@ define internal void @test() unnamed_addr nounwind { ; CHECK-NEXT: ld $6, 24($sp) ; CHECK-NEXT: ld $5, 16($sp) ; CHECK-NEXT: ld $7, 32($sp) -; CHECK-NEXT: lw $1, 0($sp) -; CHECK-NEXT: # implicit-def: $a0_64 -; CHECK-NEXT: move $4, $1 +; CHECK-NEXT: lw $4, 0($sp) ; CHECK-NEXT: jal use_sret ; CHECK-NEXT: nop ; CHECK-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload @@ -64,15 +62,9 @@ define internal void @test2() unnamed_addr 
nounwind { ; CHECK-NEXT: daddiu $4, $sp, 0 ; CHECK-NEXT: jal implicit_sret_decl2 ; CHECK-NEXT: nop -; CHECK-NEXT: lw $1, 20($sp) -; CHECK-NEXT: lw $2, 12($sp) -; CHECK-NEXT: lw $3, 4($sp) -; CHECK-NEXT: # implicit-def: $a0_64 -; CHECK-NEXT: move $4, $3 -; CHECK-NEXT: # implicit-def: $a1_64 -; CHECK-NEXT: move $5, $2 -; CHECK-NEXT: # implicit-def: $a2_64 -; CHECK-NEXT: move $6, $1 +; CHECK-NEXT: lw $6, 20($sp) +; CHECK-NEXT: lw $5, 12($sp) +; CHECK-NEXT: lw $4, 4($sp) ; CHECK-NEXT: jal use_sret2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld $ra, 24($sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/msa/basic_operations.ll b/llvm/test/CodeGen/Mips/msa/basic_operations.ll index 4fc3f57aa002d..c3889372b322e 100644 --- a/llvm/test/CodeGen/Mips/msa/basic_operations.ll +++ b/llvm/test/CodeGen/Mips/msa/basic_operations.ll @@ -1904,7 +1904,7 @@ define void @insert_v16i8_vidx(i32 signext %a) nounwind { ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v16i8_vidx))) ; N64-NEXT: ld $2, %got_disp(i32)($1) -; N64-NEXT: lw $2, 0($2) +; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: andi $2, $2, 15 ; N64-NEXT: ld $1, %got_disp(v16i8)($1) ; N64-NEXT: daddu $1, $1, $2 @@ -1953,7 +1953,7 @@ define void @insert_v8i16_vidx(i32 signext %a) nounwind { ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v8i16_vidx))) ; N64-NEXT: ld $2, %got_disp(i32)($1) -; N64-NEXT: lw $2, 0($2) +; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: andi $2, $2, 7 ; N64-NEXT: ld $1, %got_disp(v8i16)($1) ; N64-NEXT: dlsa $1, $2, $1, 1 @@ -2002,7 +2002,7 @@ define void @insert_v4i32_vidx(i32 signext %a) nounwind { ; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v4i32_vidx))) ; N64-NEXT: ld $2, %got_disp(i32)($1) -; N64-NEXT: lw $2, 0($2) +; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: andi $2, $2, 3 ; N64-NEXT: ld $1, %got_disp(v4i32)($1) ; N64-NEXT: dlsa $1, $2, $1, 2 @@ -2053,7 +2053,7 @@ define void @insert_v2i64_vidx(i64 signext %a) nounwind { 
; N64-NEXT: daddu $1, $1, $25 ; N64-NEXT: daddiu $1, $1, %lo(%neg(%gp_rel(insert_v2i64_vidx))) ; N64-NEXT: ld $2, %got_disp(i32)($1) -; N64-NEXT: lw $2, 0($2) +; N64-NEXT: lwu $2, 0($2) ; N64-NEXT: andi $2, $2, 1 ; N64-NEXT: ld $1, %got_disp(v2i64)($1) ; N64-NEXT: dlsa $1, $2, $1, 3 diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index bd4c7775354ae..6c4ae1937e158 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -711,11 +711,11 @@ define <2 x bfloat> @test_copysign(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [test_copysign_param_1]; -; CHECK-NEXT: and.b32 %r3, %r2, -2147450880; -; CHECK-NEXT: and.b32 %r4, %r1, 2147450879; -; CHECK-NEXT: or.b32 %r5, %r4, %r3; +; CHECK-NEXT: ld.param.b32 %r1, [test_copysign_param_1]; +; CHECK-NEXT: and.b32 %r2, %r1, -2147450880; +; CHECK-NEXT: ld.param.b32 %r3, [test_copysign_param_0]; +; CHECK-NEXT: and.b32 %r4, %r3, 2147450879; +; CHECK-NEXT: or.b32 %r5, %r4, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-NEXT: ret; %r = call <2 x bfloat> @llvm.copysign.f16(<2 x bfloat> %a, <2 x bfloat> %b) diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 6e480996e7e6a..9717efb960f18 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -7,7 +7,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM60-NEXT: shl.b32 %r11, 
%r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB0_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB0_1; ; SM60-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r12; @@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, 
[%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB1_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB1_1; ; SM60-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -98,7 +96,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -114,23 +112,22 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 
%r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB2_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB2_1; ; SM60-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -145,7 +142,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -160,23 +157,22 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: 
setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB3_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB3_1; ; SM60-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -191,7 +187,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -206,23 +202,22 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB4_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: 
setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB4_1; ; SM60-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -237,7 +232,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -253,23 +248,22 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB5_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB5_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB5_1; ; SM60-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -284,7 +278,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; 
SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -300,23 +294,22 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB6_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB6_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB6_1; ; SM60-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM60-NEXT: st.param.b32 [func_retval0], %r12; @@ -330,7 +323,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -346,23 +339,22 @@ define i8 @release_acquire_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB7_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB7_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB7_1; ; SM60-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -377,7 +369,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -393,23 +385,22 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, 
%r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB8_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB8_1; ; SM60-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -424,7 +415,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -440,23 +431,22 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: 
or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB9_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB9_1; ; SM60-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -471,7 +461,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -487,23 +477,22 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 
%r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB10_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB10_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB10_1; ; SM60-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -518,7 +507,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -534,23 +523,22 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB11_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB11_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; 
SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB11_1; ; SM60-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -565,7 +553,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -581,23 +569,22 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB12_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB12_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB12_1; ; SM60-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -612,7 +599,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr 
addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -628,23 +615,22 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB13_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB13_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB13_1; ; SM60-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -659,7 +645,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -675,23 +661,22 @@ define i8 
@seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB14_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB14_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB14_1; ; SM60-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -1899,7 +1884,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -1915,23 +1900,22 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 
%r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.global.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.global.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB60_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB60_1; ; SM60-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.sys; @@ -1997,7 +1981,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -2013,23 +1997,22 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: 
Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: atom.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB64_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB64_1; ; SM60-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; @@ -2044,7 +2027,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60: { ; SM60-NEXT: .reg .pred %p<3>; ; SM60-NEXT: .reg .b16 %rs<2>; -; SM60-NEXT: .reg .b32 %r<18>; +; SM60-NEXT: .reg .b32 %r<17>; ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: @@ -2060,23 +2043,22 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: shl.b32 %r11, %r10, %r1; ; SM60-NEXT: not.b32 %r2, %r11; ; SM60-NEXT: cvt.u32.u16 %r12, %rs1; -; SM60-NEXT: and.b32 %r13, %r12, 255; -; SM60-NEXT: shl.b32 %r3, %r13, %r1; +; SM60-NEXT: shl.b32 %r3, %r12, %r1; ; SM60-NEXT: shl.b32 %r4, %r7, %r1; -; SM60-NEXT: ld.shared.b32 %r14, [%rd1]; -; SM60-NEXT: and.b32 %r17, %r14, %r2; +; SM60-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM60-NEXT: and.b32 %r16, %r13, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 -; SM60-NEXT: or.b32 %r15, %r17, %r3; -; SM60-NEXT: or.b32 %r16, %r17, %r4; -; SM60-NEXT: atom.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; -; SM60-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM60-NEXT: or.b32 %r14, %r16, %r3; +; SM60-NEXT: or.b32 %r15, %r16, %r4; +; SM60-NEXT: 
atom.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM60-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM60-NEXT: @%p1 bra $L__BB65_3; ; SM60-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM60-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM60-NEXT: and.b32 %r6, %r5, %r2; -; SM60-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM60-NEXT: mov.b32 %r17, %r6; +; SM60-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM60-NEXT: mov.b32 %r16, %r6; ; SM60-NEXT: @%p2 bra $L__BB65_1; ; SM60-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM60-NEXT: membar.cta; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index 065b89c7ebf74..2cadd7d65c085 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -7,7 +7,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, 
%r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, 
%r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -98,7 +96,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -114,23 +112,22 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -145,7 +142,7 @@ define i8 
@acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -160,23 +157,22 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -191,7 +187,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; 
SM70-NEXT: // %bb.0: @@ -206,23 +202,22 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -237,7 +232,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -253,23 +248,22 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, 
%r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB5_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB5_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB5_1; ; SM70-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -284,7 +278,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -300,23 +294,22 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: 
and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB6_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB6_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB6_1; ; SM70-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -330,7 +323,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -346,23 +339,22 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 
%r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB7_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB7_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB7_1; ; SM70-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -377,7 +369,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -393,23 +385,22 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; 
SM70-NEXT: @%p1 bra $L__BB8_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB8_1; ; SM70-NEXT: $L__BB8_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -424,7 +415,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -440,23 +431,22 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB9_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: 
setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB9_1; ; SM70-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -471,7 +461,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -487,23 +477,22 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB10_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB10_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB10_1; ; SM70-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -518,7 +507,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) 
%addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -534,23 +523,22 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB11_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB11_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB11_1; ; SM70-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -565,7 +553,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -581,23 +569,22 @@ define 
i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB12_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB12_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB12_1; ; SM70-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -612,7 +599,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -628,23 +615,22 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; 
+; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB13_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB13_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB13_1; ; SM70-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -659,7 +645,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -675,23 +661,22 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: 
$L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB14_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB14_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB14_1; ; SM70-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -1899,7 +1884,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -1915,23 +1900,22 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.global.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.global.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: 
setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB60_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB60_1; ; SM70-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -1997,7 +1981,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -2013,23 +1997,22 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB64_3; ; SM70-NEXT: // %bb.2: // 
%partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB64_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB64_1; ; SM70-NEXT: $L__BB64_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; @@ -2044,7 +2027,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -2060,23 +2043,22 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.shared.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB65_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, 
%r6; ; SM70-NEXT: @%p2 bra $L__BB65_1; ; SM70-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.cta; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index e4433570bdd70..adcf5da5a6e3a 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -7,7 +7,7 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -22,23 +22,22 @@ define i8 @monotonic_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB0_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB0_1; 
; SM90-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r12; @@ -52,7 +51,7 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -67,23 +66,22 @@ define i8 @monotonic_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB1_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB1_1; ; SM90-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -98,7 +96,7 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 
%rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -114,23 +112,22 @@ define i8 @monotonic_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB2_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB2_1; ; SM90-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -145,7 +142,7 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -160,23 +157,22 @@ define i8 @acquire_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: 
shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB3_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB3_1; ; SM90-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -191,7 +187,7 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -206,23 +202,22 @@ define i8 @acquire_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: 
ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB4_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB4_1; ; SM90-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -237,7 +232,7 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -253,23 +248,22 @@ define i8 @acquire_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: 
or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB5_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB5_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB5_1; ; SM90-NEXT: $L__BB5_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -284,7 +278,7 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -300,23 +294,22 @@ define i8 @release_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; 
SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB6_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB6_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB6_1; ; SM90-NEXT: $L__BB6_3: // %partword.cmpxchg.end ; SM90-NEXT: st.param.b32 [func_retval0], %r12; @@ -330,7 +323,7 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -346,23 +339,22 @@ define i8 @release_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB7_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB7_1 Depth=1 ; 
SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB7_1; ; SM90-NEXT: $L__BB7_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -377,7 +369,7 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -393,23 +385,22 @@ define i8 @release_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB8_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB8_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB8_1; ; SM90-NEXT: $L__BB8_3: // 
%partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -424,7 +415,7 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -440,23 +431,22 @@ define i8 @acq_rel_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB9_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB9_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB9_1; ; SM90-NEXT: $L__BB9_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -471,7 +461,7 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; 
SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -487,23 +477,22 @@ define i8 @acq_rel_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB10_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB10_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB10_1; ; SM90-NEXT: $L__BB10_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -518,7 +507,7 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -534,23 +523,22 @@ define i8 @acq_rel_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: 
not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB11_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB11_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB11_1; ; SM90-NEXT: $L__BB11_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -565,7 +553,7 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -581,23 +569,22 @@ define i8 @seq_cst_monotonic_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 % ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; 
SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB12_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB12_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB12_1; ; SM90-NEXT: $L__BB12_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -612,7 +599,7 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -628,23 +615,22 @@ define i8 @seq_cst_acquire_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; 
-; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB13_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB13_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB13_1; ; SM90-NEXT: $L__BB13_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -659,7 +645,7 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -675,23 +661,22 @@ define i8 @seq_cst_seq_cst_i8_global_cta(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: 
atom.relaxed.cta.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB14_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB14_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB14_1; ; SM90-NEXT: $L__BB14_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -1899,7 +1884,7 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -1915,23 +1900,22 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.global.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.global.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.sys.global.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB60_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB60_1 Depth=1 ; SM90-NEXT: and.b32 
%r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB60_1; ; SM90-NEXT: $L__BB60_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.sys; @@ -2014,7 +1998,7 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -2030,23 +2014,22 @@ define i8 @acq_rel_acquire_i8_generic_cta(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB65_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB65_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB65_1; ; SM90-NEXT: $L__BB65_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; @@ -2061,7 
+2044,7 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90: { ; SM90-NEXT: .reg .pred %p<3>; ; SM90-NEXT: .reg .b16 %rs<2>; -; SM90-NEXT: .reg .b32 %r<18>; +; SM90-NEXT: .reg .b32 %r<17>; ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: @@ -2077,23 +2060,22 @@ define i8 @acq_rel_acquire_i8_shared_cta(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: shl.b32 %r11, %r10, %r1; ; SM90-NEXT: not.b32 %r2, %r11; ; SM90-NEXT: cvt.u32.u16 %r12, %rs1; -; SM90-NEXT: and.b32 %r13, %r12, 255; -; SM90-NEXT: shl.b32 %r3, %r13, %r1; +; SM90-NEXT: shl.b32 %r3, %r12, %r1; ; SM90-NEXT: shl.b32 %r4, %r7, %r1; -; SM90-NEXT: ld.shared.b32 %r14, [%rd1]; -; SM90-NEXT: and.b32 %r17, %r14, %r2; +; SM90-NEXT: ld.shared.b32 %r13, [%rd1]; +; SM90-NEXT: and.b32 %r16, %r13, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 -; SM90-NEXT: or.b32 %r15, %r17, %r3; -; SM90-NEXT: or.b32 %r16, %r17, %r4; -; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r16, %r15; -; SM90-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM90-NEXT: or.b32 %r14, %r16, %r3; +; SM90-NEXT: or.b32 %r15, %r16, %r4; +; SM90-NEXT: atom.relaxed.cta.shared.cas.b32 %r5, [%rd1], %r15, %r14; +; SM90-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM90-NEXT: @%p1 bra $L__BB66_3; ; SM90-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM90-NEXT: // in Loop: Header=BB66_1 Depth=1 ; SM90-NEXT: and.b32 %r6, %r5, %r2; -; SM90-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM90-NEXT: mov.b32 %r17, %r6; +; SM90-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM90-NEXT: mov.b32 %r16, %r6; ; SM90-NEXT: @%p2 bra $L__BB66_1; ; SM90-NEXT: $L__BB66_3: // %partword.cmpxchg.end ; SM90-NEXT: fence.acquire.cta; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index 997df7a8ad8b8..edf553e427f55 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -14,7 +14,7 @@ define i8 @relaxed_sys_i8(ptr %addr, 
i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -29,23 +29,22 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB0_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB0_1; ; SM30-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM30-NEXT: st.param.b32 [func_retval0], %r12; @@ -55,7 +54,7 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -70,23 +69,22 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; 
SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB0_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB0_1; ; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -140,7 +138,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -155,23 +153,22 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, 
[%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB1_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB1_1; ; SM30-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; @@ -182,7 +179,7 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -197,23 +194,22 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; 
SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB1_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB1_1; ; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -269,7 +265,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -285,23 +281,22 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB2_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; 
SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB2_1; ; SM30-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM30-NEXT: st.param.b32 [func_retval0], %r12; @@ -311,7 +306,7 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -327,23 +322,22 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB2_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB2_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB2_1; ; SM70-NEXT: $L__BB2_3: // %partword.cmpxchg.end ; SM70-NEXT: st.param.b32 [func_retval0], %r12; @@ -398,7 +392,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { 
; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -414,23 +408,22 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB3_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB3_1; ; SM30-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; @@ -441,7 +434,7 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -457,23 +450,22 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 
%r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB3_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB3_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB3_1; ; SM70-NEXT: $L__BB3_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; @@ -530,7 +522,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30: { ; SM30-NEXT: .reg .pred %p<3>; ; SM30-NEXT: .reg .b16 %rs<2>; -; SM30-NEXT: .reg .b32 %r<18>; +; SM30-NEXT: .reg .b32 %r<17>; ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: @@ -546,23 +538,22 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: shl.b32 %r11, %r10, %r1; ; SM30-NEXT: not.b32 %r2, %r11; ; SM30-NEXT: cvt.u32.u16 %r12, %rs1; -; SM30-NEXT: and.b32 %r13, %r12, 255; -; SM30-NEXT: shl.b32 %r3, %r13, %r1; +; SM30-NEXT: shl.b32 %r3, %r12, %r1; ; SM30-NEXT: shl.b32 %r4, %r7, %r1; -; SM30-NEXT: ld.b32 %r14, [%rd1]; -; SM30-NEXT: and.b32 %r17, %r14, %r2; +; SM30-NEXT: ld.b32 %r13, [%rd1]; +; SM30-NEXT: and.b32 %r16, %r13, %r2; ; SM30-NEXT: $L__BB4_1: // 
%partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 -; SM30-NEXT: or.b32 %r15, %r17, %r3; -; SM30-NEXT: or.b32 %r16, %r17, %r4; -; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r15; -; SM30-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM30-NEXT: or.b32 %r14, %r16, %r3; +; SM30-NEXT: or.b32 %r15, %r16, %r4; +; SM30-NEXT: atom.cas.b32 %r5, [%rd1], %r15, %r14; +; SM30-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM30-NEXT: @%p1 bra $L__BB4_3; ; SM30-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM30-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM30-NEXT: and.b32 %r6, %r5, %r2; -; SM30-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM30-NEXT: mov.b32 %r17, %r6; +; SM30-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM30-NEXT: mov.b32 %r16, %r6; ; SM30-NEXT: @%p2 bra $L__BB4_1; ; SM30-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM30-NEXT: membar.sys; @@ -573,7 +564,7 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70: { ; SM70-NEXT: .reg .pred %p<3>; ; SM70-NEXT: .reg .b16 %rs<2>; -; SM70-NEXT: .reg .b32 %r<18>; +; SM70-NEXT: .reg .b32 %r<17>; ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: @@ -589,23 +580,22 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: shl.b32 %r11, %r10, %r1; ; SM70-NEXT: not.b32 %r2, %r11; ; SM70-NEXT: cvt.u32.u16 %r12, %rs1; -; SM70-NEXT: and.b32 %r13, %r12, 255; -; SM70-NEXT: shl.b32 %r3, %r13, %r1; +; SM70-NEXT: shl.b32 %r3, %r12, %r1; ; SM70-NEXT: shl.b32 %r4, %r7, %r1; -; SM70-NEXT: ld.b32 %r14, [%rd1]; -; SM70-NEXT: and.b32 %r17, %r14, %r2; +; SM70-NEXT: ld.b32 %r13, [%rd1]; +; SM70-NEXT: and.b32 %r16, %r13, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 -; SM70-NEXT: or.b32 %r15, %r17, %r3; -; SM70-NEXT: or.b32 %r16, %r17, %r4; -; SM70-NEXT: atom.relaxed.sys.cas.b32 %r5, [%rd1], %r16, %r15; -; SM70-NEXT: setp.eq.b32 %p1, %r5, %r16; +; SM70-NEXT: or.b32 %r14, %r16, %r3; +; SM70-NEXT: or.b32 %r15, %r16, %r4; +; SM70-NEXT: atom.relaxed.sys.cas.b32 
%r5, [%rd1], %r15, %r14; +; SM70-NEXT: setp.eq.b32 %p1, %r5, %r15; ; SM70-NEXT: @%p1 bra $L__BB4_3; ; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure ; SM70-NEXT: // in Loop: Header=BB4_1 Depth=1 ; SM70-NEXT: and.b32 %r6, %r5, %r2; -; SM70-NEXT: setp.ne.b32 %p2, %r17, %r6; -; SM70-NEXT: mov.b32 %r17, %r6; +; SM70-NEXT: setp.ne.b32 %p2, %r16, %r6; +; SM70-NEXT: mov.b32 %r16, %r6; ; SM70-NEXT: @%p2 bra $L__BB4_1; ; SM70-NEXT: $L__BB4_3: // %partword.cmpxchg.end ; SM70-NEXT: fence.acq_rel.sys; diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index f4053d84593a5..db19495b5a4ba 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -132,5 +132,120 @@ define <2 x float> @test_uitofp_2xi8(<2 x i8> %a) { %1 = uitofp <2 x i8> %a to <2 x float> ret <2 x float> %1 } + +define void @test_store_i8x2_unaligned(ptr %ptr, <2 x i8> %a) { +; O0-LABEL: test_store_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b32 %r<2>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_param_0]; +; O0-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_store_i8x2_unaligned_param_1]; +; O0-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; O0-NEXT: st.b8 [%rd1+1], %rs2; +; O0-NEXT: st.b8 [%rd1], %rs1; +; O0-NEXT: ret; +; +; O3-LABEL: test_store_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_param_0]; +; O3-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [test_store_i8x2_unaligned_param_1]; +; O3-NEXT: st.b8 [%rd1+1], %rs2; +; O3-NEXT: st.b8 [%rd1], %rs1; +; O3-NEXT: ret; + store <2 x i8> %a, ptr %ptr, align 1 + ret void +} + +define void @test_store_i8x2_unaligned_immediate(ptr %ptr) { +; O0-LABEL: test_store_i8x2_unaligned_immediate( +; O0: { +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // 
%bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_immediate_param_0]; +; O0-NEXT: st.b8 [%rd1+1], 2; +; O0-NEXT: st.b8 [%rd1], 1; +; O0-NEXT: ret; +; +; O3-LABEL: test_store_i8x2_unaligned_immediate( +; O3: { +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_store_i8x2_unaligned_immediate_param_0]; +; O3-NEXT: st.b8 [%rd1+1], 2; +; O3-NEXT: st.b8 [%rd1], 1; +; O3-NEXT: ret; + store <2 x i8> <i8 1, i8 2>, ptr %ptr, align 1 + ret void +} + +define i32 @test_zext_load_i8x2_unaligned(ptr %ptr) { +; O0-LABEL: test_zext_load_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_zext_load_i8x2_unaligned_param_0]; +; O0-NEXT: ld.b8 %rs1, [%rd1+1]; +; O0-NEXT: ld.b8 %rs2, [%rd1]; +; O0-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_zext_load_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 %rd1, [test_zext_load_i8x2_unaligned_param_0]; +; O3-NEXT: ld.b8 %rs1, [%rd1+1]; +; O3-NEXT: ld.b8 %rs2, [%rd1]; +; O3-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O3-NEXT: ret; + %a = load <2 x i8>, ptr %ptr, align 1 + %b = zext <2 x i8> %a to <2 x i16> + %c = bitcast <2 x i16> %b to i32 + ret i32 %c +} + +define i32 @test_sext_load_i8x2_unaligned(ptr %ptr) { +; O0-LABEL: test_sext_load_i8x2_unaligned( +; O0: { +; O0-NEXT: .reg .b16 %rs<3>; +; O0-NEXT: .reg .b64 %rd<2>; +; O0-EMPTY: +; O0-NEXT: // %bb.0: +; O0-NEXT: ld.param.b64 %rd1, [test_sext_load_i8x2_unaligned_param_0]; +; O0-NEXT: ld.s8 %rs1, [%rd1+1]; +; O0-NEXT: ld.s8 %rs2, [%rd1]; +; O0-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O0-NEXT: ret; +; +; O3-LABEL: test_sext_load_i8x2_unaligned( +; O3: { +; O3-NEXT: .reg .b16 %rs<3>; +; O3-NEXT: .reg .b64 %rd<2>; +; O3-EMPTY: +; O3-NEXT: // %bb.0: +; O3-NEXT: ld.param.b64 
%rd1, [test_sext_load_i8x2_unaligned_param_0]; +; O3-NEXT: ld.s8 %rs1, [%rd1+1]; +; O3-NEXT: ld.s8 %rs2, [%rd1]; +; O3-NEXT: st.param.v2.b16 [func_retval0], {%rs2, %rs1}; +; O3-NEXT: ret; + %a = load <2 x i8>, ptr %ptr, align 1 + %b = sext <2 x i8> %a to <2 x i16> + %c = bitcast <2 x i16> %b to i32 + ret i32 %c +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; COMMON: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index 0039370e6dcf5..be6d1581bc460 100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -185,44 +185,40 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { define void @s1(ptr %p1, <4 x float> %v) { ; CHECK-LABEL: s1( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<18>; +; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0]; ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1]; -; CHECK-NEXT: cvt.u64.u32 %rd2, %r4; -; CHECK-NEXT: st.b8 [%rd1+12], %rd2; -; CHECK-NEXT: cvt.u64.u32 %rd3, %r3; -; CHECK-NEXT: st.b8 [%rd1+8], %rd3; -; CHECK-NEXT: cvt.u64.u32 %rd4, %r2; -; CHECK-NEXT: st.b8 [%rd1+4], %rd4; -; CHECK-NEXT: cvt.u64.u32 %rd5, %r1; -; CHECK-NEXT: st.b8 [%rd1], %rd5; -; CHECK-NEXT: shr.u64 %rd6, %rd2, 24; -; CHECK-NEXT: st.b8 [%rd1+15], %rd6; -; CHECK-NEXT: shr.u64 %rd7, %rd2, 16; -; CHECK-NEXT: st.b8 [%rd1+14], %rd7; -; CHECK-NEXT: shr.u64 %rd8, %rd2, 8; -; CHECK-NEXT: st.b8 [%rd1+13], %rd8; -; CHECK-NEXT: shr.u64 %rd9, %rd3, 24; -; CHECK-NEXT: st.b8 [%rd1+11], %rd9; -; CHECK-NEXT: shr.u64 %rd10, %rd3, 16; -; CHECK-NEXT: st.b8 [%rd1+10], %rd10; -; CHECK-NEXT: shr.u64 %rd11, %rd3, 8; -; CHECK-NEXT: st.b8 [%rd1+9], %rd11; -; CHECK-NEXT: shr.u64 %rd12, %rd4, 24; -; CHECK-NEXT: st.b8 [%rd1+7], %rd12; -; 
CHECK-NEXT: shr.u64 %rd13, %rd4, 16; -; CHECK-NEXT: st.b8 [%rd1+6], %rd13; -; CHECK-NEXT: shr.u64 %rd14, %rd4, 8; -; CHECK-NEXT: st.b8 [%rd1+5], %rd14; -; CHECK-NEXT: shr.u64 %rd15, %rd5, 24; -; CHECK-NEXT: st.b8 [%rd1+3], %rd15; -; CHECK-NEXT: shr.u64 %rd16, %rd5, 16; -; CHECK-NEXT: st.b8 [%rd1+2], %rd16; -; CHECK-NEXT: shr.u64 %rd17, %rd5, 8; -; CHECK-NEXT: st.b8 [%rd1+1], %rd17; +; CHECK-NEXT: st.b8 [%rd1+12], %r4; +; CHECK-NEXT: st.b8 [%rd1+8], %r3; +; CHECK-NEXT: st.b8 [%rd1+4], %r2; +; CHECK-NEXT: st.b8 [%rd1], %r1; +; CHECK-NEXT: shr.u32 %r5, %r4, 24; +; CHECK-NEXT: st.b8 [%rd1+15], %r5; +; CHECK-NEXT: shr.u32 %r6, %r4, 16; +; CHECK-NEXT: st.b8 [%rd1+14], %r6; +; CHECK-NEXT: shr.u32 %r7, %r4, 8; +; CHECK-NEXT: st.b8 [%rd1+13], %r7; +; CHECK-NEXT: shr.u32 %r8, %r3, 24; +; CHECK-NEXT: st.b8 [%rd1+11], %r8; +; CHECK-NEXT: shr.u32 %r9, %r3, 16; +; CHECK-NEXT: st.b8 [%rd1+10], %r9; +; CHECK-NEXT: shr.u32 %r10, %r3, 8; +; CHECK-NEXT: st.b8 [%rd1+9], %r10; +; CHECK-NEXT: shr.u32 %r11, %r2, 24; +; CHECK-NEXT: st.b8 [%rd1+7], %r11; +; CHECK-NEXT: shr.u32 %r12, %r2, 16; +; CHECK-NEXT: st.b8 [%rd1+6], %r12; +; CHECK-NEXT: shr.u32 %r13, %r2, 8; +; CHECK-NEXT: st.b8 [%rd1+5], %r13; +; CHECK-NEXT: shr.u32 %r14, %r1, 24; +; CHECK-NEXT: st.b8 [%rd1+3], %r14; +; CHECK-NEXT: shr.u32 %r15, %r1, 16; +; CHECK-NEXT: st.b8 [%rd1+2], %r15; +; CHECK-NEXT: shr.u32 %r16, %r1, 8; +; CHECK-NEXT: st.b8 [%rd1+1], %r16; ; CHECK-NEXT: ret; store <4 x float> %v, ptr %p1, align 1 ret void diff --git a/llvm/test/CodeGen/NVPTX/mulwide.ll b/llvm/test/CodeGen/NVPTX/mulwide.ll index 17220340d4b07..bde57fb7b95b0 100644 --- a/llvm/test/CodeGen/NVPTX/mulwide.ll +++ b/llvm/test/CodeGen/NVPTX/mulwide.ll @@ -118,17 +118,15 @@ define i32 @mulwideu8(i8 %a, i8 %b) { ; NOOPT-LABEL: mulwideu8( ; NOOPT: { ; NOOPT-NEXT: .reg .b16 %rs<3>; -; NOOPT-NEXT: .reg .b32 %r<6>; +; NOOPT-NEXT: .reg .b32 %r<4>; ; NOOPT-EMPTY: ; NOOPT-NEXT: // %bb.0: ; NOOPT-NEXT: ld.param.b8 %rs2, [mulwideu8_param_1]; ; NOOPT-NEXT: 
ld.param.b8 %rs1, [mulwideu8_param_0]; ; NOOPT-NEXT: cvt.u32.u16 %r1, %rs1; -; NOOPT-NEXT: and.b32 %r2, %r1, 255; -; NOOPT-NEXT: cvt.u32.u16 %r3, %rs2; -; NOOPT-NEXT: and.b32 %r4, %r3, 255; -; NOOPT-NEXT: mul.lo.s32 %r5, %r2, %r4; -; NOOPT-NEXT: st.param.b32 [func_retval0], %r5; +; NOOPT-NEXT: cvt.u32.u16 %r2, %rs2; +; NOOPT-NEXT: mul.lo.s32 %r3, %r1, %r2; +; NOOPT-NEXT: st.param.b32 [func_retval0], %r3; ; NOOPT-NEXT: ret; %val0 = zext i8 %a to i32 %val1 = zext i8 %b to i32