llvm
diff --git a/‎llvm/docs/AMDGPUUsage.rst‎
Lines changed: 121 additions & 97 deletions b/‎llvm/docs/AMDGPUUsage.rst‎
Lines changed: 121 additions & 97 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp‎
Lines changed: 0 additions & 2 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp‎
Lines changed: 9 additions & 9 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp‎
Lines changed: 9 additions & 9 deletions
@@ -353,9 +353,9 @@ supported for the ``amdgcn`` target.
   (scratch), and group (LDS) memory depending on if the address is within one
   of the aperture ranges. Flat access to scratch requires hardware aperture
   setup and setup in the kernel prologue (see
-  :ref:`amdgpu-amdhsa-flat-scratch`). Flat access to LDS requires hardware
-  aperture setup and M0 (GFX7-GFX8) register setup (see
-  :ref:`amdgpu-amdhsa-m0`).
+  :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`). Flat access to LDS requires
+  hardware aperture setup and M0 (GFX7-GFX8) register setup (see
+  :ref:`amdgpu-amdhsa-kernel-prolog-m0`).
 
   To convert between a private or group address space address (termed a segment
   address) and a flat address the base address of the corresponding aperture
@@ -5954,7 +5954,7 @@ SGPR register initial state is defined in
                                                   must be used to set up FLAT
                                                   SCRATCH for flat addressing
                                                   (see
-                                                  :ref:`amdgpu-amdhsa-flat-scratch`).
+                                                  :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`).
      ========== ========================== ====== ==============================
 
 The order of the VGPR registers is defined, but the compiler can specify which
@@ -6020,7 +6020,23 @@ following properties:
 Kernel Prolog
 ~~~~~~~~~~~~~
 
-.. _amdgpu-amdhsa-m0:
+The compiler performs initialization in the kernel prologue depending on the
+target and information about things like stack usage in the kernel and called
+functions. Some of this initialization requires the compiler to request certain
+User and System SGPRs be present in the
+:ref:`amdgpu-amdhsa-initial-kernel-execution-state` via the
+:ref:`amdgpu-amdhsa-kernel-descriptor`.
+
+.. _amdgpu-amdhsa-kernel-prolog-cfi:
+
+CFI
++++
+
+1. The CFI return address is undefined.
+2. The CFI CFA is defined using an expression which evaluates to a memory
+   location description for the private segment address ``0``.
+
+.. _amdgpu-amdhsa-kernel-prolog-m0:
 
 M0
 ++
@@ -6035,15 +6051,35 @@ GFX9-GFX10
   The M0 register is not used for range checking LDS accesses and so does not
   need to be initialized in the prolog.
 
-.. _amdgpu-amdhsa-flat-scratch:
+.. _amdgpu-amdhsa-kernel-prolog-stack-pointer:
+
+Stack Pointer
++++++++++++++
+
+If the kernel has function calls it must set up the ABI stack pointer described
+in :ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions` by
+setting SGPR32 to the the unswizzled scratch offset of the address past the
+last local allocation.
+
+.. _amdgpu-amdhsa-kernel-prolog-frame-pointer:
+
+Frame Pointer
++++++++++++++
+
+If the kernel needs a frame pointer for the reasons defined in
+``SIFrameLowering`` then SGPR34 is used and is always set to ``0`` in the
+kernel prolog. If a frame pointer is not required then all uses of the frame
+pointer are replaced with immediate ``0`` offsets.
+
+.. _amdgpu-amdhsa-kernel-prolog-flat-scratch:
 
 Flat Scratch
 ++++++++++++
 
-If the kernel may use flat operations to access scratch memory, the prolog code
-must set up FLAT_SCRATCH register pair (FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which
-are in SGPRn-4/SGPRn-3). Initialization uses Flat Scratch Init and Scratch
-Wavefront Offset SGPR registers (see
+If the kernel or any function it calls may use flat operations to access
+scratch memory, the prolog code must set up the FLAT_SCRATCH register pair
+(FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which are in SGPRn-4/SGPRn-3). Initialization
+uses Flat Scratch Init and Scratch Wavefront Offset SGPR registers (see
 :ref:`amdgpu-amdhsa-initial-kernel-execution-state`):
 
 GFX6
@@ -6074,6 +6110,52 @@ GFX9-GFX10
   FLAT_SCRATCH pair for use as the flat scratch base in flat memory
   instructions.
 
+.. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer:
+
+Private Segment Buffer
+++++++++++++++++++++++
+
+A set of four SGPRs beginning at a four-aligned SGPR index are always selected
+to serve as the scratch V# for the kernel as follows:
+
+  - If it is know during instruction selection that there is stack usage,
+    SGPR0-3 is reserved for use as the scratch V#.  Stack usage is assumed if
+    optimisations are disabled (``-O0``), if stack objects already exist (for
+    locals, etc.), or if there are any function calls.
+
+  - Otherwise, four high numbered SGPRs beginning at a four-aligned SGPR index
+    are reserved for the tentative scratch V#. These will be used if it is
+    determined that spilling is needed.
+
+    - If no use is made of the tentative scratch V#, then it is unreserved
+      and the register count is determined ignoring it.
+    - If use is made of the tenatative scratch V#, then its register numbers
+      are shifted to the first four-aligned SGPR index after the highest one
+      allocated by the register allocator, and all uses are updated. The
+      register count includes them in the shifted location.
+    - In either case, if the processor has the SGPR allocation bug, the
+      tentative allocation is not shifted or unreserved in order to ensure
+      the register count is higher to workaround the bug.
+
+    .. note::
+
+      This approach of using a tentative scratch V# and shifting the register
+      numbers if used avoids having to perform register allocation a second
+      time if the tentative V# is eliminated. This is more efficient and
+      avoids the problem that the second register allocation may perform
+      spilling which will fail as there is no longer a scratch V#.
+
+When the kernel prolog code is being emitted it is known whether the scratch V#
+described above is actually used. If it is, the prolog code must set it up by
+copying the Private Segment Buffer to the scratch V# registers and then adding
+the Private Segment Wavefront Offset to the queue base address in the V#. The
+result is a V# with a base address pointing to the beginning of the wavefront
+scratch backing memory.
+
+The Private Segment Buffer is always requested, but the Private Segment
+Wavefront Offset is only requested if it is used (see
+:ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
+
 .. _amdgpu-amdhsa-memory-model:
 
 Memory Model
@@ -8514,6 +8596,8 @@ Call Convention
 See :ref:`amdgpu-dwarf-address-space-mapping` for information on swizzled
 addresses. Unswizzled addresses are normal linear addresses.
 
+.. _amdgpu-amdhsa-function-call-convention-kernel-functions:
+
 Kernel Functions
 ++++++++++++++++
 
@@ -8537,86 +8621,56 @@ how the AMDGPU implements function calls:
         by-value struct?
       - What is ABI for lambda values?
 
-2.  The CFI return address is undefined.
-3.  If the kernel contains no calls then:
-
-    - If using the ``amdhsa`` OS ABI (see :ref:`amdgpu-os-table`), and know
-      during ISel that there is stack usage SGPR0-3 is reserved for use as the
-      scratch SRD and SGPR33 reserved for the wave scratch offset. Stack usage
-      is assumed if ``-O0``, if already aware of stack objects for locals, etc.,
-      or if there are any function calls.
-    - Otherwise, five high numbered SGPRs are reserved for the tentative scratch
-      SRD and wave scratch offset. These will be used if determine need to do
-      spilling.
-
-      - If no use is made of the tentative scratch SRD or wave scratch offset,
-        then they are unreserved and the register count is determined ignoring
-        them.
-      - If use is made of the tenatative scratch SRD or wave scratch offset,
-        then the register numbers used are shifted to be after the highest one
-        allocated by the register allocator, and all uses updated. The register
-        count will include them in the shifted location. Since register
-        allocation may introduce spills, this shifting allows them to be
-        eliminated without having to perform register allocation again.
-      - In either case, if the processor has the SGPR allocation bug, the
-        tentative allocation is not shifted or unreserved inorder to ensure the
-        register count is higher to workaround the bug.
-
-4.  If the kernel contains function calls:
-
-    - SP is set to the wave scratch offset.
-
-      - Since SP is an unswizzled address relative to the queue scratch base, an
-        wave scratch offset is an unswizzle offset, this means that if SP is
-        used to access swizzled scratch memory, it will access the private
-        segment address 0.
-
-      .. note::
+4.  The kernel performs certain setup in its prolog, as described in
+    :ref:`amdgpu-amdhsa-kernel-prolog`.
 
-        This is planned to be changed to be the unswizzled base address of the
-        wavefront scratch backing memory.
+.. _amdgpu-amdhsa-function-call-convention-non-kernel-functions:
 
 Non-Kernel Functions
 ++++++++++++++++++++
 
 This section describes the call convention ABI for functions other than the
 outer kernel function.
 
-If a kernel has function calls then scratch is always allocated and used for the
-call stack which grows from low address to high address using the swizzled
+If a kernel has function calls then scratch is always allocated and used for
+the call stack which grows from low address to high address using the swizzled
 scratch address space.
 
 On entry to a function:
 
-1.  SGPR0-3 contain a V# with the following properties:
-
-    * Base address of the queue scratch backing memory.
-
-      .. note::
-
-        This is planned to be changed to be the unswizzled base address of the
-        wavefront scratch backing memory.
+1.  SGPR0-3 contain a V# with the following properties (see
+    :ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`):
 
+    * Base address pointing to the beginning of the wavefront scratch backing
+      memory.
     * Swizzled with dword element size and stride of wavefront size elements.
 
 2.  The FLAT_SCRATCH register pair is setup. See
-    :ref:`amdgpu-amdhsa-flat-scratch`.
-3.  GFX6-8: M0 register set to the size of LDS in bytes.
+    :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`.
+3.  GFX6-8: M0 register set to the size of LDS in bytes. See
+    :ref:`amdgpu-amdhsa-kernel-prolog-m0`.
 4.  The EXEC register is set to the lanes active on entry to the function.
 5.  MODE register: *TBD*
 6.  VGPR0-31 and SGPR4-29 are used to pass function input arguments as described
     below.
 7.  SGPR30-31 return address (RA). The code address that the function must
     return to when it completes. The value is undefined if the function is *no
     return*.
-8.  SGPR32 is used for the stack pointer (SP). It is an unswizzled
-    scratch offset relative to the beginning of the queue scratch backing
-    memory.
+8.  SGPR32 is used for the stack pointer (SP). It is an unswizzled scratch
+    offset relative to the beginning of the wavefront scratch backing memory.
 
     The unswizzled SP can be used with buffer instructions as an unswizzled SGPR
     offset with the scratch V# in SGPR0-3 to access the stack in a swizzled
     manner.
 
+    The unswizzled SP value can be converted into the swizzled SP value by:
+
+      | swizzled SP = unswizzled SP / wavefront size
+
+    This may be used to obtain the private address space address of stack
+    objects and to convert this address to a flat address by adding the flat
+    scratch aperture base address.
+
     The swizzled SP value is always 4 bytes aligned for the ``r600``
     architecture and 16 byte aligned for the ``amdgcn`` architecture.
 
@@ -8639,41 +8693,14 @@ On entry to a function:
     arguments after the last local allocation and adjust SGPR32 to the address
     after the last local allocation.
 
-    .. note::
-
-      The SP value is planned to be changed to be the unswizzled offset relative
-      to the wavefront scratch backing memory.
-
-9.  SGPR33 wavefront scratch base offset. The unswizzled offset from the queue
-    scratch backing memory base to the base of the wavefront scratch backing
-    memory.
-
-    It is used to convert the unswizzled SP value to swizzled address in the
-    private address space by:
-
-      | private address = (unswizzled SP - wavefront scratch base offset) /
-        wavefront size
-
-    This may be used to obtain the private address of stack objects and to
-    convert these address to a flat address by adding the flat scratch aperture
-    base address.
-
-    .. note::
-
-      This is planned to be eliminated when SP is changed to be the unswizzled
-      offset relative to the wavefront scratch backing memory. The the
-      conversion simplifies to:
-
-        | private address = unswizzled SP / wavefront size
-
-10. All other registers are unspecified.
-11. Any necessary ``waitcnt`` has been performed to ensure memory is available
+9.  All other registers are unspecified.
+10. Any necessary ``waitcnt`` has been performed to ensure memory is available
     to the function.
 
 On exit from a function:
 
 1.  VGPR0-31 and SGPR4-29 are used to pass function result arguments as
-    described below. Any registers used are considered clobbered registers,
+    described below. Any registers used are considered clobbered registers.
 2.  The following registers are preserved and have the same value as on entry:
 
     * FLAT_SCRATCH
@@ -8872,7 +8899,7 @@ describes how the AMDGPU implements function calls:
 
 1.  SGPR34 is used as a frame pointer (FP) if necessary. Like the SP it is an
     unswizzled scratch address. It is only needed if runtime sized ``alloca``
-    are used, or for the reasons defined in ``SiFrameLowering``.
+    are used, or for the reasons defined in ``SIFrameLowering``.
 2.  Runtime stack alignment is not currently supported.
 
     .. TODO::
@@ -8886,14 +8913,11 @@ describes how the AMDGPU implements function calls:
 
     ..note::
 
-      Before CFI is generated, the call convention will be changed so that SP is
-      an unswizzled address relative to the wave scratch base.
-
       CFI will be generated that defines the CFA as the unswizzled address
       relative to the wave scratch base in the unswizzled private address space
       of the lowest address stack allocated local variable.
 
-      ``DW_AT_frame_base`` will be defined as the swizelled address in the
+      ``DW_AT_frame_base`` will be defined as the swizzled address in the
       swizzled private address space by dividing the CFA by the wavefront size
       (since CFA is always at least dword aligned which matches the scratch
       swizzle element size).
 
@@ -800,8 +800,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
     TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
   } else {
     CCInfo.AllocateReg(Info->getScratchRSrcReg());
-    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
-    CCInfo.AllocateReg(Info->getFrameOffsetReg());
     TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
 
@@ -1474,6 +1474,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
 }
 
 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+  SDLoc DL(N);
   const MachineFunction &MF = CurDAG->getMachineFunction();
   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1488,9 +1489,8 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
   }
 
   // If we don't know this private access is a local stack object, it needs to
-  // be relative to the entry point's scratch wave offset register.
-  return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
-                                               MVT::i32));
+  // be relative to the entry point's scratch wave offset.
+  return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
 }
 
 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1515,10 +1515,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
     // In a call sequence, stores to the argument stack area are relative to the
     // stack pointer.
     const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-    unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-      Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
-    SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+    SOffset = isStackPtrRelative(PtrInfo)
+                  ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                  : CurDAG->getTargetConstant(0, DL, MVT::i32);
     ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
     return true;
   }
@@ -1576,12 +1576,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
 
   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
-  unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
-    Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
 
   // FIXME: Get from MachinePointerInfo? We should only be using the frame
   // offset if we know this is in a call sequence.
-  SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+  SOffset = isStackPtrRelative(PtrInfo)
+                ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+                : CurDAG->getTargetConstant(0, DL, MVT::i32);
 
   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
   return true;
Original file line number	Diff line number	Diff line change
`@@ -800,8 +800,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(`
`800`	`800`	`TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);`
`801`	`801`	`} else {`
`802`	`802`	`CCInfo.AllocateReg(Info->getScratchRSrcReg());`
`803`		`- CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());`
`804`		`- CCInfo.AllocateReg(Info->getFrameOffsetReg());`
`805`	`803`	`TLI.allocateSpecialInputSGPRs(CCInfo, MF, TRI, Info);`
`806`	`804`	`}`
`807`	`805`