Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -817,4 +817,9 @@ multiclass ZAReadzArray<string vg_num>{

defm SVREADZ_VG2 : ZAReadzArray<"2">;
defm SVREADZ_VG4 : ZAReadzArray<"4">;

let SMETargetGuard = "sme2,sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}

} // let SVETargetGuard = InvalidMode
42 changes: 42 additions & 0 deletions clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5

// REQUIRES: aarch64-registered-target

// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -o /dev/null %s


#include <arm_sme.h>

// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_luti4_zt_u8_x4(
// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z19test_luti4_zt_u8_x411svuint8x2_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
return svluti4_zt_u8_x4(0, op);
}

// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_luti4_zt_s8_x4(
// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z19test_luti4_zt_s8_x411svuint8x2_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
return svluti4_zt_s8_x4(0, op);
}
5 changes: 5 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
}

void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
// Check Zt tile 0
svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3769,6 +3769,12 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;

def int_aarch64_sme_luti4_zt_x4
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


//
// Register scaling
Expand All @@ -3794,6 +3800,7 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;

}

// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
Expand Down
54 changes: 45 additions & 9 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,8 +400,10 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
}

void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
uint32_t MaxImm);
void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);

void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);

template <unsigned MaxIdx, unsigned Scale>
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
Expand Down Expand Up @@ -1975,16 +1977,18 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
}

void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm) {
void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
uint32_t MaxImm) {
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
if (Imm->getZExtValue() > MaxImm)
return;

SDValue ZtValue;
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;

SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
Expand All @@ -2003,6 +2007,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
CurDAG->RemoveDeadNode(Node);
}

void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc) {

SDValue ZtValue;
SmallVector<SDValue, 4> Ops;
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;

Ops.push_back(ZtValue);
Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
SDLoc DL(Node);
EVT VT = Node->getValueType(0);

SDNode *Instruction =
CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
SDValue SuperReg = SDValue(Instruction, 0);

for (unsigned I = 0; I < NumOutVecs; ++I)
ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
AArch64::zsub0 + I, DL, VT, SuperReg));

// Copy chain
unsigned ChainIdx = NumOutVecs;
ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
CurDAG->RemoveDeadNode(Node);
}

void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs,
unsigned Op) {
SDLoc DL(N);
Expand Down Expand Up @@ -5478,15 +5510,15 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
{AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H,
AArch64::LUTI2_4ZTZI_S}))
// Second Immediate must be <= 3:
SelectMultiVectorLuti(Node, 4, Opc, 3);
SelectMultiVectorLutiLane(Node, 4, Opc, 3);
return;
}
case Intrinsic::aarch64_sme_luti4_lane_zt_x4: {
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
Node->getValueType(0),
{0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S}))
// Second Immediate must be <= 1:
SelectMultiVectorLuti(Node, 4, Opc, 1);
SelectMultiVectorLutiLane(Node, 4, Opc, 1);
return;
}
case Intrinsic::aarch64_sme_luti2_lane_zt_x2: {
Expand All @@ -5495,7 +5527,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
{AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H,
AArch64::LUTI2_2ZTZI_S}))
// Second Immediate must be <= 7:
SelectMultiVectorLuti(Node, 2, Opc, 7);
SelectMultiVectorLutiLane(Node, 2, Opc, 7);
return;
}
case Intrinsic::aarch64_sme_luti4_lane_zt_x2: {
Expand All @@ -5504,7 +5536,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
{AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H,
AArch64::LUTI4_2ZTZI_S}))
// Second Immediate must be <= 3:
SelectMultiVectorLuti(Node, 2, Opc, 3);
SelectMultiVectorLutiLane(Node, 2, Opc, 3);
return;
}
case Intrinsic::aarch64_sme_luti4_zt_x4: {
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
return;
}
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -940,7 +940,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;

let Predicates = [HasSME2, HasSME_LUTv2] in {
defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>;
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
} //[HasSME2, HasSME_LUTv2]

let Predicates = [HasSME2p1, HasSME_LUTv2] in {
Expand Down
17 changes: 17 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s

target triple = "aarch64-linux"

define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
; CHECK-LABEL: test_luti4_zt_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
; CHECK-NEXT: ret
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
}

attributes #0 = { "target-features"="+sme2,+sme-lutv2"}
Loading