diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 163bf9ba5b089..23bfe063fee06 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -18,6 +18,7 @@
 #include "WebAssemblySubtarget.h"
 #include "WebAssemblyTargetMachine.h"
 #include "WebAssemblyUtilities.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -91,6 +92,19 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setOperationAction(ISD::LOAD, T, Custom);
     setOperationAction(ISD::STORE, T, Custom);
   }
+
+  // Likewise, transform zext/sext/anyext extending loads from address space 1
+  // (WASM globals).
+  setLoadExtAction({ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}, MVT::i32,
+                   {MVT::i8, MVT::i16}, Custom);
+  setLoadExtAction({ISD::EXTLOAD, ISD::ZEXTLOAD, ISD::SEXTLOAD}, MVT::i64,
+                   {MVT::i8, MVT::i16, MVT::i32}, Custom);
+
+  // Compensate for the EXTLOADs being custom by reimplementing some combiner
+  // logic.
+  setTargetDAGCombine(ISD::AND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+
   if (Subtarget->hasSIMD128()) {
     for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
                    MVT::v2f64}) {
@@ -1683,6 +1697,11 @@ static bool IsWebAssemblyGlobal(SDValue Op) {
   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
     return WebAssembly::isWasmVarAddressSpace(GA->getAddressSpace());
 
+  if (Op->getOpcode() == WebAssemblyISD::Wrapper)
+    if (const GlobalAddressSDNode *GA =
+            dyn_cast<GlobalAddressSDNode>(Op->getOperand(0)))
+      return WebAssembly::isWasmVarAddressSpace(GA->getAddressSpace());
+
   return false;
 }
 
@@ -1740,16 +1759,109 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op,
   LoadSDNode *LN = cast<LoadSDNode>(Op.getNode());
   const SDValue &Base = LN->getBasePtr();
   const SDValue &Offset = LN->getOffset();
+  ISD::LoadExtType ExtType = LN->getExtensionType();
+  EVT ResultType = LN->getValueType(0);
 
   if (IsWebAssemblyGlobal(Base)) {
     if (!Offset->isUndef())
       report_fatal_error(
           "unexpected offset when loading from webassembly global", false);
 
-    SDVTList Tys = DAG.getVTList(LN->getValueType(0), MVT::Other);
-    SDValue Ops[] = {LN->getChain(), Base};
-    return DAG.getMemIntrinsicNode(WebAssemblyISD::GLOBAL_GET, DL, Tys, Ops,
-                                   LN->getMemoryVT(), LN->getMemOperand());
+    if (!ResultType.isInteger() && !ResultType.isFloatingPoint()) {
+      SDVTList Tys = DAG.getVTList(ResultType, MVT::Other);
+      SDValue Ops[] = {LN->getChain(), Base};
+      SDValue GlobalGetNode =
+          DAG.getMemIntrinsicNode(WebAssemblyISD::GLOBAL_GET, DL, Tys, Ops,
+                                  LN->getMemoryVT(), LN->getMemOperand());
+      return GlobalGetNode;
+    }
+
+    EVT GT = MVT::INVALID_SIMPLE_VALUE_TYPE;
+
+    if (auto *GA = dyn_cast<GlobalAddressSDNode>(
+            Base->getOpcode() == WebAssemblyISD::Wrapper ? Base->getOperand(0)
+                                                         : Base))
+      GT = EVT::getEVT(GA->getGlobal()->getValueType());
+
+    if (GT != MVT::i8 && GT != MVT::i16 && GT != MVT::i32 && GT != MVT::i64 &&
+        GT != MVT::f32 && GT != MVT::f64)
+      report_fatal_error("encountered unexpected global type for Base when "
+                         "loading from webassembly global",
+                         false);
+
+    EVT PromotedGT = getTypeToTransformTo(*DAG.getContext(), GT);
+
+    if (ExtType == ISD::NON_EXTLOAD) {
+      // A normal, non-extending load may try to load more or less than the
+      // underlying global, which is invalid.
+      // We lower this to a load of the global (i32 or i64) and then truncate
+      // or extend as needed.
+
+      // Modify the MMO to load the full global. This is assumed to be safe
+      // without copy/dup, as the original load will be removed.
+      MachineMemOperand *MMO = LN->getMemOperand();
+      MMO->setType(LLT(PromotedGT.getSimpleVT()));
+
+      SDVTList Tys = DAG.getVTList(PromotedGT, MVT::Other);
+      SDValue Ops[] = {LN->getChain(), Base};
+      SDValue GlobalGetNode = DAG.getMemIntrinsicNode(
+          WebAssemblyISD::GLOBAL_GET, DL, Tys, Ops, PromotedGT, MMO);
+
+      if (ResultType.bitsEq(PromotedGT)) {
+        return GlobalGetNode;
+      }
+
+      SDValue ValRes;
+      if (ResultType.isFloatingPoint())
+        ValRes = DAG.getFPExtendOrRound(GlobalGetNode, DL, ResultType);
+      else
+        ValRes = DAG.getAnyExtOrTrunc(GlobalGetNode, DL, ResultType);
+
+      return DAG.getMergeValues({ValRes, GlobalGetNode.getValue(1)}, DL);
+    }
+
+    if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD) {
+      // Turn the unsupported load into an EXTLOAD followed by an explicit
+      // zero/sign extend inreg, the same as the Expand action would.
+      SDValue Result =
+          DAG.getExtLoad(ISD::EXTLOAD, DL, ResultType, LN->getChain(), Base,
+                         LN->getMemoryVT(), LN->getMemOperand());
+      SDValue ValRes;
+      if (ExtType == ISD::SEXTLOAD)
+        ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Result.getValueType(),
+                             Result, DAG.getValueType(LN->getMemoryVT()));
+      else
+        ValRes = DAG.getZeroExtendInReg(Result, DL, LN->getMemoryVT());
+
+      return DAG.getMergeValues({ValRes, Result.getValue(1)}, DL);
+    }
+
+    if (ExtType == ISD::EXTLOAD) {
+      // Expand the EXTLOAD into a regular LOAD of the global and, if needed,
+      // an any-extend to the result type.
+      EVT OldLoadType = LN->getMemoryVT();
+      EVT NewLoadType = getTypeToTransformTo(*DAG.getContext(), OldLoadType);
+
+      // Modify the MMO to load a whole WASM "register"'s worth. This is
+      // assumed to be safe without copy/dup, as the original load will be
+      // removed.
+      MachineMemOperand *MMO = LN->getMemOperand();
+      MMO->setType(LLT(NewLoadType.getSimpleVT()));
+
+      SDValue Result = DAG.getLoad(NewLoadType, DL, LN->getChain(), Base, MMO);
+
+      if (NewLoadType != ResultType) {
+        SDValue ValRes = DAG.getNode(ISD::ANY_EXTEND, DL, ResultType, Result);
+        return DAG.getMergeValues({ValRes, Result.getValue(1)}, DL);
+      }
+
+      return Result;
+    }
+
+    report_fatal_error(
+        "encountered unexpected ExtType when loading from webassembly global",
+        false);
   }
 
   if (std::optional<unsigned> Local = IsWebAssemblyLocal(Base, DAG)) {
@@ -3525,6 +3637,184 @@ static SDValue performMulCombine(SDNode *N,
   }
 }
 
+static SDValue performANDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // Copied and modified from DAGCombiner::visitAND(SDNode *N).
+  // We have to do this because the original combiner doesn't work when
+  // ZEXTLOAD has custom lowering.
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
+  // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
+  // already be zero by virtue of the width of the base type of the load.
+  //
+  // the 'X' node here can either be nothing or an extract_vector_elt to catch
+  // more cases.
+  if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+       N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
+       N0.getOperand(0).getOpcode() == ISD::LOAD &&
+       N0.getOperand(0).getResNo() == 0) ||
+      (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
+    auto *Load = cast<LoadSDNode>(
+        (N0.getOpcode() == ISD::LOAD) ? N0 : N0.getOperand(0));
+
+    // Get the constant (if applicable) the zero'th operand is being ANDed
+    // with. This can be a pure constant or a vector splat, in which case we
+    // treat the vector as a scalar and use the splat value.
+    APInt Constant = APInt::getZero(1);
+    if (const ConstantSDNode *C = isConstOrConstSplat(
+            N1, /*AllowUndefs=*/false, /*AllowTruncation=*/true)) {
+      Constant = C->getAPIntValue();
+    } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
+      unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits();
+      APInt SplatValue, SplatUndef;
+      unsigned SplatBitSize;
+      bool HasAnyUndefs;
+      // Endianness should not matter here. Code below makes sure that we only
+      // use the result if the SplatBitSize is a multiple of the vector element
+      // size. And after that we AND all element sized parts of the splat
+      // together. So the end result should be the same regardless of in which
+      // order we do those operations.
+      const bool IsBigEndian = false;
+      bool IsSplat =
+          Vector->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                                  HasAnyUndefs, EltBitWidth, IsBigEndian);
+
+      // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
+      // multiple of 'EltBitWidth'. Otherwise, we could propagate a wrong
+      // value.
+      if (IsSplat && (SplatBitSize % EltBitWidth) == 0) {
+        // Undef bits can contribute to a possible optimisation if set, so
+        // set them.
+        SplatValue |= SplatUndef;
+
+        // The splat value may be something like "0x00FFFFFF", which means 0
+        // for the first vector value and FF for the rest, repeating. We need
+        // a mask that will apply equally to all members of the vector, so AND
+        // all the lanes of the constant together.
+        Constant = APInt::getAllOnes(EltBitWidth);
+        for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
+          Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
+      }
+    }
+
+    // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
+    // actually legal and isn't going to get expanded, else this is a false
+    // optimisation.
+
+    /*bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
+                                                      Load->getValueType(0),
+                                                      Load->getMemoryVT());*/
+    // MODIFIED: this is the one difference in the logic; we allow the ZEXT
+    // combine only in addrspace 0, where ZEXTLOAD is legal.
+    bool CanZextLoadProfitably = Load->getAddressSpace() == 0;
+
+    // Resize the constant to the same size as the original memory access
+    // before extension. If it is still the AllOnesValue then this AND is
+    // completely unneeded.
+    Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
+
+    bool B;
+    switch (Load->getExtensionType()) {
+    default:
+      B = false;
+      break;
+    case ISD::EXTLOAD:
+      B = CanZextLoadProfitably;
+      break;
+    case ISD::ZEXTLOAD:
+    case ISD::NON_EXTLOAD:
+      B = true;
+      break;
+    }
+
+    if (B && Constant.isAllOnes()) {
+      // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
+      // preserve semantics once we get rid of the AND.
+      SDValue NewLoad(Load, 0);
+
+      // Fold the AND away. NewLoad may get replaced immediately.
+      DCI.CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
+      if (Load->getExtensionType() == ISD::EXTLOAD) {
+        NewLoad = DCI.DAG.getLoad(
+            Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0),
+            SDLoc(Load), Load->getChain(), Load->getBasePtr(),
+            Load->getOffset(), Load->getMemoryVT(), Load->getMemOperand());
+        // Replace uses of the EXTLOAD with the new ZEXTLOAD.
+        if (Load->getNumValues() == 3) {
+          // PRE/POST_INC loads have 3 values.
+          SDValue To[] = {NewLoad.getValue(0), NewLoad.getValue(1),
+                          NewLoad.getValue(2)};
+          DCI.CombineTo(Load, ArrayRef(To, 3), true);
+        } else {
+          DCI.CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
+        }
+      }
+
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
+    }
+  }
+  return SDValue();
+}
+
+static SDValue
+performSIGN_EXTEND_INREGCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // Copied and modified from DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N).
+  // We have to do this because the original combiner doesn't work when
+  // SEXTLOAD has custom lowering.
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
+  SDLoc DL(N);
+
+  // fold (sext_inreg (extload x)) -> (sextload x)
+  // If sextload is not supported by target, we can only do the combine when
+  // load has one use. Doing otherwise can block folding the extload with other
+  // extends that the target does support.
+
+  // MODIFIED: replaced TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT) with
+  // cast<LoadSDNode>(N0)->getAddressSpace() == 0.
+  if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+      ((!DCI.isAfterLegalizeDAG() && cast<LoadSDNode>(N0)->isSimple() &&
+        N0.hasOneUse()) ||
+       cast<LoadSDNode>(N0)->getAddressSpace() == 0)) {
+    auto *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad =
+        DCI.DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
+                           LN0->getBasePtr(), ExtVT, LN0->getMemOperand());
+    DCI.CombineTo(N, ExtLoad);
+    DCI.CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+    DCI.AddToWorklist(ExtLoad.getNode());
+    return SDValue(N, 0); // Return N so it doesn't get rechecked!
+  }
+
+  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
+
+  // MODIFIED: replaced TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT) with
+  // cast<LoadSDNode>(N0)->getAddressSpace() == 0.
+  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      N0.hasOneUse() && ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+      ((!DCI.isAfterLegalizeDAG() && cast<LoadSDNode>(N0)->isSimple()) &&
+       cast<LoadSDNode>(N0)->getAddressSpace() == 0)) {
+    auto *LN0 = cast<LoadSDNode>(N0);
+    SDValue ExtLoad =
+        DCI.DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
+                           LN0->getBasePtr(), ExtVT, LN0->getMemOperand());
+    DCI.CombineTo(N, ExtLoad);
+    DCI.CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+    return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ } + + return SDValue(); +} + SDValue WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -3557,5 +3847,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performAnyAllCombine(N, DCI.DAG); case ISD::MUL: return performMulCombine(N, DCI); + case ISD::AND: + return performANDCombine(N, DCI); + case ISD::SIGN_EXTEND_INREG: + return performSIGN_EXTEND_INREGCombine(N, DCI); } } diff --git a/llvm/test/CodeGen/WebAssembly/lower-load-wasm-global.ll b/llvm/test/CodeGen/WebAssembly/lower-load-wasm-global.ll new file mode 100644 index 0000000000000..0112296df1aa8 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/lower-load-wasm-global.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s | FileCheck %s + +; Test various loads from WASM (address space 1) globals lower as intended + +target triple = "wasm32-unknown-unknown" + + +@globalI8 = local_unnamed_addr addrspace(1) global i8 undef +@globalI32 = local_unnamed_addr addrspace(1) global i32 undef +@globalI64 = local_unnamed_addr addrspace(1) global i64 undef + + +define i32 @zext_i8_i32() { +; CHECK-LABEL: zext_i8_i32: +; CHECK: .functype zext_i8_i32 () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI32 +; CHECK-NEXT: i32.const 255 +; CHECK-NEXT: i32.and +; CHECK-NEXT: # fallthrough-return + %v = load i8, ptr addrspace(1) @globalI32 + %e = zext i8 %v to i32 + ret i32 %e +} + +define i32 @sext_i8_i32() { +; CHECK-LABEL: sext_i8_i32: +; CHECK: .functype sext_i8_i32 () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI32 +; CHECK-NEXT: i32.extend8_s +; CHECK-NEXT: # fallthrough-return + %v = load i8, ptr addrspace(1) @globalI32 + %e = sext i8 %v to i32 + ret i32 %e +} + +define i32 @zext_i16_i32() { +; CHECK-LABEL: zext_i16_i32: +; CHECK: .functype zext_i16_i32 () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI32 +; CHECK-NEXT: i32.const 65535 +; CHECK-NEXT: i32.and +; CHECK-NEXT: # fallthrough-return + %v = load i16, ptr addrspace(1) @globalI32 + %e = zext i16 %v to i32 + ret i32 %e +} + +define i32 @sext_i16_i32() { +; CHECK-LABEL: sext_i16_i32: +; CHECK: .functype sext_i16_i32 () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI32 +; CHECK-NEXT: i32.extend16_s +; CHECK-NEXT: # fallthrough-return + %v = load i16, ptr addrspace(1) @globalI32 + %e = sext i16 %v to i32 + ret i32 %e +} + + +define i64 @zext_i8_i64() { +; CHECK-LABEL: zext_i8_i64: +; CHECK: .functype zext_i8_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i64.const 255 +; CHECK-NEXT: i64.and +; CHECK-NEXT: # fallthrough-return + %v = load i8, ptr addrspace(1) @globalI64 + %e = zext i8 %v to i64 + ret i64 %e +} + +define i64 @sext_i8_i64() { +; CHECK-LABEL: sext_i8_i64: +; CHECK: .functype sext_i8_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i64.extend8_s +; CHECK-NEXT: # fallthrough-return + %v = load i8, ptr addrspace(1) @globalI64 + %e = sext i8 %v to i64 + ret i64 %e +} + +define i64 @zext_i16_i64() { +; CHECK-LABEL: zext_i16_i64: +; CHECK: .functype zext_i16_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i64.const 65535 +; CHECK-NEXT: i64.and +; CHECK-NEXT: # fallthrough-return + %v = load i16, ptr addrspace(1) @globalI64 + %e = zext i16 %v to i64 + ret i64 %e +} + +define i64 @sext_i16_i64() { +; CHECK-LABEL: sext_i16_i64: +; CHECK: .functype 
sext_i16_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i64.extend16_s +; CHECK-NEXT: # fallthrough-return + %v = load i16, ptr addrspace(1) @globalI64 + %e = sext i16 %v to i64 + ret i64 %e +} + +define i64 @zext_i32_i64() { +; CHECK-LABEL: zext_i32_i64: +; CHECK: .functype zext_i32_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i64.const 4294967295 +; CHECK-NEXT: i64.and +; CHECK-NEXT: # fallthrough-return + %v = load i32, ptr addrspace(1) @globalI64 + %e = zext i32 %v to i64 + ret i64 %e +} + +define i64 @sext_i32_i64() { +; CHECK-LABEL: sext_i32_i64: +; CHECK: .functype sext_i32_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i64.extend32_s +; CHECK-NEXT: # fallthrough-return + %v = load i32, ptr addrspace(1) @globalI64 + %e = sext i32 %v to i64 + ret i64 %e +} + + +define i64 @load_i64_from_i32() { +; CHECK-LABEL: load_i64_from_i32: +; CHECK: .functype load_i64_from_i32 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI32 +; CHECK-NEXT: i64.extend_i32_u +; CHECK-NEXT: # fallthrough-return + %v = load i64, ptr addrspace(1) @globalI32 + ret i64 %v +} + +define i32 @load_i32_from_i64() { +; CHECK-LABEL: load_i32_from_i64: +; CHECK: .functype load_i32_from_i64 () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI64 +; CHECK-NEXT: i32.wrap_i64 +; CHECK-NEXT: # fallthrough-return + %v = load i32, ptr addrspace(1) @globalI64 + ret i32 %v +} + +define i8 @load_i8() { +; CHECK-LABEL: load_i8: +; CHECK: .functype load_i8 () -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI8 +; CHECK-NEXT: # fallthrough-return + %v = load i8, ptr addrspace(1) @globalI8 + ret i8 %v +} + +define i64 @load_i16_from_i8_zext_to_i64() { +; CHECK-LABEL: load_i16_from_i8_zext_to_i64: +; CHECK: .functype load_i16_from_i8_zext_to_i64 () -> (i64) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: global.get globalI8 +; CHECK-NEXT: i64.extend_i32_u +; CHECK-NEXT: i64.const 65535 +; CHECK-NEXT: i64.and +; CHECK-NEXT: # fallthrough-return + %v = load i16, ptr addrspace(1) @globalI8 + %e = zext i16 %v to i64 + ret i64 %e +} diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll index d23c2272d9c0d..973940b389976 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/int-mac-reduction-costs.ll @@ -6,9 +6,9 @@ target triple = "wasm32" define hidden i32 @i32_mac_s8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: 'i32_mac_s8' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8, ptr %arrayidx, align 1 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = sext i8 %0 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv = sext i8 %0 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i8, ptr %arrayidx1, align 1 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = sext i8 %1 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv2 = sext i8 %1 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i32 %conv2, %conv ; CHECK: LV: Found an estimated cost of 3 for VF 2 For instruction: %0 = load i8, ptr %arrayidx, align 1 @@ -50,9 
+50,9 @@ for.body: ; preds = %entry, %for.body define hidden i32 @i32_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: 'i32_mac_s16' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i16, ptr %arrayidx, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = sext i16 %0 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv = sext i16 %0 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i16, ptr %arrayidx1, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = sext i16 %1 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv2 = sext i16 %1 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i32 %conv2, %conv ; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %0 = load i16, ptr %arrayidx, align 2 @@ -94,9 +94,9 @@ for.body: ; preds = %entry, %for.body define hidden i64 @i64_mac_s16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: 'i64_mac_s16' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i16, ptr %arrayidx, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = sext i16 %0 to i64 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv = sext i16 %0 to i64 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i16, ptr %arrayidx1, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = sext i16 %1 to i64 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv2 = sext i16 %1 to i64 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nsw i64 %conv2, %conv ; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %0 = load i16, ptr %arrayidx, align 2 @@ -167,9 +167,9 @@ for.body: ; preds = %entry, %for.body define hidden i32 @i32_mac_u8(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: 'i32_mac_u8' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i8, ptr %arrayidx, align 1 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = zext i8 %0 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv = zext i8 %0 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i8, ptr %arrayidx1, align 1 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = zext i8 %1 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv2 = zext i8 %1 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nuw nsw i32 %conv2, %conv ; CHECK: LV: Found an estimated cost of 3 for VF 2 For instruction: %0 = load i8, ptr %arrayidx, align 1 @@ -211,9 +211,9 @@ for.body: ; preds = %entry, %for.body define hidden i32 @i32_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: 'i32_mac_u16' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i16, ptr %arrayidx, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = zext i16 %0 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv = zext i16 %0 to i32 ; CHECK: LV: Found an estimated 
cost of 1 for VF 1 For instruction: %1 = load i16, ptr %arrayidx1, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = zext i16 %1 to i32 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv2 = zext i16 %1 to i32 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nuw nsw i32 %conv2, %conv ; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %0 = load i16, ptr %arrayidx, align 2 @@ -255,9 +255,9 @@ for.body: ; preds = %entry, %for.body define hidden i64 @i64_mac_u16(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) { ; CHECK-LABEL: 'i64_mac_u16' ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %0 = load i16, ptr %arrayidx, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv = zext i16 %0 to i64 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv = zext i16 %0 to i64 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %1 = load i16, ptr %arrayidx1, align 2 -; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %conv2 = zext i16 %1 to i64 +; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %conv2 = zext i16 %1 to i64 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %mul = mul nuw nsw i64 %conv2, %conv ; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %0 = load i16, ptr %arrayidx, align 2 diff --git a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll index c8d20dccbb32b..e393276ac6416 100644 --- a/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/WebAssembly/memory-interleave.ll @@ -629,7 +629,7 @@ define hidden void @three_bytes_interleave_op(ptr noalias nocapture noundef writ ; CHECK-LABEL: four_bytes_same_op ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 -; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 20. ; CHECK: LV: Vector loop of width 2 costs: 40. @@ -690,7 +690,7 @@ define hidden void @four_bytes_same_op(ptr noalias nocapture noundef writeonly % ; CHECK-LABEL: four_bytes_split_op ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 -; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 20. ; CHECK: LV: Vector loop of width 2 costs: 45. @@ -755,7 +755,7 @@ define hidden void @four_bytes_split_op(ptr noalias nocapture noundef writeonly ; CHECK-LABEL: four_bytes_interleave_op ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 -; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4 +; CHECK: Cost of 26 for VF 8: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 132 for VF 16: INTERLEAVE-GROUP with factor 4 ; CHECK: LV: Scalar loop costs: 20. ; CHECK: LV: Vector loop of width 2 costs: 40 @@ -1055,7 +1055,7 @@ define hidden void @eight_bytes_interleave_op(ptr noalias nocapture noundef writ } ; CHECK-LABEL: four_bytes_into_four_ints_same_op -; CHECK: LV: Scalar loop costs: 28. +; CHECK: LV: Scalar loop costs: 36. 
; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %17 = load i32 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32 @@ -1127,7 +1127,7 @@ define hidden void @four_bytes_into_four_ints_same_op(ptr noalias nocapture noun ; CHECK: Cost of 14 for VF 2: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 18 for VF 4: INTERLEAVE-GROUP with factor 4 ; CHECK: Cost of 24 for VF 4: INTERLEAVE-GROUP with factor 4 -; CHECK: LV: Scalar loop costs: 21. +; CHECK: LV: Scalar loop costs: 27. ; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction: %10 = load i8 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %11 = zext i8 ; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i32
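Note (not part of the patch): a quick way to exercise the new lowering locally is to run the new test file through llc and FileCheck, mirroring its RUN line; the build directory path below is an assumption, adjust it to your checkout.

    build/bin/llc < llvm/test/CodeGen/WebAssembly/lower-load-wasm-global.ll \
      | build/bin/FileCheck llvm/test/CodeGen/WebAssembly/lower-load-wasm-global.ll

Running the same file through build/bin/llvm-lit verifies the autogenerated update_llc_test_checks.py assertions the same way the bots would.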