diff --git a/llvm/lib/Target/LinxISA/AsmParser/LinxISAAsmParser.cpp b/llvm/lib/Target/LinxISA/AsmParser/LinxISAAsmParser.cpp index 3af58adff7163..13b827c9c5ed6 100644 --- a/llvm/lib/Target/LinxISA/AsmParser/LinxISAAsmParser.cpp +++ b/llvm/lib/Target/LinxISA/AsmParser/LinxISAAsmParser.cpp @@ -44,6 +44,14 @@ static std::string toUpperStr(StringRef S) { return Out; } +static bool isArchivedRawVectorOperandName(StringRef Name) { + const std::string Upper = toUpperStr(Name.trim()); + return StringSwitch(Upper) + .Cases({"TE", "TF", "TG", "TH"}, true) + .Cases({"TO1", "TO2", "TO3"}, true) + .Default(false); +} + static std::optional parseRegCode(StringRef Name) { StringRef N = Name.trim(); if (N.size() >= 2 && (N[0] == 'r' || N[0] == 'R') && @@ -66,6 +74,7 @@ static std::optional parseRegCode(StringRef Name) { // // Encoding: (Class << 5) | Index // Class 1: ri0..ri31 (ordered argument namespace from B.IOR) + // Class 2: p (kernel EXEC mask; only index 28 is canonical) // Class 3: lc0..lc2 (hardware loop counters) // Class 4: vt / vt#n (vector T-hand queue; index 0 is the push selector) // Class 5: vu / vu#n (vector U-hand queue) @@ -106,6 +115,8 @@ static std::optional parseRegCode(StringRef Name) { if (auto V = parsePrefixedIndex("RI", /*Class=*/1, /*MaxIndex=*/31)) return *V; + if (Upper == "P") + return (2u << 5) | 28u; if (auto V = parsePrefixedIndex("LC", /*Class=*/3, /*MaxIndex=*/2)) return *V; @@ -1215,6 +1226,12 @@ bool LinxISAAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc, ParseStatus S = tryParseRegister(Reg, StartLoc, EndLoc); if (S.isSuccess()) return false; + if (getTok().is(AsmToken::Identifier) && + isArchivedRawVectorOperandName(getTok().getString())) { + return Error(getTok().getLoc(), + "archived raw vector operand name is not allowed in canonical " + "v0.4; use TA/TB/TC/TD/TO/TS"); + } return Error(getTok().getLoc(), "expected register"); } @@ -1233,10 +1250,16 @@ bool LinxISAAsmParser::parseRegOperand(ParsedReg &Out) { StringRef Tok = 
getTok().getString(); StringRef Base = Tok; StringRef Suffix; - if (size_t Dot = Tok.find('.'); Dot != StringRef::npos) { + if (size_t Dot = Tok.rfind('.'); Dot != StringRef::npos) { Base = Tok.take_front(Dot); Suffix = Tok.drop_front(Dot + 1); } + if (Base.ends_with_insensitive(".reuse")) + Base = Base.drop_back(strlen(".reuse")); + if (isArchivedRawVectorOperandName(Base)) + return Error(getTok().getLoc(), + "archived raw vector operand name is not allowed in canonical " + "v0.4; use TA/TB/TC/TD/TO/TS"); auto Code = parseRegCode(Base); if (!Code) diff --git a/llvm/lib/Target/LinxISA/LinxISABlockify.cpp b/llvm/lib/Target/LinxISA/LinxISABlockify.cpp index 8a7b094b409a9..d4593c1549536 100644 --- a/llvm/lib/Target/LinxISA/LinxISABlockify.cpp +++ b/llvm/lib/Target/LinxISA/LinxISABlockify.cpp @@ -93,6 +93,12 @@ static std::optional tileSizeCodeToBytes(unsigned SizeCode) { return 1ull << (SizeCode + 4u); } +static std::optional tileBytesToSizeCode(uint64_t Bytes) { + if (Bytes < 16u || Bytes > 4096u || !isPowerOf2_64(Bytes)) + return std::nullopt; + return static_cast(Log2_64(Bytes) - 4u); +} + static bool isStrictTileSizeCode(unsigned SizeCode) { std::optional Bytes = tileSizeCodeToBytes(SizeCode); return Bytes && *Bytes >= 512u && *Bytes <= 4096u; @@ -219,6 +225,14 @@ static uint64_t requirePositiveDimImm(int64_t Dim, StringRef DimName, return static_cast(Dim); } +static bool isArchivedRawVectorOperandName(StringRef Name) { + std::string Upper = Name.trim().upper(); + return StringSwitch(Upper) + .Cases({"TE", "TF", "TG", "TH"}, true) + .Cases({"TO1", "TO2", "TO3"}, true) + .Default(false); +} + static uint64_t computeTileBytesOrDie(StringRef Context, uint64_t Dim0, uint64_t Dim1, uint64_t Dim2, uint64_t ElemBits) { @@ -611,6 +625,13 @@ class LinxISABlockify : public MachineFunctionPass { unsigned Shamt = 0; }; + struct VecPipeCursorState { + unsigned NextVt = 1; + unsigned NextVu = 1; + unsigned NextVm = 1; + unsigned NextVn = 1; + }; + auto toUpperStr = [](StringRef 
S) -> std::string { std::string Out; Out.reserve(S.size()); @@ -624,7 +645,7 @@ class LinxISABlockify : public MachineFunctionPass { std::string Up = toUpperStr(Suffix); if (Up == "SW") return 0u; - if (Up == "UW") + if (Up == "UW" || Up == "UH") return 1u; if (Up == "NEG" || Up == "NOT") return 2u; @@ -767,6 +788,8 @@ class LinxISABlockify : public MachineFunctionPass { return 30u; if (Upper == "U#4" || Upper == "T") return 31u; + if (Upper == "P") + return 92u; return std::nullopt; }; @@ -792,6 +815,12 @@ class LinxISABlockify : public MachineFunctionPass { Base = T.take_front(Dot).trim(); Suffix = T.drop_front(Dot + 1).trim(); } + if (Base.ends_with_insensitive(".reuse")) + Base = Base.drop_back(strlen(".reuse")).trim(); + if (isArchivedRawVectorOperandName(Base)) + report_fatal_error( + "Linx blockify: archived raw vector operand name is not allowed " + "in canonical v0.4; use TA/TB/TC/TD/TO/TS"); auto RegCode = parseRegCode(Base); if (!RegCode) @@ -815,8 +844,8 @@ class LinxISABlockify : public MachineFunctionPass { } }; - auto parseMemTriple = [&](StringRef MemExpr, ParsedVReg &Base, - ParsedVReg &Index) -> bool { + auto parseMemTriple = [&](StringRef MemExpr, unsigned WantLaneShamt, + ParsedVReg &Base, ParsedVReg &Index) -> bool { const size_t L = MemExpr.find('['); const size_t R = MemExpr.rfind(']'); if (L == StringRef::npos || R == StringRef::npos || R <= L) @@ -831,15 +860,40 @@ class LinxISABlockify : public MachineFunctionPass { auto IndexOp = parseVecRegToken(Parts[2]); if (!BaseOp || !LaneOp || !IndexOp) return false; - // Bring-up contract: lane selector is lc0<<2. + // Bring-up contract: vector body memory lanes are lc0 shifted by the + // element size. 
const unsigned WantLc0Code = (3u << 5) | 0u; - if (LaneOp->Code != WantLc0Code || LaneOp->Shamt != 2u) + if (LaneOp->Code != WantLc0Code || LaneOp->Shamt != WantLaneShamt) return false; Base = *BaseOp; Index = *IndexOp; return true; }; + auto parseMemImm = [&](StringRef MemExpr, unsigned WantLaneShamt, + ParsedVReg &Base, int64_t &Imm) -> bool { + const size_t L = MemExpr.find('['); + const size_t R = MemExpr.rfind(']'); + if (L == StringRef::npos || R == StringRef::npos || R <= L) + return false; + StringRef Inside = MemExpr.slice(L + 1, R).trim(); + SmallVector Parts; + splitCSV(Inside, Parts); + if (Parts.size() != 3) + return false; + auto BaseOp = parseVecRegToken(Parts[0]); + auto LaneOp = parseVecRegToken(Parts[1]); + if (!BaseOp || !LaneOp) + return false; + const unsigned WantLc0Code = (3u << 5) | 0u; + if (LaneOp->Code != WantLc0Code || LaneOp->Shamt != WantLaneShamt) + return false; + if (Parts[2].getAsInteger(/*Radix=*/0, Imm)) + return false; + Base = *BaseOp; + return true; + }; + auto normalizeLabel = [&](StringRef Label) -> std::string { std::string Out; StringRef L = Label.trim(); @@ -853,8 +907,73 @@ class LinxISABlockify : public MachineFunctionPass { return Out; }; + auto getHeadQueueClass = [&](StringRef DstPart) -> std::optional { + StringRef Base = DstPart.trim(); + if (size_t Dot = Base.rfind('.'); Dot != StringRef::npos) + Base = Base.take_front(Dot).trim(); + std::string Upper = toUpperStr(Base); + if (Upper == "VT") + return 4u; + if (Upper == "VU") + return 5u; + if (Upper == "VM") + return 6u; + if (Upper == "VN") + return 7u; + return std::nullopt; + }; + + auto noteExplicitVecPipeDest = [&](StringRef DstPart, + VecPipeCursorState &PipeState) { + StringRef Base = DstPart.trim(); + if (size_t Dot = Base.rfind('.'); Dot != StringRef::npos) + Base = Base.take_front(Dot).trim(); + std::string Upper = toUpperStr(Base); + auto bumpCounter = [&](StringRef Prefix, unsigned &NextIndex) { + StringRef UpperRef(Upper); + if 
(!UpperRef.starts_with(Prefix)) + return; + StringRef Tail = UpperRef.drop_front(Prefix.size()); + if (!Tail.consume_front("#")) + return; + unsigned Index = 0; + if (Tail.getAsInteger(10, Index) || Index == 0) + return; + NextIndex = std::max(NextIndex, Index + 1); + }; + bumpCounter("VT", PipeState.NextVt); + bumpCounter("VU", PipeState.NextVu); + bumpCounter("VM", PipeState.NextVm); + bumpCounter("VN", PipeState.NextVn); + }; + + auto assignVecPipeDstCode = [&](StringRef DstPart, unsigned ParsedCode, + VecPipeCursorState &PipeState) + -> unsigned { + auto nextCode = [&](unsigned Class, unsigned &NextIndex) { + return (Class << 5) | (NextIndex++ & 0x1fu); + }; + if (auto HeadClass = getHeadQueueClass(DstPart)) { + switch (*HeadClass) { + case 4: + return nextCode(*HeadClass, PipeState.NextVt); + case 5: + return nextCode(*HeadClass, PipeState.NextVu); + case 6: + return nextCode(*HeadClass, PipeState.NextVm); + case 7: + return nextCode(*HeadClass, PipeState.NextVn); + default: + llvm_unreachable("unexpected vector head queue class"); + } + } + noteExplicitVecPipeDest(DstPart, PipeState); + return ParsedCode; + }; + auto emitVectorBodyLine = [&](MachineBasicBlock &BodyBB, StringRef RawLine, StringRef CtxName, + VecPipeCursorState &PipeState, function_ref LookupLabelSym) { StringRef Line = RawLine; if (size_t Semi = Line.find(';'); Semi != StringRef::npos) @@ -908,6 +1027,20 @@ class LinxISABlockify : public MachineFunctionPass { return; } + if (Head.equals_insensitive("b.z") || Head.equals_insensitive("b.nz")) { + std::string Label = normalizeLabel(Rest); + if (Label.empty()) + fail("missing label in b.z/b.nz"); + MCSymbol *Sym = LookupLabelSym(Label); + if (!Sym) + fail("undefined vector body label"); + const unsigned Opc = Head.equals_insensitive("b.z") + ? 
LinxISA::PSEUDO_V_B_Z + : LinxISA::PSEUDO_V_B_NZ; + BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(Opc)).addSym(Sym); + return; + } + if (Head.equals_insensitive("b.eq") || Head.equals_insensitive("b.ne") || Head.equals_insensitive("b.lt") || Head.equals_insensitive("b.ge") || Head.equals_insensitive("b.ltu") || Head.equals_insensitive("b.geu")) { @@ -957,9 +1090,26 @@ class LinxISABlockify : public MachineFunctionPass { if (!DstOp) return false; Dst = *DstOp; + Dst.Code = assignVecPipeDstCode(DstPart, DstOp->Code, PipeState); return true; }; + auto parseArrowDstCode = [&](StringRef Expr, StringRef &SrcPart, + unsigned &DstCode) -> bool { + size_t Arrow = Expr.find("->"); + if (Arrow == StringRef::npos) + return false; + SrcPart = Expr.take_front(Arrow).trim(); + StringRef DstPart = Expr.drop_front(Arrow + 2).trim(); + if (size_t Comma = DstPart.find(','); Comma != StringRef::npos) + DstPart = DstPart.take_front(Comma).trim(); + auto DstOp = parseVecRegToken(DstPart); + if (!DstOp) + return false; + DstCode = assignVecPipeDstCode(DstPart, DstOp->Code, PipeState); + return true; + }; + if (Head.equals_insensitive("c.movr")) { StringRef SrcPart; ParsedVReg Dst; @@ -995,11 +1145,12 @@ class LinxISABlockify : public MachineFunctionPass { const unsigned Opc = Head.equals_insensitive("v.add") ? LinxISA::PSEUDO_V_ADD : LinxISA::PSEUDO_V_SUB; + const unsigned SrcRType = (SrcR->SrcRType == 3u) ? 
0u : SrcR->SrcRType; BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(Opc)) .addImm(Dst.Code) .addImm(SrcL->Code) .addImm(SrcR->Code) - .addImm(SrcR->SrcRType) + .addImm(SrcRType) .addImm(SrcR->Shamt); return; } @@ -1075,8 +1226,8 @@ class LinxISABlockify : public MachineFunctionPass { if (Head.starts_with_insensitive("v.cmp.")) { StringRef SrcPart; - ParsedVReg Dst; - if (!parseArrow(Rest, SrcPart, Dst)) + unsigned DstCode = 0; + if (!parseArrowDstCode(Rest, SrcPart, DstCode)) fail("expected '->Dst' in vector compare op"); SmallVector Ops; splitCSV(SrcPart, Ops); @@ -1104,7 +1255,7 @@ class LinxISABlockify : public MachineFunctionPass { fail("unsupported vector compare op"); BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(Opc)) - .addImm(Dst.Code) + .addImm(DstCode) .addImm(SrcL->Code) .addImm(SrcR->Code); return; @@ -1113,8 +1264,8 @@ class LinxISABlockify : public MachineFunctionPass { if (Head.equals_insensitive("v.feq") || Head.equals_insensitive("v.fne") || Head.equals_insensitive("v.flt") || Head.equals_insensitive("v.fge")) { StringRef SrcPart; - ParsedVReg Dst; - if (!parseArrow(Rest, SrcPart, Dst)) + unsigned DstCode = 0; + if (!parseArrowDstCode(Rest, SrcPart, DstCode)) fail("expected '->Dst' in vector FP compare op"); SmallVector Ops; splitCSV(SrcPart, Ops); @@ -1135,7 +1286,7 @@ class LinxISABlockify : public MachineFunctionPass { else if (Head.equals_insensitive("v.fge")) Opc = LinxISA::PSEUDO_V_FGE; BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(Opc)) - .addImm(Dst.Code) + .addImm(DstCode) .addImm(SrcL->Code) .addImm(SrcR->Code); return; @@ -1182,44 +1333,93 @@ class LinxISABlockify : public MachineFunctionPass { return; } - if (Head.equals_insensitive("v.csel")) { + if (Head.equals_insensitive("v.csel") || + Head.equals_insensitive("v.psel")) { StringRef SrcPart; ParsedVReg Dst; if (!parseArrow(Rest, SrcPart, Dst)) - fail("expected '->Dst' in v.csel"); + fail("expected '->Dst' in vector select"); SmallVector Ops; splitCSV(SrcPart, Ops); + if 
(Head.equals_insensitive("v.psel")) { + if (Ops.size() != 2) + fail("expected two source operands for v.psel"); + auto SrcP = parseVecRegToken(Ops[0]); + auto SrcL = parseVecRegToken(Ops[1]); + if (!SrcP || !SrcL) + fail("failed to parse source operands for v.psel"); + BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(LinxISA::PSEUDO_V_PSEL)) + .addImm(Dst.Code) + .addImm(SrcP->Code) + .addImm(SrcL->Code); + return; + } if (Ops.size() != 3) fail("expected three source operands for v.csel"); auto SrcP = parseVecRegToken(Ops[0]); auto SrcL = parseVecRegToken(Ops[1]); auto SrcR = parseVecRegToken(Ops[2]); if (!SrcP || !SrcL || !SrcR) - fail("failed to parse source operands for v.csel"); + fail("failed to parse source operands for vector select"); + const unsigned SrcRType = (SrcR->SrcRType == 3u) ? 0u : SrcR->SrcRType; BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(LinxISA::PSEUDO_V_CSEL)) .addImm(Dst.Code) .addImm(SrcP->Code) .addImm(SrcL->Code) .addImm(SrcR->Code) - .addImm(SrcR->SrcRType); + .addImm(SrcRType); return; } - if (Head.starts_with_insensitive("v.lw.brg") || + if (Head.starts_with_insensitive("v.lb.brg") || + Head.equals_insensitive("v.lb.local") || + Head.starts_with_insensitive("v.lh.brg") || + Head.equals_insensitive("v.lh.local") || + Head.starts_with_insensitive("v.lbu.brg") || + Head.equals_insensitive("v.lbu.local") || + Head.starts_with_insensitive("v.lhu.brg") || + Head.equals_insensitive("v.lhu.local") || + Head.starts_with_insensitive("v.lw.brg") || Head.equals_insensitive("v.lw.local")) { StringRef SrcPart; ParsedVReg Dst; if (!parseArrow(Rest, SrcPart, Dst)) - fail("expected '->Dst' in v.lw"); + fail("expected '->Dst' in v.l[b|h|w]"); ParsedVReg Base, Index; - if (!parseMemTriple(SrcPart, Base, Index)) - fail("expected memory form [base, lc0<<2, idx] in v.lw"); + const bool IsSignedByte = + Head.starts_with_insensitive("v.lb.brg") || + Head.equals_insensitive("v.lb.local"); + const bool IsSignedHalf = + 
Head.starts_with_insensitive("v.lh.brg") || + Head.equals_insensitive("v.lh.local"); + const bool IsByte = + Head.starts_with_insensitive("v.lbu.brg") || + Head.equals_insensitive("v.lbu.local"); + const bool IsHalf = + Head.starts_with_insensitive("v.lhu.brg") || + Head.equals_insensitive("v.lhu.local"); + const bool IsNarrowByte = IsSignedByte || IsByte; + const bool IsNarrowHalf = IsSignedHalf || IsHalf; + const unsigned WantLaneShamt = + IsNarrowByte ? 0u : (IsNarrowHalf ? 1u : 2u); + if (!parseMemTriple(SrcPart, WantLaneShamt, Base, Index)) + fail("expected memory form [base, lc0<Dst' in v.lwi/v.ldi"); + ParsedVReg Base; + int64_t Imm = 0; + const bool IsDword = Head.starts_with_insensitive("v.ldi.u"); + const unsigned LaneShamt = IsDword ? 3u : 2u; + if (!parseMemImm(SrcPart, LaneShamt, Base, Imm)) + fail("expected memory form [base, lc0<= 2"); - const unsigned EncodedShamt = Index.Shamt - 2; + const bool IsByte = + Head.starts_with_insensitive("v.sb.brg") || + Head.equals_insensitive("v.sb.local"); + const bool IsHalf = + Head.starts_with_insensitive("v.sh.brg") || + Head.equals_insensitive("v.sh.local"); + const unsigned WantLaneShamt = IsByte ? 0u : (IsHalf ? 1u : 2u); + if (!SrcD || !parseMemTriple(MemPart, WantLaneShamt, Base, Index)) + fail("expected v.s[b|h|w] SrcD, [base, lc0<= lane shift"); + const unsigned EncodedShamt = + IsByte ? Index.Shamt + : (IsZeroIndex ? 0u : (Index.Shamt - WantLaneShamt)); const unsigned LocalBit = (Head.contains_insensitive(".local") || + Head.equals_insensitive("v.sb.local") || + Head.equals_insensitive("v.sh.local") || Head.equals_insensitive("v.sw.local")) ? 1u : 0u; - BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(LinxISA::PSEUDO_V_SW_BRG)) + const unsigned Opc = IsByte ? LinxISA::PSEUDO_V_SB_BRG + : IsHalf ? 
LinxISA::PSEUDO_V_SH_BRG + : LinxISA::PSEUDO_V_SW_BRG; + BuildMI(BodyBB, BodyBB.end(), DebugLoc(), TII.get(Opc)) .addImm(SrcD->Code) .addImm(Base.Code) .addImm(Index.Code) @@ -1258,6 +1502,35 @@ class LinxISABlockify : public MachineFunctionPass { return; } + if (Head.starts_with_insensitive("v.swi.u") || + Head.starts_with_insensitive("v.sdi.u")) { + const size_t LBr = Rest.find('['); + if (LBr == StringRef::npos) + fail("expected memory operand in v.swi/v.sdi"); + StringRef ValuePart = Rest.take_front(LBr).trim(); + if (ValuePart.ends_with(",")) + ValuePart = ValuePart.drop_back().trim(); + StringRef MemPart = Rest.drop_front(LBr).trim(); + auto SrcD = parseVecRegToken(ValuePart); + ParsedVReg Base; + int64_t Imm = 0; + const bool IsDword = Head.starts_with_insensitive("v.sdi.u"); + const unsigned LaneShamt = IsDword ? 3u : 2u; + if (!SrcD || !parseMemImm(MemPart, LaneShamt, Base, Imm)) + fail("expected v.swi/v.sdi Src, [base, lc0<Code) + .addImm(Base.Code) + .addImm(Imm) + .addImm(LocalBit) + .addImm(BrgBit); + return; + } + fail("unsupported vector body statement"); }; @@ -1324,8 +1597,10 @@ class LinxISABlockify : public MachineFunctionPass { return It->second; }; + VecPipeCursorState PipeState; for (StringRef RawLine : Lines) - emitVectorBodyLine(BodyBB, RawLine, CtxName, lookupLabelSym); + emitVectorBodyLine(BodyBB, RawLine, CtxName, PipeState, + lookupLabelSym); }; auto getOrCreateVBlockBodySym = [&]() -> MCSymbol * { @@ -1354,6 +1629,9 @@ class LinxISABlockify : public MachineFunctionPass { BodyText = MFI->getVBlockBodyAsm(); } emitVectorBodyText(*VBlockBodyBB, BodyText, "vblock body"); + BuildMI(*VBlockBodyBB, VBlockBodyBB->end(), DebugLoc(), + TII.get(TargetOpcode::EH_LABEL)) + .addSym(Ctx.getOrCreateSymbol(VBlockBodySym->getName() + ".end")); DecoupledBodyBBs.insert(VBlockBodyBB); Changed = true; @@ -1385,6 +1663,9 @@ class LinxISABlockify : public MachineFunctionPass { " v.sw.local vt#1, [to, lc0<<2, lc1<<8]\n" " C.BSTOP\n"; 
emitVectorBodyText(*VTileAddBodyBB, StringRef(kBodyAsm), "vtile add body"); + BuildMI(*VTileAddBodyBB, VTileAddBodyBB->end(), DebugLoc(), + TII.get(TargetOpcode::EH_LABEL)) + .addSym(Ctx.getOrCreateSymbol(VTileAddBodySym->getName() + ".end")); DecoupledBodyBBs.insert(VTileAddBodyBB); Changed = true; @@ -1416,6 +1697,9 @@ class LinxISABlockify : public MachineFunctionPass { " v.sw.local vt#1, [to, lc0<<2, lc1<<8]\n" " C.BSTOP\n"; emitVectorBodyText(*VTileSubBodyBB, StringRef(kBodyAsm), "vtile sub body"); + BuildMI(*VTileSubBodyBB, VTileSubBodyBB->end(), DebugLoc(), + TII.get(TargetOpcode::EH_LABEL)) + .addSym(Ctx.getOrCreateSymbol(VTileSubBodySym->getName() + ".end")); DecoupledBodyBBs.insert(VTileSubBodyBB); Changed = true; @@ -2542,6 +2826,26 @@ class LinxISABlockify : public MachineFunctionPass { report_fatal_error("Linx: vblock.launch attr_bits must fit 22 bits"); const uint32_t Attr = static_cast(AttrBits); const uint32_t AttrAQRLMask = (1u << 18) | (1u << 21); + bool EmitLocalScratch = false; + unsigned LocalScratchSizeCode = 0; + Attribute ScratchAttr = + MF.getFunction().getFnAttribute("linx-vblock-ts-bytes"); + if (ScratchAttr.isStringAttribute()) { + uint64_t ScratchBytes = 0; + if (ScratchAttr.getValueAsString().getAsInteger(10, ScratchBytes)) { + report_fatal_error( + "Linx: linx-vblock-ts-bytes must be a decimal byte count"); + } + if (ScratchBytes != 0) { + auto SizeCode = tileBytesToSizeCode(ScratchBytes); + if (!SizeCode) { + report_fatal_error( + "Linx: linx-vblock-ts-bytes must be a power-of-two byte size in [16,4096]"); + } + EmitLocalScratch = true; + LocalScratchSizeCode = *SizeCode; + } + } if ((Attr & ~AttrAQRLMask) != 0u) { report_fatal_error( "Linx: vblock.launch only supports aq/rl B.ATTR bits in canonical v0.4"); @@ -2577,6 +2881,29 @@ class LinxISABlockify : public MachineFunctionPass { emitIOR(Bind6, Bind7, Bind8); emitIOR(Bind9, Bind10, Bind11); + if (EmitLocalScratch) { + // Reserve the first two output descriptors for TO/TS so the 
body + // can use the canonical `.local` output-tile order. + BuildMI(MBB, InsertPt, DL, TII.get(LinxISA::B_IOTI_G1)) + .addImm(dstTileFieldFromHand(TileHand::T)) + .addImm(0) + .addImm(1) + .addImm(0) + .addImm(1) + .addImm(0) + .addImm(0) + .addImm(0); + BuildMI(MBB, InsertPt, DL, TII.get(LinxISA::B_IOTI_G1)) + .addImm(dstTileFieldFromHand(TileHand::U)) + .addImm(0) + .addImm(1) + .addImm(0) + .addImm(1) + .addImm(0) + .addImm(8) + .addImm(LocalScratchSizeCode); + } + emitDim(MBB, InsertPt, /*LoopNest=*/0, Dim0); if (DynDim1) emitDimReg(MBB, InsertPt, /*LoopNest=*/1, Dim1Reg); @@ -5331,8 +5658,12 @@ class LinxISABlockify : public MachineFunctionPass { if (NextMI.getOperand(0).getReg() != Dst) continue; - // If Dst is used again later, keep the ADDW. - if (hasAnyUseAfter(Dst, std::next(NextIt))) + // If Dst is used again later, or is live-out to a successor, keep + // the ADDW. The SETC immediate fold only rewrites the local compare; + // removing the defining copy would strand any cross-block uses of + // the original architectural register. 
+ if (hasAnyUseAfter(Dst, std::next(NextIt)) || + isPhysRegLiveOutOfBlock(Dst)) continue; NextMI.getOperand(0).setReg(Src); diff --git a/llvm/lib/Target/LinxISA/LinxISAFrameLowering.cpp b/llvm/lib/Target/LinxISA/LinxISAFrameLowering.cpp index c6e71dfff651f..aac0182710a11 100644 --- a/llvm/lib/Target/LinxISA/LinxISAFrameLowering.cpp +++ b/llvm/lib/Target/LinxISA/LinxISAFrameLowering.cpp @@ -38,6 +38,11 @@ static bool shouldEmitFrameMacros(const MachineFunction &MF) { return MFI.getStackSize() != 0 || !MFI.getCalleeSavedInfo().empty(); } +bool LinxISAFrameLowering::hasFPImpl(const MachineFunction &MF) const { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken(); +} + static std::pair getFentryRangeEnc(const MachineFunction &MF) { const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); @@ -105,6 +110,10 @@ void LinxISAFrameLowering::determineCalleeSaves(MachineFunction &MF, // force saving `ra` so that FRET.STK has a valid restore slot. 
const MachineFrameInfo &MFI = MF.getFrameInfo(); const bool HasFrame = MFI.hasStackObjects(); + const bool HasFP = hasFPImpl(MF); + + if (HasFP) + SavedRegs.set(LinxISA::R18); const MCPhysReg *CSRs = MF.getSubtarget().getRegisterInfo()->getCalleeSavedRegs(&MF); @@ -179,6 +188,12 @@ void LinxISAFrameLowering::emitPrologue(MachineFunction &MF, .addImm(RegEndEnc) .addImm(MacroStack); emitStackAdjustChunks(*PrologueBB, PrologueBB->end(), TII, ExtraStack, true); + if (hasFPImpl(MF)) { + BuildMI(*PrologueBB, PrologueBB->end(), DebugLoc(), TII.get(LinxISA::ADDIri), + LinxISA::R18) + .addReg(LinxISA::R1) + .addImm(0); + } } void LinxISAFrameLowering::emitEpilogue(MachineFunction &MF, @@ -240,6 +255,12 @@ void LinxISAFrameLowering::emitEpilogue(MachineFunction &MF, MBB.addSuccessor(FExitBB); FExitBB->addSuccessor(TailBB); + if (hasFPImpl(MF)) { + BuildMI(*FExitBB, FExitBB->end(), DebugLoc(), TII.get(LinxISA::ADDIri), + LinxISA::R1) + .addReg(LinxISA::R18) + .addImm(0); + } emitStackAdjustChunks(*FExitBB, FExitBB->end(), TII, ExtraStack, false); BuildMI(*FExitBB, FExitBB->end(), DebugLoc(), TII.get(LinxISA::FEXIT)) .addImm(RegBeginEnc) @@ -291,6 +312,12 @@ void LinxISAFrameLowering::emitEpilogue(MachineFunction &MF, MF.insert(std::next(MBB.getIterator()), EpilogueBB); MBB.addSuccessor(EpilogueBB); + if (hasFPImpl(MF)) { + BuildMI(*EpilogueBB, EpilogueBB->end(), DebugLoc(), TII.get(LinxISA::ADDIri), + LinxISA::R1) + .addReg(LinxISA::R18) + .addImm(0); + } emitStackAdjustChunks(*EpilogueBB, EpilogueBB->end(), TII, ExtraStack, false); MachineInstrBuilder MIB = BuildMI(*EpilogueBB, EpilogueBB->end(), DebugLoc(), diff --git a/llvm/lib/Target/LinxISA/LinxISAFrameLowering.h b/llvm/lib/Target/LinxISA/LinxISAFrameLowering.h index b67b48d72c24d..295edd3112012 100644 --- a/llvm/lib/Target/LinxISA/LinxISAFrameLowering.h +++ b/llvm/lib/Target/LinxISA/LinxISAFrameLowering.h @@ -52,7 +52,7 @@ class LinxISAFrameLowering : public TargetFrameLowering { MachineBasicBlock::iterator MI) const 
override; protected: - bool hasFPImpl(const MachineFunction &MF) const override { return false; } + bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/LinxISA/LinxISAInstrInfo.td b/llvm/lib/Target/LinxISA/LinxISAInstrInfo.td index 3dd58041d1a61..3df67b49de148 100644 --- a/llvm/lib/Target/LinxISA/LinxISAInstrInfo.td +++ b/llvm/lib/Target/LinxISA/LinxISAInstrInfo.td @@ -616,17 +616,70 @@ let hasSideEffects = 1, isBarrier = 1, Size = 4 in { (ins i64imm:$dst, i64imm:$srcP, i64imm:$srcL, i64imm:$srcR, i64imm:$srcRType), "# PSEUDO_V_CSEL", []>; + def PSEUDO_V_PSEL : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcP, i64imm:$srcL), + "# PSEUDO_V_PSEL", []>; let mayLoad = 1 in + def PSEUDO_V_LB_BRG : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcL, i64imm:$srcR, + i64imm:$shamt, i64imm:$localBit), + "# PSEUDO_V_LB_BRG", []>; + let mayLoad = 1 in + def PSEUDO_V_LH_BRG : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcL, i64imm:$srcR, + i64imm:$shamt, i64imm:$localBit), + "# PSEUDO_V_LH_BRG", []>; + let mayLoad = 1 in + def PSEUDO_V_LBU_BRG : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcL, i64imm:$srcR, + i64imm:$shamt, i64imm:$localBit), + "# PSEUDO_V_LBU_BRG", []>; + let mayLoad = 1 in + def PSEUDO_V_LHU_BRG : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcL, i64imm:$srcR, + i64imm:$shamt, i64imm:$localBit), + "# PSEUDO_V_LHU_BRG", []>; + let mayLoad = 1 in def PSEUDO_V_LW_BRG : Pseudo<(outs), (ins i64imm:$dst, i64imm:$srcL, i64imm:$srcR, i64imm:$shamt, i64imm:$localBit), "# PSEUDO_V_LW_BRG", []>; + let mayLoad = 1 in + def PSEUDO_V_LWI_U : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcL, i64imm:$simm24, + i64imm:$localBit, i64imm:$brgBit), + "# PSEUDO_V_LWI_U", []>; + let mayLoad = 1 in + def PSEUDO_V_LDI_U : Pseudo<(outs), + (ins i64imm:$dst, i64imm:$srcL, i64imm:$simm24, + i64imm:$localBit, i64imm:$brgBit), + "# PSEUDO_V_LDI_U", []>; + let mayStore = 1 in + def PSEUDO_V_SB_BRG : Pseudo<(outs), + (ins i64imm:$srcD, 
i64imm:$srcL, i64imm:$srcR, + i64imm:$shamt, i64imm:$localBit), + "# PSEUDO_V_SB_BRG", []>; + let mayStore = 1 in + def PSEUDO_V_SH_BRG : Pseudo<(outs), + (ins i64imm:$srcD, i64imm:$srcL, i64imm:$srcR, + i64imm:$shamt, i64imm:$localBit), + "# PSEUDO_V_SH_BRG", []>; let mayStore = 1 in def PSEUDO_V_SW_BRG : Pseudo<(outs), (ins i64imm:$srcD, i64imm:$srcL, i64imm:$srcR, i64imm:$shamt, i64imm:$localBit), "# PSEUDO_V_SW_BRG", []>; + let mayStore = 1 in + def PSEUDO_V_SWI_U : Pseudo<(outs), + (ins i64imm:$srcD, i64imm:$base, i64imm:$simm24, + i64imm:$localBit, i64imm:$brgBit), + "# PSEUDO_V_SWI_U", []>; + let mayStore = 1 in + def PSEUDO_V_SDI_U : Pseudo<(outs), + (ins i64imm:$srcD, i64imm:$base, i64imm:$simm24, + i64imm:$localBit, i64imm:$brgBit), + "# PSEUDO_V_SDI_U", []>; def PSEUDO_V_RDADD : Pseudo<(outs), (ins i64imm:$dst, i64imm:$srcL), "# PSEUDO_V_RDADD", []>; @@ -665,6 +718,10 @@ let hasSideEffects = 1, isBarrier = 1, Size = 4 in { def PSEUDO_V_B_GEU : Pseudo<(outs), (ins i64imm:$srcL, i64imm:$srcR, BrTarget:$target), "# PSEUDO_V_B_GEU", []>; + def PSEUDO_V_B_Z : Pseudo<(outs), (ins BrTarget:$target), + "# PSEUDO_V_B_Z", []>; + def PSEUDO_V_B_NZ : Pseudo<(outs), (ins BrTarget:$target), + "# PSEUDO_V_B_NZ", []>; } let hasSideEffects = 1, isTerminator = 1, isBarrier = 1, Size = 4 in diff --git a/llvm/lib/Target/LinxISA/LinxISAMCInstLower.cpp b/llvm/lib/Target/LinxISA/LinxISAMCInstLower.cpp index c38418e3dc830..6bef8f335bb60 100644 --- a/llvm/lib/Target/LinxISA/LinxISAMCInstLower.cpp +++ b/llvm/lib/Target/LinxISA/LinxISAMCInstLower.cpp @@ -628,6 +628,60 @@ void LinxISAMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { {"SrcRType", I(4)}}); return; } + case LinxISA::PSEUDO_V_PSEL: { + emitNamedImmFields( + getSpecOpcode("V.PSEL", /*LengthBits=*/64, /*Fields=*/5), + {{"RegDst", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"SrcRType", 3}, + {"SrcZero", 1}}); + return; + } + case LinxISA::PSEUDO_V_LB_BRG: { + emitNamedImmFields( + 
getSpecOpcode("V.LB.BRG", /*LengthBits=*/64, /*Fields=*/6), + {{"C", 0}, + {"L", I(4)}, + {"RegDst", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"shamt", I(3)}}); + return; + } + case LinxISA::PSEUDO_V_LH_BRG: { + emitNamedImmFields( + getSpecOpcode("V.LH.BRG", /*LengthBits=*/64, /*Fields=*/6), + {{"C", 0}, + {"L", I(4)}, + {"RegDst", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"shamt", I(3)}}); + return; + } + case LinxISA::PSEUDO_V_LBU_BRG: { + emitNamedImmFields( + getSpecOpcode("V.LBU.BRG", /*LengthBits=*/64, /*Fields=*/6), + {{"C", 0}, + {"L", I(4)}, + {"RegDst", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"shamt", I(3)}}); + return; + } + case LinxISA::PSEUDO_V_LHU_BRG: { + emitNamedImmFields( + getSpecOpcode("V.LHU.BRG", /*LengthBits=*/64, /*Fields=*/6), + {{"C", 0}, + {"L", I(4)}, + {"RegDst", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"shamt", I(3)}}); + return; + } case LinxISA::PSEUDO_V_LW_BRG: { emitNamedImmFields( getSpecOpcode("V.LW.BRG", /*LengthBits=*/64, /*Fields=*/6), @@ -639,6 +693,26 @@ void LinxISAMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { {"shamt", I(3)}}); return; } + case LinxISA::PSEUDO_V_LWI_U: { + const char *Mnem = I(4) ? "V.LWI.U.BRG" : "V.LWI.U"; + emitNamedImmFields(getSpecOpcode(Mnem, /*LengthBits=*/64, /*Fields=*/5), + {{"C", 0}, + {"L", I(3)}, + {"RegDst", I(0)}, + {"SrcL", I(1)}, + {"simm24", I(2)}}); + return; + } + case LinxISA::PSEUDO_V_LDI_U: { + const char *Mnem = I(4) ? "V.LDI.U.BRG" : "V.LDI.U"; + emitNamedImmFields(getSpecOpcode(Mnem, /*LengthBits=*/64, /*Fields=*/5), + {{"C", 0}, + {"L", I(3)}, + {"RegDst", I(0)}, + {"SrcL", I(1)}, + {"simm24", I(2)}}); + return; + } case LinxISA::PSEUDO_V_SW_BRG: { emitNamedImmFields( getSpecOpcode("V.SW.BRG", /*LengthBits=*/64, /*Fields=*/6), @@ -650,6 +724,48 @@ void LinxISAMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { {"shamt", I(3)}}); return; } + case LinxISA::PSEUDO_V_SWI_U: { + const char *Mnem = I(4) ? 
"V.SWI.U.BRG" : "V.SWI.U"; + emitNamedImmFields(getSpecOpcode(Mnem, /*LengthBits=*/64, /*Fields=*/5), + {{"C", 0}, + {"L", I(3)}, + {"SrcL", I(0)}, + {"SrcR", I(1)}, + {"simm24", I(2)}}); + return; + } + case LinxISA::PSEUDO_V_SDI_U: { + const char *Mnem = I(4) ? "V.SDI.U.BRG" : "V.SDI.U"; + emitNamedImmFields(getSpecOpcode(Mnem, /*LengthBits=*/64, /*Fields=*/5), + {{"C", 0}, + {"L", I(3)}, + {"SrcL", I(0)}, + {"SrcR", I(1)}, + {"simm24", I(2)}}); + return; + } + case LinxISA::PSEUDO_V_SB_BRG: { + emitNamedImmFields( + getSpecOpcode("V.SB.BRG", /*LengthBits=*/64, /*Fields=*/6), + {{"C", 0}, + {"L", I(4)}, + {"SrcD", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"shamt", I(3)}}); + return; + } + case LinxISA::PSEUDO_V_SH_BRG: { + emitNamedImmFields( + getSpecOpcode("V.SH.BRG", /*LengthBits=*/64, /*Fields=*/6), + {{"C", 0}, + {"L", I(4)}, + {"SrcD", I(0)}, + {"SrcL", I(1)}, + {"SrcR", I(2)}, + {"shamt", I(3)}}); + return; + } case LinxISA::PSEUDO_V_RDADD: { emitNamedImmFields( getSpecOpcode("V.RDADD", /*LengthBits=*/64, /*Fields=*/2), @@ -746,6 +862,16 @@ void LinxISAMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(lowerBranchTarget(2)); // simm12 (pcrel) return; } + case LinxISA::PSEUDO_V_B_Z: { + OutMI.setOpcode(getSpecOpcode("B.Z", /*LengthBits=*/32, /*Fields=*/1)); + OutMI.addOperand(lowerBranchTarget(0)); // simm12 (pcrel) + return; + } + case LinxISA::PSEUDO_V_B_NZ: { + OutMI.setOpcode(getSpecOpcode("B.NZ", /*LengthBits=*/32, /*Fields=*/1)); + OutMI.addOperand(lowerBranchTarget(0)); // simm12 (pcrel) + return; + } case LinxISA::PSEUDO_V_C_MOVR: { OutMI.setOpcode(getSpecOpcode("C.MOVR", /*LengthBits=*/16, /*Fields=*/2)); OutMI.addOperand(MCOperand::createImm(I(0))); // RegDst diff --git a/llvm/lib/Target/LinxISA/LinxISARegisterInfo.cpp b/llvm/lib/Target/LinxISA/LinxISARegisterInfo.cpp index ddb9c9cacbcda..500410bb27ff8 100644 --- a/llvm/lib/Target/LinxISA/LinxISARegisterInfo.cpp +++ 
b/llvm/lib/Target/LinxISA/LinxISARegisterInfo.cpp @@ -43,6 +43,9 @@ BitVector LinxISARegisterInfo::getReservedRegs(const MachineFunction &MF) const Reserved.set(LinxISA::R0); Reserved.set(LinxISA::R1); // sp Reserved.set(LinxISA::R10); // ra + if (MF.getSubtarget().getFrameLowering()->hasFP(MF)) { + Reserved.set(LinxISA::R18); // s8 / frame pointer + } Reserved.set(LinxISA::T1); Reserved.set(LinxISA::T2); Reserved.set(LinxISA::T3); @@ -55,6 +58,9 @@ BitVector LinxISARegisterInfo::getReservedRegs(const MachineFunction &MF) const } Register LinxISARegisterInfo::getFrameRegister(const MachineFunction &MF) const { + if (MF.getSubtarget().getFrameLowering()->hasFP(MF)) { + return LinxISA::R18; // s8 + } return LinxISA::R1; // sp } @@ -102,18 +108,19 @@ static Register scavengeScratchGPR(MachineBasicBlock::iterator II, return BaseReg; } -static void materializeSPPlusOffset(MachineInstr &MI, - MachineBasicBlock::iterator II, - const TargetInstrInfo &TII, - Register BaseReg, - int64_t OffsetBytes) { +static void materializeFramePlusOffset(MachineInstr &MI, + MachineBasicBlock::iterator II, + const TargetInstrInfo &TII, + Register BaseReg, + Register FrameReg, + int64_t OffsetBytes) { MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); // Emit a chain of 12-bit/24-bit chunks so frame index elimination keeps // working even for very large stack frames generated by libc sources. 
int64_t Remaining = OffsetBytes; - Register Src = LinxISA::R1; + Register Src = FrameReg; while (Remaining != 0) { const bool IsPos = Remaining > 0; const uint64_t Abs = @@ -133,7 +140,7 @@ static void materializeSPPlusOffset(MachineInstr &MI, if (OffsetBytes == 0) { BuildMI(MBB, II, DL, TII.get(LinxISA::ADDIri), BaseReg) - .addReg(LinxISA::R1) + .addReg(FrameReg) .addImm(0); } } @@ -146,6 +153,7 @@ bool LinxISARegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MachineFunction &MF = *MI.getParent()->getParent(); const auto &TII = *MF.getSubtarget().getInstrInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); + const Register FrameReg = getFrameRegister(MF); const int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); @@ -171,13 +179,13 @@ bool LinxISARegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, case LinxISA::SD: { if (OffsetBytes == 0) { MI.getOperand(FIOperandNum) - .ChangeToRegister(LinxISA::R1, /*isDef=*/false); + .ChangeToRegister(FrameReg, /*isDef=*/false); return false; } Register BaseReg = scavengeScratchGPR(II, RS, SPAdj, "reg-offset stack access"); - materializeSPPlusOffset(MI, II, TII, BaseReg, OffsetBytes); + materializeFramePlusOffset(MI, II, TII, BaseReg, FrameReg, OffsetBytes); MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, /*isDef=*/false); return false; @@ -193,12 +201,12 @@ bool LinxISARegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, case LinxISA::PSEUDO_TMA_TLOAD_ANY: case LinxISA::PSEUDO_TMA_TSTORE: { if (OffsetBytes == 0) { - MI.getOperand(FIOperandNum).ChangeToRegister(LinxISA::R1, /*isDef=*/false); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false); return false; } Register BaseReg = scavengeScratchGPR(II, RS, SPAdj, "tile stack access"); - materializeSPPlusOffset(MI, II, TII, BaseReg, OffsetBytes); + materializeFramePlusOffset(MI, II, TII, BaseReg, FrameReg, OffsetBytes); MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, /*isDef=*/false); return 
false; @@ -228,15 +236,15 @@ bool LinxISARegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, report_fatal_error("Linx: unsupported non-immediate frame index operand"); Register BaseReg = scavengeScratchGPR(II, RS, SPAdj, "stack access"); - materializeSPPlusOffset(MI, II, TII, BaseReg, - OffsetBytes + ExtraImm * Scale); + materializeFramePlusOffset(MI, II, TII, BaseReg, FrameReg, + OffsetBytes + ExtraImm * Scale); MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, /*isDef=*/false); MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); return false; } - MI.getOperand(FIOperandNum).ChangeToRegister(LinxISA::R1, /*isDef=*/false); + MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/false); MI.getOperand(FIOperandNum + 1).ChangeToImmediate(ScaledOff); return false; } diff --git a/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.cpp b/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.cpp index 1119815a169f3..d163fc7b380b2 100644 --- a/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.cpp +++ b/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/IntrinsicsLinx.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" @@ -51,6 +52,12 @@ enum class SIMTAutoVecMode { MParSafe, }; +enum class SIMTLayoutPolicy { + Auto, + ScalarReplay, + Grouped, +}; + cl::opt LinxSIMTAutoVec("linx-simt-autovec", cl::Hidden, cl::desc("Enable Linx SIMT auto-vectorization pass"), @@ -78,6 +85,18 @@ cl::opt "(must be power-of-two; default 32)"), cl::init(32)); +cl::opt LinxSIMTAutoVecLayout( + "linx-simt-autovec-layout", cl::Hidden, + cl::desc("Linx SIMT launch layout policy"), + cl::init(SIMTLayoutPolicy::Auto), + cl::values( + clEnumValN(SIMTLayoutPolicy::Auto, "auto", + "Prefer canonical grouped layout when safe, else replay"), + clEnumValN(SIMTLayoutPolicy::ScalarReplay, "scalar-replay", + 
"Force scalar-lane replay through LB1"), + clEnumValN(SIMTLayoutPolicy::Grouped, "grouped", + "Require canonical grouped-lane lowering"))); + static StringRef modeName(SIMTAutoVecMode Mode) { switch (Mode) { case SIMTAutoVecMode::Auto: @@ -90,6 +109,18 @@ static StringRef modeName(SIMTAutoVecMode Mode) { llvm_unreachable("invalid simt autovec mode"); } +static StringRef layoutPolicyName(SIMTLayoutPolicy Policy) { + switch (Policy) { + case SIMTLayoutPolicy::Auto: + return "auto"; + case SIMTLayoutPolicy::ScalarReplay: + return "scalar-replay"; + case SIMTLayoutPolicy::Grouped: + return "grouped"; + } + llvm_unreachable("invalid simt autovec layout policy"); +} + static std::string jsonEscape(StringRef Input) { std::string Out; Out.reserve(Input.size() + 8); @@ -126,7 +157,8 @@ static void emitRemark(StringRef FunctionName, StringRef LoopName, uint64_t GroupCount, bool ForceScalarLane, bool HasRecurrence, StringRef HeaderKind, int TouchesMemoryState, StringRef TripcountSource, - StringRef AddressModel) { + StringRef AddressModel, StringRef LayoutPolicy, + StringRef LayoutKind, StringRef CFStrategy) { if (LinxSIMTAutoVecRemarks.empty()) return; @@ -162,7 +194,10 @@ static void emitRemark(StringRef FunctionName, StringRef LoopName, << ((TouchesMemoryState != 0) ? 
"true" : "false") << ","; } OS << "\"tripcount_source\":\"" << jsonEscape(TripcountSource) << "\"," - << "\"address_model\":\"" << jsonEscape(AddressModel) << "\"" + << "\"address_model\":\"" << jsonEscape(AddressModel) << "\"," + << "\"layout_policy\":\"" << jsonEscape(LayoutPolicy) << "\"," + << "\"layout_kind\":\"" << jsonEscape(LayoutKind) << "\"," + << "\"cf_strategy\":\"" << jsonEscape(CFStrategy) << "\"" << "}\n"; } @@ -259,7 +294,18 @@ static bool hasStores(Loop *L) { return false; } +static bool hasSelectInLoop(const Loop *L) { + for (BasicBlock *BB : L->blocks()) { + for (const Instruction &I : *BB) { + if (isa(I)) + return true; + } + } + return false; +} + static bool hasInnerControlFlow(const Loop *L) { + const BasicBlock *Latch = L ? L->getLoopLatch() : nullptr; for (BasicBlock *BB : L->blocks()) { const Instruction *TI = BB->getTerminator(); if (!TI) @@ -270,8 +316,10 @@ static bool hasInnerControlFlow(const Loop *L) { const bool Succ0InLoop = L->contains(BI->getSuccessor(0)); const bool Succ1InLoop = L->contains(BI->getSuccessor(1)); // Allow the canonical loop-exit branch (one successor exits the loop). - if (Succ0InLoop != Succ1InLoop) + if (Succ0InLoop != Succ1InLoop && BB == Latch) continue; + if (Succ0InLoop != Succ1InLoop) + return true; // Both successors stay inside the loop => inner if/diamond/continue. 
return true; } @@ -281,6 +329,33 @@ static bool hasInnerControlFlow(const Loop *L) { return false; } +static bool hasStableLoopScaffold(const Loop *L) { + if (!L) + return false; + + const BasicBlock *Header = L->getHeader(); + const BasicBlock *Preheader = L->getLoopPreheader(); + const BasicBlock *Latch = L->getLoopLatch(); + if (!Header || !Preheader || !Latch) + return false; + + bool SawPreheaderPred = false; + bool SawLatchPred = false; + for (const BasicBlock *Pred : predecessors(Header)) { + if (Pred == Preheader) { + SawPreheaderPred = true; + continue; + } + if (Pred == Latch) { + SawLatchPred = true; + continue; + } + return false; + } + + return SawPreheaderPred && SawLatchPred; +} + static bool hasLoopCarriedPhi(const Loop *L, bool IsCounted) { const BasicBlock *Header = L->getHeader(); if (!Header) @@ -682,7 +757,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { (LinxSIMTAutoVecMode == SIMTAutoVecMode::MParSafe) ? "mpar" : "mseq", false, false, false, false, false, 0, 0, false, false, - "none", -1, "none", "unknown"); + "none", -1, "none", "unknown", + layoutPolicyName(LinxSIMTAutoVecLayout), "none", "none"); return Changed; } @@ -694,11 +770,11 @@ class LinxISASIMTAutoVectorize : public FunctionPass { const auto TripCountOpt = IsInnermost ? 
getConstantTripCount(SE, L) : std::nullopt; const bool IsCounted = TripCountOpt.has_value(); - const bool IsCanonical = - L->isLoopSimplifyForm() && L->getLoopPreheader() && L->getLoopLatch(); + const bool IsCanonical = hasStableLoopScaffold(L); const unsigned NumBlocks = L->getNumBlocks(); const bool IsSingleBlock = (NumBlocks == 1); const bool HasStore = hasStores(L); + const bool HasSelect = hasSelectInLoop(L); const bool HasExtraPhi = hasLoopCarriedPhi(L, IsInnermost && IsCounted); const bool HasCalls = hasUnsupportedCalls(L); const bool HasLinxTileIntrinsicCalls = hasLinxTileIntrinsicCalls(L); @@ -717,6 +793,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { int RemarkTouchesMemoryState = -1; std::string RemarkTripcountSource = "none"; std::string RemarkAddressModel = "unknown"; + std::string RemarkLayoutKind = "none"; + std::string RemarkCFStrategy = "none"; RemarkAddressModel = IsAffine ? "affine" : "mixed"; auto reject = [&](StringRef Why) { @@ -840,8 +918,36 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } } + if (ExitBlocks.empty()) + L->getExitBlocks(ExitBlocks); const bool ExitHasPhi = isa(Exit->begin()); + { + SmallPtrSet ExitChainBlocks; + for (BasicBlock *B : ExitBlocks) { + BasicBlock *Cur = B; + for (unsigned Depth = 0; Cur && Cur != Exit && Depth < 8; ++Depth) { + if (!ExitChainBlocks.insert(Cur).second) + break; + auto *BI = dyn_cast_or_null(Cur->getTerminator()); + if (!BI || BI->isConditional() || BI->getNumSuccessors() != 1) + break; + BasicBlock *Next = BI->getSuccessor(0); + if (!Next || Next == Cur) + break; + Cur = Next; + } + } + for (BasicBlock *BB : ExitChainBlocks) { + for (Instruction &I : *BB) { + if (isa(I) || I.isTerminator()) + continue; + reject("unsupported_exit_side_effects"); + return false; + } + } + } + auto *PHBr = dyn_cast(Preheader->getTerminator()); if (!PHBr || PHBr->isConditional() || PHBr->getNumSuccessors() != 1 || PHBr->getSuccessor(0) != Header) { @@ -1133,6 +1239,282 @@ class 
LinxISASIMTAutoVectorize : public FunctionPass { } } } + + bool NeedsExecMaskSaveRestore = false; + struct IfConvertibleSplitInfo { + BasicBlock *BranchBB = nullptr; + BasicBlock *TrueEntryBB = nullptr; + BasicBlock *TrueExitBB = nullptr; + BasicBlock *FalseEntryBB = nullptr; + BasicBlock *FalseExitBB = nullptr; + BasicBlock *MergeBB = nullptr; + }; + struct IfConvertibleStoreMergePlan { + BasicBlock *BranchBB = nullptr; + BasicBlock *TrueStoreBB = nullptr; + StoreInst *TrueStore = nullptr; + BasicBlock *FalseStoreBB = nullptr; + StoreInst *FalseStore = nullptr; + BasicBlock *MergeBB = nullptr; + }; + struct ReplayMaskSplitInfo { + BasicBlock *BranchBB = nullptr; + BasicBlock *TrueEntryBB = nullptr; + BasicBlock *FalseEntryBB = nullptr; + BasicBlock *MergeBB = nullptr; + }; + DenseMap IfConvertibleSplits; + DenseMap IfConvertibleStoreMerges; + DenseMap ReplayMaskSplits; + SmallPtrSet IfConvertibleRegionBlocks; + struct IfConvertibleRegionInfo { + BasicBlock *ExitBB = nullptr; + SmallVector Blocks; + }; + auto isSpeculativeValueBlock = [&](BasicBlock *SideBB, + BasicBlock *MergeBB) -> bool { + if (!SideBB || !MergeBB || SideBB == MergeBB) + return false; + auto *BI = dyn_cast(SideBB->getTerminator()); + if (!BI || BI->isConditional() || BI->getNumSuccessors() != 1 || + BI->getSuccessor(0) != MergeBB) + return false; + + for (Instruction &I : *SideBB) { + if (&I == BI) + continue; + if (isa(I)) + return false; + if (!isSafeToSpeculativelyExecute(&I)) + return false; + for (User *U : I.users()) { + auto *UI = dyn_cast(U); + if (!UI) + return false; + if (UI->getParent() == SideBB) + continue; + auto *PN = dyn_cast(UI); + if (!PN || PN->getParent() != MergeBB) + return false; + } + } + + return true; + }; + auto isPhiBridgeBlock = [&](BasicBlock *BB, BasicBlock *MergeBB) -> bool { + if (!BB || !MergeBB || BB == MergeBB) + return false; + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || BI->isConditional() || BI->getNumSuccessors() != 1 || + BI->getSuccessor(0) != 
MergeBB) + return false; + for (Instruction &I : *BB) { + if (&I == BI) + continue; + if (!isa(I)) + return false; + } + return true; + }; + auto analyzeStoreMergeSideBlock = + [&](BasicBlock *SideBB, + BasicBlock *MergeBB) -> std::optional { + if (!SideBB || !MergeBB || SideBB == MergeBB) + return std::nullopt; + auto *BI = dyn_cast(SideBB->getTerminator()); + if (!BI || BI->isConditional() || BI->getNumSuccessors() != 1 || + BI->getSuccessor(0) != MergeBB) + return std::nullopt; + + StoreInst *OnlyStore = nullptr; + for (Instruction &I : *SideBB) { + if (&I == BI) + continue; + if (isa(I)) + return std::nullopt; + if (auto *SI = dyn_cast(&I)) { + if (OnlyStore || SI->isVolatile() || SI->isAtomic()) + return std::nullopt; + OnlyStore = SI; + continue; + } + if (!isSafeToSpeculativelyExecute(&I)) + return std::nullopt; + for (User *U : I.users()) { + auto *UI = dyn_cast(U); + if (!UI) + return std::nullopt; + if (UI->getParent() == SideBB) + continue; + if (UI == OnlyStore) + continue; + auto *SI = dyn_cast(UI); + if (!SI || SI->getParent() != SideBB) + return std::nullopt; + } + } + if (!OnlyStore) + return std::nullopt; + return OnlyStore; + }; + auto isPureBranchHeader = [&](BasicBlock *BB) -> bool { + if (!BB) + return false; + auto *BI = dyn_cast(BB->getTerminator()); + if (!BI || !BI->isConditional() || BI->getNumSuccessors() != 2) + return false; + for (Instruction &I : *BB) { + if (&I == BI) + continue; + if (isa(I)) + return false; + if (!isSafeToSpeculativelyExecute(&I)) + return false; + for (User *U : I.users()) { + auto *UI = dyn_cast(U); + if (!UI) + return false; + if (UI->getParent() == BB) + continue; + if (UI == BI) + continue; + return false; + } + } + return true; + }; + std::function( + BasicBlock *, BasicBlock *, SmallPtrSetImpl &)> + analyzeIfConvertibleRegion; + analyzeIfConvertibleRegion = + [&](BasicBlock *EntryBB, BasicBlock *FinalMergeBB, + SmallPtrSetImpl &Visited) + -> std::optional { + if (!EntryBB || !FinalMergeBB || EntryBB == 
FinalMergeBB || + !L->contains(EntryBB) || !L->contains(FinalMergeBB)) + return std::nullopt; + if (!Visited.insert(EntryBB).second) + return std::nullopt; + + if (isSpeculativeValueBlock(EntryBB, FinalMergeBB)) { + IfConvertibleRegionInfo Info; + Info.ExitBB = EntryBB; + Info.Blocks.push_back(EntryBB); + return Info; + } + + if (!isPureBranchHeader(EntryBB)) + return std::nullopt; + auto *BI = cast(EntryBB->getTerminator()); + BasicBlock *S0 = BI->getSuccessor(0); + BasicBlock *S1 = BI->getSuccessor(1); + if (!S0 || !S1 || !L->contains(S0) || !L->contains(S1)) + return std::nullopt; + + for (BasicBlock *CandidateMerge : L->blocks()) { + if (!isPhiBridgeBlock(CandidateMerge, FinalMergeBB)) + continue; + SmallPtrSet LeftVisited(Visited.begin(), + Visited.end()); + auto Left = analyzeIfConvertibleRegion(S0, CandidateMerge, LeftVisited); + if (!Left) + continue; + SmallPtrSet RightVisited(Visited.begin(), + Visited.end()); + auto Right = + analyzeIfConvertibleRegion(S1, CandidateMerge, RightVisited); + if (!Right) + continue; + + SmallPtrSet Unique; + IfConvertibleRegionInfo Info; + Info.ExitBB = CandidateMerge; + auto AddBlock = [&](BasicBlock *BB) { + if (BB && Unique.insert(BB).second) + Info.Blocks.push_back(BB); + }; + AddBlock(EntryBB); + for (BasicBlock *BB : Left->Blocks) + AddBlock(BB); + for (BasicBlock *BB : Right->Blocks) + AddBlock(BB); + AddBlock(CandidateMerge); + return Info; + } + + return std::nullopt; + }; + if (HasInnerCF) { + for (BasicBlock *BB : L->blocks()) { + auto *BI = dyn_cast_or_null(BB->getTerminator()); + if (!BI || !BI->isConditional()) + continue; + if (BB == Latch) + continue; + BasicBlock *S0 = BI->getSuccessor(0); + BasicBlock *S1 = BI->getSuccessor(1); + if (S0 && S1 && L->contains(S0) && L->contains(S1)) { + auto *S0BI = dyn_cast_or_null(S0->getTerminator()); + auto *S1BI = dyn_cast_or_null(S1->getTerminator()); + BasicBlock *StoreMerge0 = + (S0BI && !S0BI->isConditional() && S0BI->getNumSuccessors() == 1) + ? 
S0BI->getSuccessor(0) + : nullptr; + BasicBlock *StoreMerge1 = + (S1BI && !S1BI->isConditional() && S1BI->getNumSuccessors() == 1) + ? S1BI->getSuccessor(0) + : nullptr; + if (StoreMerge0 && StoreMerge0 == StoreMerge1 && + L->contains(StoreMerge0)) { + auto TrueStore = analyzeStoreMergeSideBlock(S0, StoreMerge0); + auto FalseStore = analyzeStoreMergeSideBlock(S1, StoreMerge0); + if (TrueStore && FalseStore) { + const SCEV *TruePtrS = + SE.getSCEVAtScope((*TrueStore)->getPointerOperand(), L); + const SCEV *FalsePtrS = + SE.getSCEVAtScope((*FalseStore)->getPointerOperand(), L); + if (TruePtrS == FalsePtrS) { + IfConvertibleSplits[BB] = {BB, S0, S0, S1, S1, StoreMerge0}; + IfConvertibleStoreMerges[StoreMerge0] = { + BB, S0, *TrueStore, S1, *FalseStore, StoreMerge0}; + IfConvertibleRegionBlocks.insert(S0); + IfConvertibleRegionBlocks.insert(S1); + goto found_ifconvertible_split; + } + } + } + for (BasicBlock *MergeBB : L->blocks()) { + SmallPtrSet LeftVisited; + auto TrueRegion = + analyzeIfConvertibleRegion(S0, MergeBB, LeftVisited); + if (!TrueRegion) + continue; + SmallPtrSet RightVisited; + auto FalseRegion = + analyzeIfConvertibleRegion(S1, MergeBB, RightVisited); + if (!FalseRegion) + continue; + + IfConvertibleSplits[BB] = {BB, S0, TrueRegion->ExitBB, S1, + FalseRegion->ExitBB, MergeBB}; + for (BasicBlock *RB : TrueRegion->Blocks) + IfConvertibleRegionBlocks.insert(RB); + for (BasicBlock *RB : FalseRegion->Blocks) + IfConvertibleRegionBlocks.insert(RB); + goto found_ifconvertible_split; + } + if (NeedsActiveReplay && StoreMerge0 && StoreMerge0 == StoreMerge1 && + L->contains(StoreMerge0)) { + ReplayMaskSplits[BB] = {BB, S0, S1, StoreMerge0}; + goto found_ifconvertible_split; + } + NeedsExecMaskSaveRestore = true; + break; + } + found_ifconvertible_split: + continue; + } + } if (!HasConstTripCount && TripCountOpt.has_value() && TripCountOpt.value_or(0) > 0 && isUInt<63>(TripCountOpt.value_or(0))) { @@ -1180,6 +1562,7 @@ class LinxISASIMTAutoVectorize : public 
FunctionPass { uint32_t SlotElems = 1; std::string DstName; unsigned SlotBind = 0; + std::optional LocalWordBase; }; struct RecurrencePlan { @@ -1189,6 +1572,7 @@ class LinxISASIMTAutoVectorize : public FunctionPass { Type *SlotTy = nullptr; // storage type for v.lw/v.sw (must be 32-bit or f32) AllocaInst *Slot = nullptr; unsigned SlotBind = 0; + std::optional LocalWordBase; }; struct F32InductionPlan { @@ -1203,6 +1587,7 @@ class LinxISASIMTAutoVectorize : public FunctionPass { PHINode *Phi = nullptr; AllocaInst *Slot = nullptr; unsigned SlotBind = 0; + std::optional LocalWordBase; }; LLVMContext &Ctx = F.getContext(); @@ -1530,6 +1915,163 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return getUnderlyingObject(Ptr->stripPointerCasts()); }; + struct SelectBaseGEPInfo { + GEPOperator *GEP = nullptr; + SelectInst *BaseSel = nullptr; + Value *Index = nullptr; + }; + + struct SelectStorePtrInfo { + Value *Cond = nullptr; + GEPOperator *TrueGEP = nullptr; + Value *TrueBase = nullptr; + GEPOperator *FalseGEP = nullptr; + Value *FalseBase = nullptr; + }; + + auto matchGEPIndexForElemBytes = + [&](Value *Ptr, uint64_t ElemBytes) + -> std::optional> { + auto *GEP = dyn_cast_or_null(Ptr); + if (!GEP && Ptr) + GEP = dyn_cast(Ptr->stripPointerCasts()); + if (!GEP || !isPowerOf2_64(ElemBytes) || ElemBytes == 0 || + ElemBytes > 8) + return std::nullopt; + + Value *Index = nullptr; + const unsigned NumIdx = GEP->getNumIndices(); + if (NumIdx == 1) { + Index = GEP->getOperand(1); + } else if (NumIdx == 2) { + auto *Z = dyn_cast(GEP->getOperand(1)); + if (!Z || !Z->isZero()) + return std::nullopt; + Index = GEP->getOperand(2); + } else { + return std::nullopt; + } + + Type *ElemTy = GEP->getResultElementType(); + if (!ElemTy) + return std::nullopt; + const DataLayout &DL = F.getParent()->getDataLayout(); + const uint64_t GEPBytes = DL.getTypeStoreSize(ElemTy); + if (GEPBytes == ElemBytes) + return std::make_pair(Index, false); + if (GEPBytes == 1 && ElemBytes > 1) + 
return std::make_pair(Index, true); + return std::nullopt; + }; + + auto matchDirectSelectBaseGEP = + [&](Value *Ptr, uint64_t ElemBytes) -> std::optional { + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return std::nullopt; + auto *GEP = dyn_cast_or_null(Ptr); + if (!GEP) + return std::nullopt; + if (!matchGEPIndexForElemBytes(GEP, ElemBytes)) + return std::nullopt; + auto *BaseSel = dyn_cast(GEP->getPointerOperand()); + if (!BaseSel || !BaseSel->getType()->isPointerTy()) + return std::nullopt; + Value *TrueBase = BaseSel->getTrueValue()->stripPointerCasts(); + Value *FalseBase = BaseSel->getFalseValue()->stripPointerCasts(); + if (!L->isLoopInvariant(TrueBase) || !L->isLoopInvariant(FalseBase)) + return std::nullopt; + return SelectStorePtrInfo{ + BaseSel->getCondition(), + cast(GEP), + TrueBase, + cast(GEP), + FalseBase, + }; + }; + + auto matchInvariantBaseGEP = + [&](Value *Ptr, uint64_t ElemBytes) + -> std::optional> { + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return std::nullopt; + auto *GEP = dyn_cast_or_null(Ptr); + if (!GEP && Ptr) + GEP = dyn_cast(Ptr->stripPointerCasts()); + if (!GEP) + return std::nullopt; + if (!matchGEPIndexForElemBytes(GEP, ElemBytes)) + return std::nullopt; + Value *BasePtr = GEP->getPointerOperand()->stripPointerCasts(); + if (!L->isLoopInvariant(BasePtr)) + return std::nullopt; + return std::make_pair(GEP, BasePtr); + }; + + auto matchSelectBaseGEP = + [&](Value *Ptr, uint64_t ElemBytes) -> std::optional { + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return std::nullopt; + auto *GEP = dyn_cast_or_null(Ptr); + if (!GEP && Ptr) + GEP = dyn_cast(Ptr->stripPointerCasts()); + if (!GEP) + return std::nullopt; + + auto IndexInfo = matchGEPIndexForElemBytes(GEP, ElemBytes); + if (!IndexInfo) + return std::nullopt; + Value *Index = IndexInfo->first; + + Value *BasePtr = GEP->getPointerOperand(); + auto *BaseSel = dyn_cast(BasePtr); + if (!BaseSel) + BaseSel = 
dyn_cast(BasePtr->stripPointerCasts()); + if (!BaseSel || !BaseSel->getType()->isPointerTy()) + return std::nullopt; + Value *TrueBase = BaseSel->getTrueValue()->stripPointerCasts(); + Value *FalseBase = BaseSel->getFalseValue()->stripPointerCasts(); + if (!L->isLoopInvariant(TrueBase) || !L->isLoopInvariant(FalseBase)) + return std::nullopt; + if (!Index || !Index->getType()->isIntegerTy() || + Index->getType()->getScalarSizeInBits() > 64) + return std::nullopt; + return SelectBaseGEPInfo{GEP, BaseSel, Index}; + }; + + auto matchSelectStorePtr = + [&](Value *Ptr, uint64_t ElemBytes) -> std::optional { + if (auto Direct = matchDirectSelectBaseGEP(Ptr, ElemBytes)) + return Direct; + if (auto BaseSelect = matchSelectBaseGEP(Ptr, ElemBytes)) { + return SelectStorePtrInfo{ + BaseSelect->BaseSel->getCondition(), + BaseSelect->GEP, + BaseSelect->BaseSel->getTrueValue()->stripPointerCasts(), + BaseSelect->GEP, + BaseSelect->BaseSel->getFalseValue()->stripPointerCasts(), + }; + } + + auto *Sel = dyn_cast_or_null(Ptr); + if (!Sel && Ptr) + Sel = dyn_cast(Ptr->stripPointerCasts()); + if (!Sel || !Sel->getType()->isPointerTy()) + return std::nullopt; + auto TrueMatch = matchInvariantBaseGEP( + Sel->getTrueValue()->stripPointerCasts(), ElemBytes); + auto FalseMatch = matchInvariantBaseGEP( + Sel->getFalseValue()->stripPointerCasts(), ElemBytes); + if (!TrueMatch || !FalseMatch) + return std::nullopt; + return SelectStorePtrInfo{ + Sel->getCondition(), + TrueMatch->first, + TrueMatch->second, + FalseMatch->first, + FalseMatch->second, + }; + }; + DenseMap StoreObjByInst; SmallPtrSet StoreObjects; for (StoreInst *SI : Stores) { @@ -1544,8 +2086,13 @@ class LinxISASIMTAutoVectorize : public FunctionPass { uint64_t LaneCount = HasConstTripCount ? 
ConstTripCount : 1; uint64_t GroupCount = 1; bool UseGroupedDims = false; + const SIMTLayoutPolicy LayoutPolicy = LinxSIMTAutoVecLayout; - auto isUnitStride4Ptr = [&](Value *Ptr) -> bool { + auto isUnitStridePtr = [&](Value *Ptr, uint64_t ElemBytes) -> bool { + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return false; + if (matchSelectStorePtr(Ptr, ElemBytes)) + return true; Ptr = Ptr->stripPointerCasts(); const SCEV *PointerExpr = SE.getSCEVAtScope(Ptr, L); const auto *AddRec = dyn_cast(PointerExpr); @@ -1555,7 +2102,22 @@ class LinxISASIMTAutoVectorize : public FunctionPass { dyn_cast(AddRec->getStepRecurrence(SE)); if (!StepConst) return false; - return StepConst->getAPInt().getSExtValue() == 4; + return StepConst->getAPInt().getSExtValue() == + static_cast(ElemBytes); + }; + + auto getSIMTMemElemBytes = [&](Type *Ty) -> uint64_t { + if (!Ty) + return 0; + if (Ty->isIntegerTy(1) || Ty->isIntegerTy(8)) + return 1; + if (Ty->isIntegerTy(16)) + return 2; + if (Ty->isIntegerTy(32) || Ty->isFloatTy()) + return 4; + if (Ty->isIntegerTy(64) || Ty->isDoubleTy()) + return 8; + return 0; }; auto hasIVShiftByConst = [&](uint64_t ShiftImm) -> bool { @@ -1588,11 +2150,30 @@ class LinxISASIMTAutoVectorize : public FunctionPass { // Recurrence-carrying loops are executed in scalar-lane replay mode // (LB1), so we do not require unit-stride memory for correctness. 
+ bool MemoryIsUnitStride = true; + for (StoreInst *SI : Stores) { + Type *StoreTy = SI->getValueOperand()->getType(); + uint64_t ElemBytes = getSIMTMemElemBytes(StoreTy); + bool StoreIsUnitStride = + isUnitStridePtr(SI->getPointerOperand(), ElemBytes); + if (!StoreIsUnitStride) { + MemoryIsUnitStride = false; + break; + } + } + if (MemoryIsUnitStride) { + for (LoadInst *LI : Loads) { + const uint64_t ElemBytes = getSIMTMemElemBytes(LI->getType()); + bool LoadIsUnitStride = + isUnitStridePtr(LI->getPointerOperand(), ElemBytes); + if (!LoadIsUnitStride) { + MemoryIsUnitStride = false; + break; + } + } + } - // Correctness-first bring-up: use a single lane and drive iteration - // replay via the group dimension (LB1). This avoids dependence and - // aliasing hazards across lanes while we close TSVC coverage. - bool ForceScalarLane = true; + bool ForceScalarLane = (LayoutPolicy != SIMTLayoutPolicy::Grouped); std::optional ForcedLaneCount; // If the loop index is explicitly shifted right (e.g. i >> 1), @@ -1602,54 +2183,98 @@ class LinxISASIMTAutoVectorize : public FunctionPass { if (HasConstTripCount && ConstTripCount > 2 && (ConstTripCount % 2) == 0 && hasIVShiftByConst(1)) { ForcedLaneCount = 2; + } + + auto selectGroupedLayout = [&](uint64_t ChosenLaneCount) { + LaneCount = ChosenLaneCount; + GroupCount = (ChosenLaneCount == 0) ? 1 : (ConstTripCount / ChosenLaneCount); + UseGroupedDims = (GroupCount > 1); ForceScalarLane = false; + RemarkLayoutKind = UseGroupedDims ? 
"grouped-strip-mined" + : "grouped-single-group"; + }; + + auto canUseGroupedLaneCount = [&](uint64_t CandidateLaneCount) { + if (!HasConstTripCount || ConstTripCount == 0 || CandidateLaneCount <= 1) + return false; + if (!isPowerOf2_64(CandidateLaneCount)) + return false; + if ((ConstTripCount % CandidateLaneCount) != 0) + return false; + if (!Stores.empty() || !Loads.empty()) + return MemoryIsUnitStride; + return true; + }; + + auto groupedRejectReason = [&]() -> const char * { + if (!HasConstTripCount) + return "grouped_layout_requires_static_tripcount"; + if (NeedsExecMaskSaveRestore) + return "grouped_layout_requires_exec_mask_save_restore"; + if ((!Stores.empty() || !Loads.empty()) && !MemoryIsUnitStride) + return "grouped_layout_requires_unit_stride_memory"; + return "grouped_layout_unavailable"; + }; + + uint64_t PreferredGroupedLaneCount = 0; + if (ForcedLaneCount && canUseGroupedLaneCount(*ForcedLaneCount)) { + PreferredGroupedLaneCount = *ForcedLaneCount; + } else if (canUseGroupedLaneCount(RequestedLaneCount) && + ConstTripCount >= RequestedLaneCount) { + PreferredGroupedLaneCount = RequestedLaneCount; + } else if (canUseGroupedLaneCount(ConstTripCount)) { + PreferredGroupedLaneCount = ConstTripCount; } - RemarkForceScalarLane = ForceScalarLane; + if (LayoutPolicy == SIMTLayoutPolicy::Grouped && + NeedsExecMaskSaveRestore) { + RemarkCFStrategy = "exec-mask-save-restore-required"; + reject(groupedRejectReason()); + return false; + } - if (ForcedLaneCount && *ForcedLaneCount > 1 && - HasConstTripCount && isPowerOf2_64(*ForcedLaneCount) && - (ConstTripCount % *ForcedLaneCount) == 0) { - LaneCount = *ForcedLaneCount; - GroupCount = ConstTripCount / *ForcedLaneCount; - UseGroupedDims = (GroupCount > 1); - } else if (!ForceScalarLane && RequestedLaneCount > 1 && - ConstTripCount > RequestedLaneCount && - isPowerOf2_64(RequestedLaneCount) && - (ConstTripCount % RequestedLaneCount) == 0) { - bool UnitStride = true; - for (StoreInst *SI : Stores) { - if 
(!isUnitStride4Ptr(SI->getPointerOperand())) { - UnitStride = false; - break; - } - } - if (UnitStride) { - for (LoadInst *LI : Loads) { - if (!isUnitStride4Ptr(LI->getPointerOperand())) { - UnitStride = false; - break; - } - } - } - if (UnitStride) { - LaneCount = RequestedLaneCount; - GroupCount = ConstTripCount / RequestedLaneCount; - UseGroupedDims = (GroupCount > 1); - } - } else if (ForceScalarLane) { + if (LayoutPolicy == SIMTLayoutPolicy::Grouped && + PreferredGroupedLaneCount == 0) { + if (NeedsActiveReplay) + RemarkCFStrategy = "active-replay"; + reject(groupedRejectReason()); + return false; + } + + if (PreferredGroupedLaneCount > 0 && + !NeedsExecMaskSaveRestore && + LayoutPolicy != SIMTLayoutPolicy::ScalarReplay) { + selectGroupedLayout(PreferredGroupedLaneCount); + } else { LaneCount = 1; GroupCount = HasConstTripCount ? ConstTripCount : 1; // When scalarizing to a single lane, iteration replay is driven by // the group dimension (LB1). Treat this as a grouped layout even // when the tripcount is only known dynamically, so indexing uses LC1. UseGroupedDims = true; + ForceScalarLane = true; + RemarkLayoutKind = "scalar-replay"; } + RemarkForceScalarLane = ForceScalarLane; RemarkLaneCount = LaneCount; RemarkGroupCount = GroupCount; + if (NeedsExecMaskSaveRestore) { + RemarkCFStrategy = "exec-mask-save-restore-required"; + } else if (NeedsActiveReplay) { + RemarkCFStrategy = "active-replay"; + } else if (!IfConvertibleSplits.empty()) { + RemarkCFStrategy = "if-converted-diamond"; + } else if (IsSingleBlock && HasSelect) { + RemarkCFStrategy = "if-converted-single-block"; + } else if (IsSingleBlock) { + RemarkCFStrategy = "straight-line-single-block"; + } else { + RemarkCFStrategy = "body-cfg"; + } // Recurrences are supported for both single-block and multi-block - // loops; state is carried via an invariant bind slot. + // loops. 
Bring-up still carries grouped recurrence state through the + // scalar mirror path until the TS-backed carrier is runtime-stable. (void)StoreObjByInst; (void)StoreObjects; @@ -1712,21 +2337,47 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return Idx; }; + static constexpr uint64_t kMaxSIMTLocalWords = 1024u; + uint64_t LocalScratchWordCount = 0; + auto reserveLocalWords = [&](uint64_t Words) -> std::optional { + if (Words == 0) + return std::nullopt; + if (Words > kMaxSIMTLocalWords - LocalScratchWordCount) + return std::nullopt; + const uint64_t Base = LocalScratchWordCount; + LocalScratchWordCount += Words; + return Base; + }; + + const bool ActiveSlotPerLane = NeedsActiveReplay && LaneCount > 1; + const uint64_t ActiveSlotElems = + ActiveSlotPerLane ? (LaneCount * GroupCount) : 1u; std::optional ActiveSlotBind; + std::optional ActiveSlotLocalWordBase; + if (ActiveSlotPerLane) + ActiveSlotLocalWordBase = reserveLocalWords(ActiveSlotElems); if (NeedsActiveReplay) { - BasicBlock &EntryBB = F.getEntryBlock(); - Instruction *EntryIP = &*EntryBB.getFirstInsertionPt(); - IRBuilder<> EB(EntryIP); - auto *ActiveSlot = - EB.CreateAlloca(I32Ty, nullptr, "linx.simt.active"); - PB.CreateStore(ConstantInt::get(I32Ty, 1), ActiveSlot); - Value *SlotI64 = PB.CreatePtrToInt(ActiveSlot, I64Ty); - auto Bind = bindI64(SlotI64); - if (!Bind) { - reject("active_bind_exhausted"); - return false; + if (!ActiveSlotLocalWordBase) { + BasicBlock &EntryBB = F.getEntryBlock(); + Instruction *EntryIP = &*EntryBB.getFirstInsertionPt(); + IRBuilder<> EB(EntryIP); + auto *ActiveSlot = EB.CreateAlloca( + I32Ty, ConstantInt::get(I32Ty, ActiveSlotElems), + "linx.simt.active"); + for (uint64_t Elem = 0; Elem < ActiveSlotElems; ++Elem) { + Value *ElemPtr = + EB.CreateInBoundsGEP(I32Ty, ActiveSlot, + ConstantInt::get(I32Ty, Elem)); + EB.CreateStore(ConstantInt::get(I32Ty, 1), ElemPtr); + } + Value *SlotI64 = PB.CreatePtrToInt(ActiveSlot, I64Ty); + auto Bind = bindI64(SlotI64); + if 
(!Bind) { + reject("active_bind_exhausted"); + return false; + } + ActiveSlotBind = *Bind; } - ActiveSlotBind = *Bind; } DenseMap, 4>> @@ -1844,6 +2495,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { Plan.Phi = Phi; Plan.Slot = Slot; Plan.SlotBind = *Bind; + if (UseGroupedDims && !NeedsActiveReplay) + Plan.LocalWordBase = reserveLocalWords(LaneCount * GroupCount); ExitPhiPlans.push_back(std::move(Plan)); // Note: incoming values for blocks keyed above cover all @@ -1967,19 +2620,23 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } Plan.SlotBind = *Bind; + if (UseGroupedDims) + Plan.LocalWordBase = + reserveLocalWords((LaneCount * GroupCount) + 1u); RecurrencePlanByPhi[Plan.Phi] = RI; RecurrencePlansByUpdate[Plan.Update].push_back(RI); } } - struct LiveOutPlan { - Instruction *Inst = nullptr; - AllocaInst *Slot = nullptr; - unsigned SlotBind = 0; - }; - SmallVector LiveOutPlans; - if (!LiveOutInsts.empty()) { - BasicBlock &EntryBB = F.getEntryBlock(); + struct LiveOutPlan { + Instruction *Inst = nullptr; + AllocaInst *Slot = nullptr; + unsigned SlotBind = 0; + std::optional LocalWordBase; + }; + SmallVector LiveOutPlans; + if (!LiveOutInsts.empty()) { + BasicBlock &EntryBB = F.getEntryBlock(); Instruction *EntryIP = &*EntryBB.getFirstInsertionPt(); IRBuilder<> EB(EntryIP); for (Instruction *I : LiveOutInsts) { @@ -1991,23 +2648,32 @@ class LinxISASIMTAutoVectorize : public FunctionPass { "linx.simt.liveout"); Value *SlotI64 = PB.CreatePtrToInt(Plan.Slot, I64Ty); auto Bind = bindI64(SlotI64); - if (!Bind) { - reject("liveout_bind_exhausted"); - return false; - } - Plan.SlotBind = *Bind; - LiveOutPlans.push_back(std::move(Plan)); - } - } + if (!Bind) { + reject("liveout_bind_exhausted"); + return false; + } + Plan.SlotBind = *Bind; + LiveOutPlans.push_back(std::move(Plan)); + } + } + + if (NeedsActiveReplay) { + for (ExitPhiPlan &Plan : ExitPhiPlans) + Plan.LocalWordBase.reset(); + } - struct AddressBinding { + struct AddressBinding 
{ unsigned BaseRi; int64_t IndexFactor; unsigned Shift; int64_t StepElems; }; - auto bindPtrStart = [&](Value *Ptr) -> std::optional { + auto bindPtrStartForElem = [&](Value *Ptr, + uint64_t ElemBytes) + -> std::optional { + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return std::nullopt; Ptr = Ptr->stripPointerCasts(); const SCEV *PointerExpr = SE.getSCEVAtScope(Ptr, L); const auto *AddRec = dyn_cast(PointerExpr); @@ -2018,9 +2684,10 @@ class LinxISASIMTAutoVectorize : public FunctionPass { if (!StepConst) return std::nullopt; int64_t StepBytes = StepConst->getAPInt().getSExtValue(); - if ((StepBytes % 4) != 0 || StepBytes == 0) + if ((StepBytes % static_cast(ElemBytes)) != 0 || + StepBytes == 0) return std::nullopt; - const int64_t StepElems = StepBytes / 4; + const int64_t StepElems = StepBytes / static_cast(ElemBytes); const SCEV *Start = AddRec->getStart(); Value *StartV = ExpandedStarts.lookup(Start); @@ -2047,7 +2714,7 @@ class LinxISASIMTAutoVectorize : public FunctionPass { /*IndexFactor=*/ 0, /*Shift=*/ 0, /*StepElems=*/ StepElems}; - const int64_t Delta = StepBytes - 4; + const int64_t Delta = StepBytes - static_cast(ElemBytes); if (Delta == 0) return Binding; @@ -2066,19 +2733,59 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return Binding; }; + auto bindPtrStart = [&](Value *Ptr) -> std::optional { + return bindPtrStartForElem(Ptr, /*ElemBytes=*/4); + }; + + struct VecPipeToken { + unsigned Class = 0; + unsigned Index = 0; + }; + + auto parseVecPipeToken = [&](StringRef Tok) + -> std::optional { + Tok = Tok.trim(); + if (Tok.size() < 4) + return std::nullopt; + unsigned Class = 0; + if (Tok.starts_with("vt#")) + Class = 0; + else if (Tok.starts_with("vu#")) + Class = 1; + else if (Tok.starts_with("vm#")) + Class = 2; + else if (Tok.starts_with("vn#")) + Class = 3; + else + return std::nullopt; + StringRef Tail = Tok.drop_front(3); + unsigned Index = 0; + if (Tail.getAsInteger(10, Index) || Index == 0) + return 
std::nullopt; + return VecPipeToken{Class, Index}; + }; + + auto formatVecPipeToken = [&](const VecPipeToken &Tok) { + static constexpr const char *kClassPrefix[] = {"vt#", "vu#", "vm#", + "vn#"}; + return std::string(kClassPrefix[Tok.Class]) + std::to_string(Tok.Index); + }; + + auto formatVecPipeHead = [&](const VecPipeToken &Tok, StringRef Suffix) { + static constexpr const char *kHeadPrefix[] = {"vt", "vu", "vm", "vn"}; + return std::string(kHeadPrefix[Tok.Class]) + Suffix.str(); + }; + unsigned NextVecReg = 0; auto allocVec = [&]() -> std::optional { static constexpr unsigned kMaxIndex = 31; - static constexpr const char *kClassPrefix[] = {"vt#", "vu#", "vm#", - "vn#"}; - static constexpr unsigned kNumClasses = - sizeof(kClassPrefix) / sizeof(kClassPrefix[0]); + static constexpr unsigned kNumClasses = 4; if (NextVecReg >= (kMaxIndex * kNumClasses)) return std::nullopt; const unsigned Class = NextVecReg / kMaxIndex; const unsigned Index = (NextVecReg % kMaxIndex) + 1u; ++NextVecReg; - return std::string(kClassPrefix[Class]) + std::to_string(Index); + return formatVecPipeToken(VecPipeToken{Class, Index}); }; unsigned NextAsmLabel = 0; @@ -2089,12 +2796,121 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return SS.str(); }; + auto isLaneCounterToken = [](StringRef Tok) { + Tok = Tok.trim(); + return Tok == "lc0" || Tok == "lc1"; + }; + + auto formatPipeDest = [&](StringRef Tok, StringRef Suffix) { + Tok = Tok.trim(); + auto PipeTok = parseVecPipeToken(Tok); + if (!PipeTok) + return Tok.str(); + return formatVecPipeHead(*PipeTok, Suffix); + }; + + auto formatIntSrc = [&](StringRef Tok) { + Tok = Tok.trim(); + if (parseVecPipeToken(Tok)) + return (Tok + ".sw").str(); + if (isLaneCounterToken(Tok)) + return (Tok + ".uh").str(); + return Tok.str(); + }; + + auto formatFloatSrc = [&](StringRef Tok) { + Tok = Tok.trim(); + if (parseVecPipeToken(Tok)) + return (Tok + ".fs").str(); + if (isLaneCounterToken(Tok)) + return (Tok + ".uh").str(); + return Tok.str(); 
+ }; + + auto formatMaskSrc = [&](StringRef Tok) { + Tok = Tok.trim(); + if (parseVecPipeToken(Tok)) + return (Tok + ".ud").str(); + if (isLaneCounterToken(Tok)) + return (Tok + ".uh").str(); + return Tok.str(); + }; + + auto formatWordDest = [&](StringRef Tok) { + return formatPipeDest(Tok, ".w"); + }; + + auto formatMaskDest = [&](StringRef Tok) { + return formatPipeDest(Tok, ".d"); + }; + + auto formatAssignedDest = [&](StringRef Tok, StringRef Suffix) { + Tok = Tok.trim(); + if (parseVecPipeToken(Tok)) + return (Tok + Suffix).str(); + return Tok.str(); + }; + + auto formatAssignedWordDest = [&](StringRef Tok) { + return formatAssignedDest(Tok, ".w"); + }; + + DenseMap ReplayMaskSaveRegByBranch; + DenseMap> + ReplayMaskRestoreRegsByMerge; + std::optional ExecMaskSaveOneBind; + if (!ReplayMaskSplits.empty()) { + ExecMaskSaveOneBind = + bindI64(ConstantInt::get(I64Ty, 1)); + if (!ExecMaskSaveOneBind) { + reject("exec_mask_bind_exhausted"); + return false; + } + for (const auto &It : ReplayMaskSplits) { + auto SaveReg = allocVec(); + if (!SaveReg) { + reject("vector_reg_exhausted"); + return false; + } + ReplayMaskSaveRegByBranch[It.first] = *SaveReg; + ReplayMaskRestoreRegsByMerge[It.second.MergeBB].push_back(*SaveReg); + } + } + + auto formatAddrExpr = [&](StringRef Expr) { + Expr = Expr.trim(); + const size_t ShiftPos = Expr.find("<<"); + StringRef Base = + ShiftPos == StringRef::npos ? Expr : Expr.substr(0, ShiftPos); + StringRef Shift = + ShiftPos == StringRef::npos ? 
StringRef() : Expr.substr(ShiftPos); + Base = Base.trim(); + std::string Out; + if (parseVecPipeToken(Base)) + Out = (Base + ".sw").str(); + else if (isLaneCounterToken(Base)) + Out = (Base + ".uh").str(); + else + Out = Base.str(); + if (!Shift.empty()) + Out += Shift.str(); + return Out; + }; + + auto formatShiftedAddr = [&](StringRef Tok, unsigned Shift) { + std::string Expr = Tok.trim().str(); + if (Shift) + Expr += "<<" + std::to_string(Shift); + return formatAddrExpr(Expr); + }; + struct PtrPhiPlan { std::string SelReg; // Small integer selector in a vector register. SmallVector BaseRis; // sel_id -> base RI bind DenseMap SelByPred; // pred -> sel_id }; DenseMap PtrPhiPlans; + DenseMap PendingRecurrenceValues; DenseMap ValOp; SmallString<512> Body; @@ -2103,38 +2919,48 @@ class LinxISASIMTAutoVectorize : public FunctionPass { std::string LinearIndexReg = "lc0"; const unsigned GroupShift = UseGroupedDims ? static_cast(Log2_64(LaneCount)) : 0u; - if (UseGroupedDims) { - auto Lin = allocVec(); - if (!Lin) { - reject("vector_reg_exhausted"); - return false; - } - OS << " v.add lc0, lc1<<" << GroupShift << ", ->" << *Lin << "\n"; - LinearIndexReg = *Lin; - } + if (UseGroupedDims) { + auto Lin = allocVec(); + if (!Lin) { + reject("vector_reg_exhausted"); + return false; + } + OS << " v.add " << formatIntSrc("lc0") << ", " + << formatAddrExpr(("lc1<<" + std::to_string(GroupShift)).c_str()) + << ", ->" << formatWordDest(*Lin) << "\n"; + LinearIndexReg = *Lin; + } DenseMap IndexRegByFactor; IndexRegByFactor[0] = "zero"; IndexRegByFactor[1] = LinearIndexReg; + const bool CacheVecIndexRegs = + !NeedsActiveReplay && !HasInnerCF && IsSingleBlock; std::optional NegLc0Reg; DenseMap GroupedIndexRegByStepElems; + DenseMap LocalSlotMemOffsetByWordBase; + std::optional GroupWordIndexReg; std::function(int64_t)> emitScaledLc0 = [&](int64_t Factor) -> std::optional { - auto Cached = IndexRegByFactor.find(Factor); - if (Cached != IndexRegByFactor.end()) - return Cached->second; - 
- if (Factor == -1) { - auto NegReg = allocVec(); - if (!NegReg) - return std::nullopt; - OS << " v.sub zero, " << LinearIndexReg << ", ->" << *NegReg - << "\n"; - IndexRegByFactor[Factor] = *NegReg; - return *NegReg; + if (CacheVecIndexRegs) { + auto Cached = IndexRegByFactor.find(Factor); + if (Cached != IndexRegByFactor.end()) + return Cached->second; } + if (Factor == -1) { + auto NegReg = allocVec(); + if (!NegReg) + return std::nullopt; + OS << " v.sub zero, " << formatIntSrc(LinearIndexReg) << ", ->" + << formatWordDest(*NegReg) << "\n"; + if (CacheVecIndexRegs) { + IndexRegByFactor[Factor] = *NegReg; + } + return *NegReg; + } + const bool IsNegative = Factor < 0; const uint64_t AbsFactor = IsNegative ? static_cast(-Factor) @@ -2151,13 +2977,14 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Prev = Pow2Regs.find(Bit - 1); if (Prev == Pow2Regs.end()) return std::nullopt; - auto Next = allocVec(); - if (!Next) - return std::nullopt; - OS << " v.add " << Prev->second << ", " << Prev->second << ", ->" - << *Next << "\n"; - Pow2Regs[Bit] = *Next; - } + auto Next = allocVec(); + if (!Next) + return std::nullopt; + OS << " v.add " << formatIntSrc(Prev->second) << ", " + << formatIntSrc(Prev->second) << ", ->" + << formatWordDest(*Next) << "\n"; + Pow2Regs[Bit] = *Next; + } std::optional AccumReg; for (unsigned Bit = 0; Bit <= HighestBit; ++Bit) { @@ -2170,57 +2997,134 @@ class LinxISASIMTAutoVectorize : public FunctionPass { AccumReg = Part->second; continue; } - auto Sum = allocVec(); - if (!Sum) - return std::nullopt; - OS << " v.add " << *AccumReg << ", " << Part->second << ", ->" - << *Sum << "\n"; - AccumReg = *Sum; - } + auto Sum = allocVec(); + if (!Sum) + return std::nullopt; + OS << " v.add " << formatIntSrc(*AccumReg) << ", " + << formatIntSrc(Part->second) << ", ->" + << formatWordDest(*Sum) << "\n"; + AccumReg = *Sum; + } if (!AccumReg) return std::nullopt; - if (IsNegative) { - auto NegReg = allocVec(); - if (!NegReg) - return 
std::nullopt; - OS << " v.sub zero, " << *AccumReg << ", ->" << *NegReg << "\n"; - AccumReg = *NegReg; - } + if (IsNegative) { + auto NegReg = allocVec(); + if (!NegReg) + return std::nullopt; + OS << " v.sub zero, " << formatIntSrc(*AccumReg) << ", ->" + << formatWordDest(*NegReg) << "\n"; + AccumReg = *NegReg; + } - IndexRegByFactor[Factor] = *AccumReg; + if (CacheVecIndexRegs) + IndexRegByFactor[Factor] = *AccumReg; return *AccumReg; }; auto emitGroupedIndexReg = [&](int64_t StepElems) -> std::optional { - auto Cached = GroupedIndexRegByStepElems.find(StepElems); - if (Cached != GroupedIndexRegByStepElems.end()) - return Cached->second; + if (CacheVecIndexRegs) { + auto Cached = GroupedIndexRegByStepElems.find(StepElems); + if (Cached != GroupedIndexRegByStepElems.end()) + return Cached->second; + } auto StepScaled = emitScaledLc0(StepElems); if (!StepScaled) return std::nullopt; - auto Idx = allocVec(); - if (!Idx) + auto Idx = allocVec(); + if (!Idx) + return std::nullopt; + OS << " v.sub " << formatIntSrc(*StepScaled) << ", " + << formatIntSrc("lc0") << ", ->" << formatWordDest(*Idx) + << "\n"; + if (CacheVecIndexRegs) { + GroupedIndexRegByStepElems[StepElems] = *Idx; + } + return *Idx; + }; + + auto emitGroupWordIndexReg = [&]() -> std::optional { + if (!UseGroupedDims) return std::nullopt; - OS << " v.sub " << *StepScaled << ", lc0, ->" << *Idx << "\n"; - GroupedIndexRegByStepElems[StepElems] = *Idx; - return *Idx; + if (CacheVecIndexRegs && GroupWordIndexReg) + return GroupWordIndexReg; + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + if (GroupShift == 0) { + OS << " v.add zero, " << formatIntSrc("lc1") << ", ->" + << formatWordDest(*Dst) << "\n"; + } else { + OS << " v.add zero, " + << formatAddrExpr(("lc1<<" + std::to_string(GroupShift)).c_str()) + << ", ->" << formatWordDest(*Dst) << "\n"; + } + if (CacheVecIndexRegs) { + GroupWordIndexReg = *Dst; + } + return *Dst; }; - auto emitNegLc0 = [&]() -> std::optional { - if (NegLc0Reg) - return 
NegLc0Reg; - auto Neg = allocVec(); - if (!Neg) + auto emitLocalSlotMemOffset = + [&](uint64_t WordBase) -> std::optional { + if (CacheVecIndexRegs) { + auto Cached = LocalSlotMemOffsetByWordBase.find(WordBase); + if (Cached != LocalSlotMemOffsetByWordBase.end()) + return Cached->second; + } + + if (WordBase == 0) { + std::string Tok = + UseGroupedDims ? ("lc1<<" + std::to_string(GroupShift + 2u)) + : "zero<<2"; + if (CacheVecIndexRegs) + LocalSlotMemOffsetByWordBase[WordBase] = Tok; + return Tok; + } + + auto ConstBind = bindI64(ConstantInt::get(I64Ty, WordBase)); + if (!ConstBind) + return std::nullopt; + + if (!UseGroupedDims) { + std::string Tok = "ri" + std::to_string(*ConstBind) + "<<2"; + if (CacheVecIndexRegs) + LocalSlotMemOffsetByWordBase[WordBase] = Tok; + return Tok; + } + + auto GroupReg = emitGroupWordIndexReg(); + if (!GroupReg) return std::nullopt; - OS << " v.sub zero, lc0, ->" << *Neg << "\n"; - NegLc0Reg = *Neg; - return NegLc0Reg; + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + OS << " v.add " << formatIntSrc(*GroupReg) << ", ri" << *ConstBind + << ", ->" << formatWordDest(*Dst) << "\n"; + std::string Tok = *Dst + "<<2"; + if (CacheVecIndexRegs) { + LocalSlotMemOffsetByWordBase[WordBase] = Tok; + } + return Tok; }; + auto emitNegLc0 = [&]() -> std::optional { + if (CacheVecIndexRegs && NegLc0Reg) + return NegLc0Reg; + auto Neg = allocVec(); + if (!Neg) + return std::nullopt; + OS << " v.sub zero, " << formatIntSrc("lc0") << ", ->" + << formatWordDest(*Neg) << "\n"; + if (CacheVecIndexRegs) { + NegLc0Reg = *Neg; + } + return *Neg; + }; + auto emitIndexDeltaFromLc0 = [&](StringRef IndexExpr) -> std::optional { StringRef Expr = IndexExpr.trim(); @@ -2231,17 +3135,20 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Delta = allocVec(); if (!Delta) return std::nullopt; - OS << " v.sub " << Expr << ", lc0, ->" << *Delta << "\n"; + OS << " v.sub " << formatIntSrc(Expr) << ", " << formatIntSrc("lc0") + << ", ->" << 
formatWordDest(*Delta) << "\n"; return *Delta; }; // Convert a byte-based induction/index expression (e.g. i8 GEP index) - // into a word index suitable for v.lw/v.sw (which operate on 32-bit - // elements and use lc0<<2 addressing). - auto emitWordIndexFromByteIndex = - [&](Value *ByteIndex) -> std::optional { + // into an element index suitable for v.l*/v.s* addressing. + auto emitElemIndexFromByteIndex = + [&](Value *ByteIndex, + uint64_t ElemBytes) -> std::optional { if (!ByteIndex) return std::nullopt; + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return std::nullopt; if (!ByteIndex->getType()->isIntegerTy() || ByteIndex->getType()->getScalarSizeInBits() > 64) { return std::nullopt; @@ -2259,29 +3166,30 @@ class LinxISASIMTAutoVectorize : public FunctionPass { const int64_t StartB = StartC->getAPInt().getSExtValue(); const int64_t StepB = StepC->getAPInt().getSExtValue(); - if ((StartB % 4) != 0 || (StepB % 4) != 0) + const int64_t ElemStride = static_cast(ElemBytes); + if ((StartB % ElemStride) != 0 || (StepB % ElemStride) != 0) return std::nullopt; - const int64_t StartW = StartB / 4; - const int64_t StepW = StepB / 4; - if (StepW == 0) + const int64_t StartElems = StartB / ElemStride; + const int64_t StepElems = StepB / ElemStride; + if (StepElems == 0) return std::nullopt; - if (StepW > 4096 || StepW < -4096) + if (StepElems > 4096 || StepElems < -4096) return std::nullopt; std::optional ScaledIndex; - if (StepW == 1) { + if (StepElems == 1) { ScaledIndex = LinearIndexReg; } else { - ScaledIndex = emitScaledLc0(StepW); + ScaledIndex = emitScaledLc0(StepElems); } if (!ScaledIndex) return std::nullopt; - if (StartW == 0) + if (StartElems == 0) return *ScaledIndex; - auto *C64 = ConstantInt::get(I64Ty, (uint64_t)StartW); + auto *C64 = ConstantInt::get(I64Ty, (uint64_t)StartElems); auto Bind = bindI64(C64); if (!Bind) return std::nullopt; @@ -2290,8 +3198,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); 
if (!Dst) return std::nullopt; - OS << " v.add " << *ScaledIndex << ", " << StartTok << ", ->" - << *Dst << "\n"; + OS << " v.add " << formatIntSrc(*ScaledIndex) << ", " << StartTok + << ", ->" << formatWordDest(*Dst) << "\n"; return *Dst; }; @@ -2357,8 +3265,9 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Mul = allocVec(); if (!Mul) return std::nullopt; - OS << " v.mul " << LinearIndexReg << ", " << *StepTok - << ", ->" << *Mul << "\n"; + OS << " v.mul " << formatIntSrc(LinearIndexReg) << ", " + << formatIntSrc(*StepTok) << ", ->" + << formatWordDest(*Mul) << "\n"; ScaledIndex = *Mul; } if (!ScaledIndex) @@ -2400,15 +3309,18 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.add " << *ScaledIndex << ", " << *StartTok << ", ->" - << *Dst << "\n"; + OS << " v.add " << formatIntSrc(*ScaledIndex) << ", " + << formatIntSrc(*StartTok) << ", ->" + << formatWordDest(*Dst) << "\n"; if (!EdgeFresh) ValOp[IV] = *Dst; return *Dst; }; - auto bindPtrGeneral = [&](Value *Ptr) + auto bindPtrGeneralForElem = [&](Value *Ptr, uint64_t ElemBytes) -> std::optional> { + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return std::nullopt; Ptr = Ptr->stripPointerCasts(); if (auto *GEP = dyn_cast(Ptr)) { auto Try = [&]() -> std::optional> { @@ -2431,11 +3343,15 @@ class LinxISASIMTAutoVectorize : public FunctionPass { } Type *ElemTy = GEP->getResultElementType(); - if (!ElemTy || !(ElemTy->isFloatTy() || ElemTy->isIntegerTy(32))) + if (!ElemTy) return std::nullopt; const DataLayout &DL = F.getParent()->getDataLayout(); - if (DL.getTypeStoreSize(ElemTy) != 4) + if (DL.getTypeStoreSize(ElemTy) != ElemBytes) return std::nullopt; + if (!(ElemTy->isIntegerTy() && + ElemTy->getScalarSizeInBits() <= 32) && + !(ElemTy->isFloatTy() && ElemBytes == 4)) + return std::nullopt; Value *BasePtr = GEP->getPointerOperand()->stripPointerCasts(); if (!L->isLoopInvariant(BasePtr)) @@ -2505,7 +3421,9 
@@ class LinxISASIMTAutoVectorize : public FunctionPass { if (StepBytesV->getType() != I64Ty) StepBytesV = PB.CreateSExtOrTrunc(StepBytesV, I64Ty); Value *StepElemsV = - PB.CreateAShr(StepBytesV, ConstantInt::get(I64Ty, 2)); + PB.CreateAShr(StepBytesV, + ConstantInt::get(I64Ty, + Log2_64(ElemBytes))); auto StepTok = emitValue(StepElemsV); if (!StepTok) return std::nullopt; @@ -2513,12 +3431,19 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Idx = allocVec(); if (!Mul || !Idx) return std::nullopt; - OS << " v.mul " << LinearIndexReg << ", " << *StepTok << ", ->" - << *Mul << "\n"; - OS << " v.sub " << *Mul << ", lc0, ->" << *Idx << "\n"; + OS << " v.mul " << formatIntSrc(LinearIndexReg) << ", " + << formatIntSrc(*StepTok) << ", ->" << formatWordDest(*Mul) + << "\n"; + OS << " v.sub " << formatIntSrc(*Mul) << ", " << formatIntSrc("lc0") + << ", ->" << formatWordDest(*Idx) << "\n"; return std::make_pair(*BaseOpt, *Idx); }; + auto bindPtrGeneral = [&](Value *Ptr) + -> std::optional> { + return bindPtrGeneralForElem(Ptr, /*ElemBytes=*/4); + }; + auto unsupportedValueReason = [&](Value *V) -> std::string { if (auto *I = dyn_cast(V)) { if (auto *CB = dyn_cast(I)) { @@ -2540,34 +3465,111 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return "unsupported_value_expr:unknown"; }; - std::function(Value *)> emitLoadFromPtr; - emitLoadFromPtr = [&](Value *Ptr) -> std::optional { - auto Address = bindPtrStart(Ptr); + auto ensureExecMaskSaveOneBind = [&]() -> std::optional { + if (!ExecMaskSaveOneBind) + ExecMaskSaveOneBind = bindI64(ConstantInt::get(I64Ty, 1)); + return ExecMaskSaveOneBind; + }; + + auto prefersSignedSIMTLoad = [&](Value *SemanticV) -> bool { + auto *LI = dyn_cast_or_null(SemanticV); + if (!LI) + return false; + Type *Ty = LI->getType(); + if (!Ty->isIntegerTy(8) && !Ty->isIntegerTy(16)) + return false; + bool SawSignedExtUse = false; + for (User *U : LI->users()) { + auto *I = dyn_cast(U); + if (!I) + return false; + switch 
(I->getOpcode()) { + case Instruction::SExt: + SawSignedExtUse = true; + continue; + case Instruction::ICmp: { + auto *Cmp = cast(I); + switch (Cmp->getPredicate()) { + case CmpInst::ICMP_SLT: + case CmpInst::ICMP_SLE: + case CmpInst::ICMP_SGT: + case CmpInst::ICMP_SGE: + return true; + case CmpInst::ICMP_EQ: + case CmpInst::ICMP_NE: + case CmpInst::ICMP_ULT: + case CmpInst::ICMP_ULE: + case CmpInst::ICMP_UGT: + case CmpInst::ICMP_UGE: + continue; + default: + return false; + } + } + case Instruction::Store: + if (cast(I)->getValueOperand() == LI) + continue; + return false; + default: + return false; + } + } + return SawSignedExtUse; + }; + + auto getSIMTLoadMnemonic = [&](Type *Ty, Value *SemanticV) -> const char * { + const bool SignedLoad = prefersSignedSIMTLoad(SemanticV); + switch (getSIMTMemElemBytes(Ty)) { + case 1: + return SignedLoad ? "v.lb.brg" : "v.lbu.brg"; + case 2: + return SignedLoad ? "v.lh.brg" : "v.lhu.brg"; + case 4: + return "v.lw.brg"; + default: + return nullptr; + } + }; + + std::function(Value *, Type *, Value *)> + emitLoadFromPtr; + emitLoadFromPtr = [&](Value *Ptr, + Type *LoadTy, + Value *SemanticV) -> std::optional { + const uint64_t ElemBytes = getSIMTMemElemBytes(LoadTy); + const char *LoadMnemonic = getSIMTLoadMnemonic(LoadTy, SemanticV); + if (!LoadMnemonic) + return std::nullopt; + const unsigned ElemShift = Log2_64(ElemBytes); + const std::string LaneExpr = + ElemShift == 0 ? "lc0" : ("lc0<<" + std::to_string(ElemShift)); + + auto Address = bindPtrStartForElem(Ptr, ElemBytes); unsigned BaseRi = 0; std::string IndexReg; unsigned IndexShift = 0; if (Address) { BaseRi = Address->BaseRi; if (UseGroupedDims) { - // In grouped mode, compute the full step in elements: - // addr = base + (lc0<<2) + ((stepElems*linearIndex - lc0)<<2) - // This preserves the (lane + group*LaneCount) addressing for - // both unit and non-unit stride patterns. 
+ // In grouped mode, emit the SrcR word-offset term consumed by the + // relative vector memory form `[SrcL, lc0<<2, SrcR<<2]`. + // The textual asm is not a scalar base+index syntax; it feeds the + // block/vector address machinery through the clock-hand/register + // pipe contract. `emitGroupedIndexReg()` materializes the word + // offset that preserves the intended linear `(lane + group*lanes)` + // iteration mapping for both unit and non-unit stride patterns. auto Idx = emitGroupedIndexReg(Address->StepElems); if (!Idx) return std::nullopt; IndexReg = *Idx; - IndexShift = 2; + IndexShift = ElemShift; } else { - // Avoid negative-stride encoding patterns that rely on mixed - // shifts (e.g. lc0<<2 + (-lc0)<<3). Use the stepElems form - // instead to keep the scale uniform at <<2. if (Address->StepElems < 0) { auto Idx = emitGroupedIndexReg(Address->StepElems); if (!Idx) return std::nullopt; IndexReg = *Idx; - IndexShift = 2; + IndexShift = ElemShift; } else { IndexShift = Address->Shift; auto IndexRegOpt = emitScaledLc0(Address->IndexFactor); @@ -2577,7 +3579,7 @@ class LinxISASIMTAutoVectorize : public FunctionPass { } } } else { - auto General = bindPtrGeneral(Ptr); + auto General = bindPtrGeneralForElem(Ptr, ElemBytes); if (!General) { Value *Stripped = Ptr ? Ptr->stripPointerCasts() : nullptr; auto *GEP = dyn_cast_or_null(Stripped); @@ -2608,14 +3610,14 @@ class LinxISASIMTAutoVectorize : public FunctionPass { const DataLayout &DL = F.getParent()->getDataLayout(); Type *ElemTy = GEP->getResultElementType(); - const uint64_t ElemBytes = + const uint64_t PtrElemBytes = ElemTy ? 
DL.getTypeStoreSize(ElemTy) : 0; std::optional IdxExpr; - if (ElemBytes == 4) { + if (PtrElemBytes == ElemBytes && + (ElemBytes == 1 || ElemBytes == 2 || + ElemBytes == 4)) { IdxExpr = emitValue(Index); - } else if (ElemBytes == 1) { - IdxExpr = emitWordIndexFromByteIndex(Index); } else { return std::nullopt; } @@ -2653,14 +3655,16 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Pred = allocVec(); if (!Pred) return std::nullopt; - OS << " v.cmp.eq " << Plan.SelReg << ", " << SelTok - << ", ->" << *Pred << "\n"; + OS << " v.cmp.eq " << formatIntSrc(Plan.SelReg) + << ", " << formatIntSrc(SelTok) << ", ->" + << formatMaskDest(*Pred) << "\n"; // Reduce ops accumulate into the destination register; seed // our scratch reduce destination before each use. // NOTE: C.MOVR cannot write to a specific t#k entry; it can // only push to `t`/`u` or write a global GPR. OS << " c.movr zero, ->t\n"; - OS << " v.rdor " << *Pred << ", ->t#1\n"; + OS << " v.rdor " << formatMaskSrc(*Pred) + << ", ->t#1\n"; OS << " b.ne t#1, zero, " << CaseLbl << "\n"; } @@ -2671,8 +3675,10 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; const unsigned Ri = Plan.BaseRis[SelId]; OS << C.first << ":\n"; - OS << " v.lw.brg [ri" << Ri << ", lc0<<2, " - << *DeltaExpr << "<<2], ->" << *Dst << "\n"; + OS << " " << LoadMnemonic << " [ri" << Ri << ", " + << formatAddrExpr(LaneExpr) << ", " + << formatShiftedAddr(*DeltaExpr, ElemShift) + << "], ->" << formatWordDest(*Dst) << "\n"; OS << " j " << EndLbl << "\n"; } OS << EndLbl << ":\n"; @@ -2693,14 +3699,15 @@ class LinxISASIMTAutoVectorize : public FunctionPass { Value *FalsePtr = PB.CreateGEP( GEP->getSourceElementType(), Sel->getFalseValue(), Index); auto Pred = emitCondition(Sel->getCondition()); - auto TV = emitLoadFromPtr(TruePtr); - auto FV = emitLoadFromPtr(FalsePtr); + auto TV = emitLoadFromPtr(TruePtr, LoadTy, SemanticV); + auto FV = emitLoadFromPtr(FalsePtr, LoadTy, SemanticV); if (Pred && TV && FV) { auto 
Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.csel " << *Pred << ", " << *TV << ", " << *FV - << ", ->" << *Dst << "\n"; + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << formatFloatSrc(*TV) << ", " << formatFloatSrc(*FV) + << ", ->" << formatWordDest(*Dst) << "\n"; return *Dst; } } @@ -2710,22 +3717,30 @@ class LinxISASIMTAutoVectorize : public FunctionPass { } BaseRi = General->first; IndexReg = General->second; - IndexShift = 2; + IndexShift = ElemShift; } auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.lw.brg [ri" << BaseRi << ", lc0<<2, " << IndexReg; - if (IndexShift) - OS << "<<" << IndexShift; - OS << "], ->" << *Dst << "\n"; - return *Dst; - }; + OS << " " << LoadMnemonic << " [ri" << BaseRi << ", " + << formatAddrExpr(LaneExpr) << ", " + << formatShiftedAddr(IndexReg, IndexShift) << "], ->" + << formatWordDest(*Dst) << "\n"; + return *Dst; + }; auto emitLoadFromInvariantPtr = - [&](Value *Ptr) -> std::optional { + [&](Value *Ptr, Type *LoadTy, + Value *SemanticV) -> std::optional { if (!Ptr) return std::nullopt; + const uint64_t ElemBytes = getSIMTMemElemBytes(LoadTy); + const char *LoadMnemonic = getSIMTLoadMnemonic(LoadTy, SemanticV); + if (!LoadMnemonic) + return std::nullopt; + const unsigned ElemShift = Log2_64(ElemBytes); + const std::string LaneExpr = + ElemShift == 0 ? 
"lc0" : ("lc0<<" + std::to_string(ElemShift)); Value *PtrI64 = PB.CreatePtrToInt(Ptr->stripPointerCasts(), I64Ty); auto Base = bindI64(PtrI64); if (!Base) @@ -2736,33 +3751,213 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.lw.brg [ri" << *Base << ", lc0<<2, " << *Neg - << "<<2], ->" << *Dst << "\n"; - return *Dst; - }; + OS << " " << LoadMnemonic << " [ri" << *Base << ", " + << formatAddrExpr(LaneExpr) << ", " + << formatShiftedAddr(*Neg, ElemShift) << "], ->" + << formatWordDest(*Dst) << "\n"; + return *Dst; + }; - auto emitLoadFromInvariantBind = - [&](unsigned BaseRi) -> std::optional { + auto emitLoadFromInvariantBindInto = [&](unsigned BaseRi, + StringRef Dst) -> bool { auto Neg = emitNegLc0(); if (!Neg) + return false; + OS << " v.lw.brg [ri" << BaseRi << ", " << formatAddrExpr("lc0<<2") + << ", " << formatShiftedAddr(*Neg, 2) << "], ->" + << formatAssignedWordDest(Dst) << "\n"; + return true; + }; + + auto emitLoadFromInvariantBind = + [&](unsigned BaseRi) -> std::optional { + auto Dst = allocVec(); + if (!Dst) return std::nullopt; + if (!emitLoadFromInvariantBindInto(BaseRi, *Dst)) + return std::nullopt; + return *Dst; + }; + + auto emitStoreToInvariantBind = [&](StringRef Src, unsigned BaseRi, + bool IsFloat = false) { + auto Neg = emitNegLc0(); + if (!Neg) + return false; + OS << " v.sw.brg " + << (IsFloat ? 
formatFloatSrc(Src) : formatIntSrc(Src)) + << ", [ri" << BaseRi << ", " << formatAddrExpr("lc0<<2") << ", " + << formatShiftedAddr(*Neg, 2) << "]\n"; + return true; + }; + + auto emitLoadFromLocalWordBaseInto = [&](uint64_t WordBase, + StringRef Dst) -> bool { + constexpr uint64_t MaxSImm24Bytes = (1ull << 23) - 1ull; + if (!UseGroupedDims && (WordBase * 4ull) <= MaxSImm24Bytes) { + OS << " v.lwi.u.local [ts, " << formatAddrExpr("lc0<<2") << ", " + << (WordBase * 4ull) << "], ->" + << formatAssignedWordDest(Dst) << "\n"; + return true; + } + auto Offset = emitLocalSlotMemOffset(WordBase); + if (!Offset) + return false; + OS << " v.lw.local [ts, " << formatAddrExpr("lc0<<2") << ", " + << formatAddrExpr(*Offset) << "], ->" + << formatAssignedWordDest(Dst) + << "\n"; + return true; + }; + + auto emitLoadFromLocalWordBase = + [&](uint64_t WordBase) -> std::optional { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.lw.brg [ri" << BaseRi << ", lc0<<2, " << *Neg - << "<<2], ->" << *Dst << "\n"; + if (!emitLoadFromLocalWordBaseInto(WordBase, *Dst)) + return std::nullopt; return *Dst; }; - auto emitStoreToInvariantBind = [&](StringRef Src, unsigned BaseRi) { + auto emitSharedLocalWordOffsetReg = + [&](uint64_t WordBase) -> std::optional { auto Neg = emitNegLc0(); if (!Neg) + return std::nullopt; + + std::optional BaseTok; + if (UseGroupedDims) { + auto GroupReg = emitGroupWordIndexReg(); + if (!GroupReg) + return std::nullopt; + BaseTok = *GroupReg; + } + + if (WordBase != 0) { + auto ConstBind = bindI64(ConstantInt::get(I64Ty, WordBase)); + if (!ConstBind) + return std::nullopt; + if (!BaseTok) { + BaseTok = ("ri" + std::to_string(*ConstBind)); + } else { + auto Sum = allocVec(); + if (!Sum) + return std::nullopt; + OS << " v.add " << formatIntSrc(*BaseTok) << ", ri" << *ConstBind + << ", ->" << formatWordDest(*Sum) << "\n"; + BaseTok = *Sum; + } + } + + if (!BaseTok) + return *Neg; + + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + OS << " 
v.add " << formatIntSrc(*BaseTok) << ", " + << formatIntSrc(*Neg) << ", ->" << formatWordDest(*Dst) << "\n"; + return *Dst; + }; + + auto emitLoadFromSharedLocalWordBaseInto = [&](uint64_t WordBase, + StringRef Dst) -> bool { + auto Offset = emitSharedLocalWordOffsetReg(WordBase); + if (!Offset) + return false; + OS << " v.lw.local [ts, " << formatAddrExpr("lc0<<2") << ", " + << formatShiftedAddr(*Offset, 2) << "], ->" + << formatAssignedWordDest(Dst) << "\n"; + return true; + }; + + auto emitLoadFromSharedLocalWordBase = + [&](uint64_t WordBase) -> std::optional { + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + if (!emitLoadFromSharedLocalWordBaseInto(WordBase, *Dst)) + return std::nullopt; + return *Dst; + }; + + auto emitStoreToLocalWordBase = [&](StringRef Src, uint64_t WordBase, + bool IsFloat = false) { + constexpr uint64_t MaxSImm24Bytes = (1ull << 23) - 1ull; + if (!UseGroupedDims && (WordBase * 4ull) <= MaxSImm24Bytes) { + OS << " v.swi.u.local " + << (IsFloat ? formatFloatSrc(Src) : formatIntSrc(Src)) + << ", [ts, " << formatAddrExpr("lc0<<2") << ", " + << (WordBase * 4ull) << "]\n"; + return true; + } + auto Offset = emitLocalSlotMemOffset(WordBase); + if (!Offset) + return false; + OS << " v.sw.local " + << (IsFloat ? formatFloatSrc(Src) : formatIntSrc(Src)) + << ", [ts, " << formatAddrExpr("lc0<<2") << ", " + << formatAddrExpr(*Offset) << "]\n"; + return true; + }; + + auto emitStoreToSharedLocalWordBase = [&](StringRef Src, uint64_t WordBase, + bool IsFloat = false) { + auto Offset = emitSharedLocalWordOffsetReg(WordBase); + if (!Offset) return false; - OS << " v.sw.brg " << Src << ", [ri" << BaseRi << ", lc0<<2, " - << *Neg << "<<2]\n"; + OS << " v.sw.local " + << (IsFloat ? 
formatFloatSrc(Src) : formatIntSrc(Src)) + << ", [ts, " << formatAddrExpr("lc0<<2") << ", " + << formatShiftedAddr(*Offset, 2) << "]\n"; return true; }; + auto emitLoadFromActiveBind = + [&](unsigned BaseRi) -> std::optional { + if (!ActiveSlotPerLane) + return emitLoadFromInvariantBind(BaseRi); + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + if (UseGroupedDims) { + auto Idx = emitGroupedIndexReg(1); + if (!Idx) + return std::nullopt; + OS << " v.lw.brg [ri" << BaseRi << ", " + << formatAddrExpr("lc0<<2") << ", " + << formatShiftedAddr(*Idx, 2) << "], ->" + << formatWordDest(*Dst) << "\n"; + } else { + OS << " v.lw.brg [ri" << BaseRi << ", " + << formatAddrExpr("lc0<<2") << ", " << formatAddrExpr("zero<<2") + << "], ->" << formatWordDest(*Dst) << "\n"; + } + return *Dst; + }; + + auto emitStoreToActiveBind = [&](StringRef Src, unsigned BaseRi, + bool IsFloat = false) { + if (!ActiveSlotPerLane) + return emitStoreToInvariantBind(Src, BaseRi, IsFloat); + if (UseGroupedDims) { + auto Idx = emitGroupedIndexReg(1); + if (!Idx) + return false; + OS << " v.sw.brg " + << (IsFloat ? formatFloatSrc(Src) : formatIntSrc(Src)) + << ", [ri" << BaseRi << ", " << formatAddrExpr("lc0<<2") + << ", " << formatShiftedAddr(*Idx, 2) << "]\n"; + } else { + OS << " v.sw.brg " + << (IsFloat ? 
formatFloatSrc(Src) : formatIntSrc(Src)) + << ", [ri" << BaseRi << ", " << formatAddrExpr("lc0<<2") + << ", " << formatAddrExpr("zero<<2") << "]\n"; + } + return true; + }; + auto canScalarizeInvariantLoad = [&](const LoadInst *LI) -> bool { if (!L->isLoopInvariant(LI->getPointerOperand())) { return false; @@ -2808,8 +4003,9 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.csel " << *Pred << ", " << *TV << ", " << *FV << ", ->" - << *Dst << "\n"; + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << formatMaskSrc(*TV) << ", " << formatMaskSrc(*FV) << ", ->" + << formatMaskDest(*Dst) << "\n"; ValOp[Cond] = *Dst; return *Dst; } @@ -2825,8 +4021,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; StringRef Mn; - std::string A = *Lhs; - std::string B = *Rhs; + std::string A = formatIntSrc(*Lhs); + std::string B = formatIntSrc(*Rhs); switch (Cmp->getPredicate()) { case CmpInst::ICMP_EQ: Mn = "v.cmp.eq"; @@ -2866,7 +4062,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; } - OS << " " << Mn << " " << A << ", " << B << ", ->" << *Dst + OS << " " << Mn << " " << A << ", " << B << ", ->" + << formatMaskDest(*Dst) << "\n"; ValOp[Cond] = *Dst; return *Dst; @@ -2886,8 +4083,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; StringRef Mn; - std::string A = *Lhs; - std::string B = *Rhs; + std::string A = formatFloatSrc(*Lhs); + std::string B = formatFloatSrc(*Rhs); switch (FCmp->getPredicate()) { case CmpInst::FCMP_OEQ: case CmpInst::FCMP_UEQ: @@ -2919,7 +4116,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; } - OS << " " << Mn << " " << A << ", " << B << ", ->" << *Dst << "\n"; + OS << " " << Mn << " " << A << ", " << B << ", ->" + << formatMaskDest(*Dst) << "\n"; ValOp[Cond] = *Dst; return *Dst; }; @@ -2983,8 +4181,9 @@ class LinxISASIMTAutoVectorize : public FunctionPass { : (Opc 
== Instruction::FSub) ? "v.fsub" : (Opc == Instruction::FMul) ? "v.fmul" : "v.fdiv"; - OS << " " << Mn << " " << *Lhs << ", " << *Rhs << ", ->" << *Dst - << "\n"; + OS << " " << Mn << " " << formatFloatSrc(*Lhs) << ", " + << formatFloatSrc(*Rhs) << ", ->" + << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; } @@ -2996,19 +4195,48 @@ class LinxISASIMTAutoVectorize : public FunctionPass { emitValue = [&](Value *V) -> std::optional { if (!V) return std::nullopt; - auto It = ValOp.find(V); - if (It != ValOp.end()) - return It->second; + + auto It = ValOp.find(V); + if (It != ValOp.end()) + return It->second; if (auto *PN = dyn_cast(V)) { auto RecIt = RecurrencePlanByPhi.find(PN); if (RecIt != RecurrencePlanByPhi.end()) { - const RecurrencePlan &Plan = RecurrencePlans[RecIt->second]; - auto Dst = emitLoadFromInvariantBind(Plan.SlotBind); - if (!Dst) - return std::nullopt; - ValOp[V] = *Dst; - return *Dst; + const unsigned RecIdx = RecIt->second; + const RecurrencePlan &Plan = RecurrencePlans[RecIdx]; + if (!Plan.LocalWordBase) { + auto Dst = emitLoadFromInvariantBind(Plan.SlotBind); + if (!Dst) + return std::nullopt; + ValOp[V] = *Dst; + return *Dst; + } + + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + const std::string BindLabel = freshAsmLabel("L_recurrence_bind"); + const std::string DoneLabel = freshAsmLabel("L_recurrence_done"); + OS << " v.cmp.eq " << formatIntSrc(LinearIndexReg) + << ", zero, ->p\n"; + OS << " b.nz " << BindLabel << "\n"; + if (!emitLoadFromLocalWordBaseInto(*Plan.LocalWordBase, *Dst)) + return std::nullopt; + OS << " j " << DoneLabel << "\n"; + OS << BindLabel << ":\n"; + auto Neg = allocVec(); + if (!Neg) + return std::nullopt; + OS << " v.sub zero, " << formatIntSrc("lc0") << ", ->" + << formatWordDest(*Neg) << "\n"; + OS << " v.lw.brg [ri" << Plan.SlotBind << ", " + << formatAddrExpr("lc0<<2") << ", " + << formatShiftedAddr(*Neg, 2) << "], ->" + << formatAssignedWordDest(*Dst) << "\n"; + OS << DoneLabel << ":\n"; + 
ValOp[V] = *Dst; + return *Dst; } auto tryEmitPhiSelectCsel = [&]() -> std::optional { @@ -3058,8 +4286,12 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.csel " << *Pred << ", " << *TV << ", " << *FV - << ", ->" << *Dst << "\n"; + const bool IsFloat = PN->getType() == Type::getFloatTy(Ctx); + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << (IsFloat ? formatFloatSrc(*TV) : formatIntSrc(*TV)) + << ", " + << (IsFloat ? formatFloatSrc(*FV) : formatIntSrc(*FV)) + << ", ->" << formatWordDest(*Dst) << "\n"; return *Dst; }; @@ -3105,11 +4337,55 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.csel " << *Pred << ", " << *TV << ", " << *FV - << ", ->" << *Dst << "\n"; + const bool IsFloat = PN->getType() == Type::getFloatTy(Ctx); + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << (IsFloat ? formatFloatSrc(*TV) : formatIntSrc(*TV)) + << ", " + << (IsFloat ? 
formatFloatSrc(*FV) : formatIntSrc(*FV)) + << ", ->" << formatWordDest(*Dst) << "\n"; return *Dst; }; + auto tryEmitPhiSelectViaRegion = [&]() -> std::optional { + for (auto &KV : IfConvertibleSplits) { + const IfConvertibleSplitInfo &Split = KV.second; + if (Split.MergeBB != PN->getParent()) + continue; + if (!Split.BranchBB || !Split.TrueExitBB || !Split.FalseExitBB) + continue; + + Value *VTrue = + PN->getIncomingValueForBlock(Split.TrueExitBB); + Value *VFalse = + PN->getIncomingValueForBlock(Split.FalseExitBB); + if (!VTrue || !VFalse || VTrue == PN || VFalse == PN) + continue; + + auto *BI = + dyn_cast(Split.BranchBB->getTerminator()); + if (!BI || !BI->isConditional() || BI->getNumSuccessors() != 2) + continue; + + auto Pred = emitCondition(BI->getCondition()); + auto TV = emitValue(VTrue); + auto FV = emitValue(VFalse); + if (!Pred || !TV || !FV) + return std::nullopt; + + auto Dst = allocVec(); + if (!Dst) + return std::nullopt; + const bool IsFloat = PN->getType() == Type::getFloatTy(Ctx); + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << (IsFloat ? formatFloatSrc(*TV) : formatIntSrc(*TV)) + << ", " + << (IsFloat ? 
formatFloatSrc(*FV) : formatIntSrc(*FV)) + << ", ->" << formatWordDest(*Dst) << "\n"; + return *Dst; + } + return std::nullopt; + }; + BasicBlock *Pred0 = PN->getIncomingBlock(0); BasicBlock *Pred1 = PN->getIncomingBlock(1); if (!L->contains(Pred0) || !L->contains(Pred1)) @@ -3123,6 +4399,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return Dst; if (auto Dst = tryEmitPhiSelectViaSplit(Pred1, Pred0)) return Dst; + if (auto Dst = tryEmitPhiSelectViaRegion()) + return Dst; return std::nullopt; }; @@ -3136,6 +4414,10 @@ class LinxISASIMTAutoVectorize : public FunctionPass { } } + auto It = ValOp.find(V); + if (It != ValOp.end()) + return It->second; + if (PN->getType() == Type::getFloatTy(Ctx)) { Value *LoopIncoming = nullptr; Value *PreIncoming = nullptr; @@ -3184,7 +4466,7 @@ class LinxISASIMTAutoVectorize : public FunctionPass { Value *AdjPtr = PB.CreateGEP( PB.getInt8Ty(), LoopLI->getPointerOperand(), ConstantInt::get(I64Ty, -StepBytes)); - auto Dst = emitLoadFromPtr(AdjPtr); + auto Dst = emitLoadFromPtr(AdjPtr, LoopLI->getType(), LoopLI); if (!Dst) { return std::nullopt; } @@ -3289,8 +4571,11 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Mul || !Dst) return std::nullopt; - OS << " v.fmul " << *A << ", " << *B << ", ->" << *Mul << "\n"; - OS << " v.fadd " << *Mul << ", " << *C << ", ->" << *Dst + OS << " v.fmul " << formatFloatSrc(*A) << ", " + << formatFloatSrc(*B) << ", ->" << formatWordDest(*Mul) + << "\n"; + OS << " v.fadd " << formatFloatSrc(*Mul) << ", " + << formatFloatSrc(*C) << ", ->" << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; @@ -3312,7 +4597,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.fabs " << *Src << ", ->" << *Dst << "\n"; + OS << " v.fabs " << formatFloatSrc(*Src) << ", ->" + << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; } @@ -3323,7 +4609,8 @@ class LinxISASIMTAutoVectorize : 
public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.fsqrt " << *Src << ", ->" << *Dst << "\n"; + OS << " v.fsqrt " << formatFloatSrc(*Src) << ", ->" + << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; } @@ -3365,7 +4652,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { ValOp[V] = Name; return Name; } - auto Dst = emitLoadFromInvariantPtr(LI->getPointerOperand()); + auto Dst = emitLoadFromInvariantPtr(LI->getPointerOperand(), + LI->getType(), LI); if (!Dst) { return std::nullopt; } @@ -3379,20 +4667,24 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; if (auto *SelPtr = dyn_cast(LI->getPointerOperand())) { auto Pred = emitCondition(SelPtr->getCondition()); - auto TV = emitLoadFromPtr(SelPtr->getTrueValue()); - auto FV = emitLoadFromPtr(SelPtr->getFalseValue()); + auto TV = emitLoadFromPtr(SelPtr->getTrueValue(), LI->getType(), + LI); + auto FV = emitLoadFromPtr(SelPtr->getFalseValue(), LI->getType(), + LI); if (!Pred || !TV || !FV) return std::nullopt; auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.csel " << *Pred << ", " << *TV << ", " << *FV - << ", ->" << *Dst << "\n"; + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << formatFloatSrc(*TV) << ", " << formatFloatSrc(*FV) + << ", ->" << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; } - auto Dst = emitLoadFromPtr(LI->getPointerOperand()); + auto Dst = emitLoadFromPtr(LI->getPointerOperand(), LI->getType(), + LI); if (!Dst) return std::nullopt; ValOp[V] = *Dst; @@ -3408,8 +4700,11 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.csel " << *Pred << ", " << *TV << ", " << *FV - << ", ->" << *Dst << "\n"; + const bool IsFloat = SI->getType() == Type::getFloatTy(Ctx); + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << (IsFloat ? formatFloatSrc(*TV) : formatIntSrc(*TV)) << ", " + << (IsFloat ? 
formatFloatSrc(*FV) : formatIntSrc(*FV)) + << ", ->" << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; } @@ -3480,7 +4775,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto Dst = allocVec(); if (!Dst) return std::nullopt; - OS << " v.fsub zero, " << *Src << ", ->" << *Dst << "\n"; + OS << " v.fsub zero, " << formatFloatSrc(*Src) << ", ->" + << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; } @@ -3503,7 +4799,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { : (Opc == Instruction::FSub) ? "v.fsub" : (Opc == Instruction::FMul) ? "v.fmul" : "v.fdiv"; - OS << " " << Mn << " " << *Lhs << ", " << *Rhs << ", ->" << *Dst + OS << " " << Mn << " " << formatFloatSrc(*Lhs) << ", " + << formatFloatSrc(*Rhs) << ", ->" << formatWordDest(*Dst) << "\n"; ValOp[V] = *Dst; return *Dst; @@ -3585,8 +4882,9 @@ class LinxISASIMTAutoVectorize : public FunctionPass { StringRef Mn = (Opc == Instruction::Add) ? "v.add" : (Opc == Instruction::Sub) ? "v.sub" : "v.mul"; - OS << " " << Mn << " " << *Lhs << ", " << *Rhs << ", ->" << *Dst - << "\n"; + OS << " " << Mn << " " << formatIntSrc(*Lhs) << ", " + << formatIntSrc(*Rhs) << ", ->" << formatWordDest(*Dst) + << "\n"; ValOp[V] = *Dst; return *Dst; } @@ -3627,24 +4925,53 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return std::nullopt; }; - auto emitStoreInst = [&](StoreInst *SI) -> bool { - if (SI->getValueOperand()->getType() != Type::getFloatTy(Ctx)) { - reject("non_float_store_value"); + auto emitStoreValueToPtr = [&](Value *Ptr, StringRef ValTok, + bool IsFloat, + uint64_t ElemBytes) -> bool { + if (!Ptr) return false; - } + if (!isPowerOf2_64(ElemBytes) || ElemBytes == 0 || ElemBytes > 8) + return false; + + const unsigned ElemShift = Log2_64(ElemBytes); + const std::string LaneExpr = + ElemShift == 0 ? "lc0" : ("lc0<<" + std::to_string(ElemShift)); + const char *StoreMnemonic = + (ElemBytes == 1) ? "v.sb.brg" + : (ElemBytes == 2) ? "v.sh.brg" + : (ElemBytes == 4) ? 
"v.sw.brg" + : "v.sd.brg"; + + auto isVectorValueToken = [&](StringRef Tok) { + Tok = Tok.trim(); + return parseVecPipeToken(Tok).has_value() || + isLaneCounterToken(Tok); + }; + + std::string StoreTok = ValTok.trim().str(); + if (!isVectorValueToken(StoreTok)) { + auto Dst = allocVec(); + if (!Dst) { + reject("vector_reg_exhausted"); + return false; + } + OS << " v.add zero, " << formatIntSrc(StoreTok) << ", ->" + << formatWordDest(*Dst) << "\n"; + StoreTok = *Dst; + } // Pointer sink PHI store: dispatch by selector written on the incoming // edge (classic if/switch sinks in TSVC). - if (auto *GEP = dyn_cast_or_null( - SI->getPointerOperand()->stripPointerCasts())) { + if (auto *GEP = + dyn_cast_or_null(Ptr->stripPointerCasts())) { Value *Base = GEP->getPointerOperand()->stripPointerCasts(); - if (auto *Phi = dyn_cast(Base)) { - auto PlanIt = PtrPhiPlans.find(Phi); - if (PlanIt != PtrPhiPlans.end()) { - Value *Index = nullptr; - const unsigned NumIdx = GEP->getNumIndices(); - if (NumIdx == 1) { - Index = GEP->getOperand(1); + if (auto *Phi = dyn_cast(Base)) { + auto PlanIt = PtrPhiPlans.find(Phi); + if (PlanIt != PtrPhiPlans.end()) { + Value *Index = nullptr; + const unsigned NumIdx = GEP->getNumIndices(); + if (NumIdx == 1) { + Index = GEP->getOperand(1); } else if (NumIdx == 2) { auto *Z = dyn_cast(GEP->getOperand(1)); if (!Z || !Z->isZero()) { @@ -3656,40 +4983,33 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("unsupported_ptr_phi_store_gep"); return false; } - if (!Index || !Index->getType()->isIntegerTy() || - Index->getType()->getScalarSizeInBits() > 64) { - reject("unsupported_ptr_phi_store_gep"); - return false; - } + if (!Index || !Index->getType()->isIntegerTy() || + Index->getType()->getScalarSizeInBits() > 64) { + reject("unsupported_ptr_phi_store_gep"); + return false; + } - const DataLayout &DL = - F.getParent()->getDataLayout(); - Type *ElemTy = GEP->getResultElementType(); - const uint64_t ElemBytes = - ElemTy ? 
DL.getTypeStoreSize(ElemTy) : 0; - std::optional IdxExpr; - if (ElemBytes == 4) { - IdxExpr = emitValue(Index); - } else if (ElemBytes == 1) { - IdxExpr = emitWordIndexFromByteIndex(Index); - } else { - reject("unsupported_ptr_phi_store_gep"); - return false; - } + const DataLayout &DL = F.getParent()->getDataLayout(); + Type *ElemTy = GEP->getResultElementType(); + const uint64_t ElemBytes = + ElemTy ? DL.getTypeStoreSize(ElemTy) : 0; + std::optional IdxExpr; + if (ElemBytes == 4) { + IdxExpr = emitValue(Index); + } else if (ElemBytes == 1) { + IdxExpr = emitElemIndexFromByteIndex(Index, /*ElemBytes=*/4); + } else { + reject("unsupported_ptr_phi_store_gep"); + return false; + } - if (!IdxExpr) { - reject("unsupported_ptr_phi_store_index"); - return false; - } - auto DeltaExpr = emitIndexDeltaFromLc0(*IdxExpr); - if (!DeltaExpr) { + if (!IdxExpr) { reject("unsupported_ptr_phi_store_index"); return false; } - - auto Val = emitValue(SI->getValueOperand()); - if (!Val) { - reject(unsupportedValueReason(SI->getValueOperand())); + auto DeltaExpr = emitIndexDeltaFromLc0(*IdxExpr); + if (!DeltaExpr) { + reject("unsupported_ptr_phi_store_index"); return false; } @@ -3699,121 +5019,395 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } - const std::string EndLbl = - freshAsmLabel("L_ptrphi_st_end"); - SmallVector, 4> CaseLabels; - CaseLabels.reserve(Plan.BaseRis.size()); - for (unsigned SelId = 0; SelId < Plan.BaseRis.size(); - ++SelId) { - std::string CaseLbl = - freshAsmLabel("L_ptrphi_st_case"); - CaseLabels.push_back(std::make_pair(CaseLbl, SelId)); + const std::string EndLbl = freshAsmLabel("L_ptrphi_st_end"); + SmallVector, 4> CaseLabels; + CaseLabels.reserve(Plan.BaseRis.size()); + for (unsigned SelId = 0; SelId < Plan.BaseRis.size(); ++SelId) { + std::string CaseLbl = freshAsmLabel("L_ptrphi_st_case"); + CaseLabels.push_back(std::make_pair(CaseLbl, SelId)); + + std::string SelTok = "zero"; + if (SelId != 0) { + auto Tok = 
emitValue(ConstantInt::get(I64Ty, SelId)); + if (!Tok) { + reject("ptr_phi_sel_emit_failed"); + return false; + } + SelTok = *Tok; + } + + OS << " v.cmp.eq " << formatIntSrc(Plan.SelReg) << ", " + << formatIntSrc(SelTok) + << ", ->p\n"; + OS << " b.nz " << CaseLbl << "\n"; + } + + OS << " j " << CaseLabels.front().first << "\n"; + for (auto &C : CaseLabels) { + const unsigned SelId = C.second; + if (SelId >= Plan.BaseRis.size()) { + reject("invalid_ptr_phi_plan"); + return false; + } + const unsigned Ri = Plan.BaseRis[SelId]; + OS << C.first << ":\n"; + OS << " " << StoreMnemonic << " " + << (IsFloat ? formatFloatSrc(StoreTok) + : formatIntSrc(StoreTok)) + << ", [ri" + << Ri << ", " << formatAddrExpr(LaneExpr) << ", " + << formatShiftedAddr(*DeltaExpr, ElemShift) << "]\n"; + OS << " j " << EndLbl << "\n"; + } + OS << EndLbl << ":\n"; + return true; + } + } + } + + auto Address = bindPtrStartForElem(Ptr, ElemBytes); + unsigned BaseRi = 0; + std::string IndexReg; + unsigned StoreShift = 0; + if (Address) { + BaseRi = Address->BaseRi; + if (UseGroupedDims) { + auto Idx = emitGroupedIndexReg(Address->StepElems); + if (!Idx) { + reject("unsupported_store_stride"); + return false; + } + IndexReg = *Idx; + StoreShift = ElemShift; + } else { + if (Address->StepElems < 0) { + auto Idx = emitGroupedIndexReg(Address->StepElems); + if (!Idx) { + reject("unsupported_store_stride"); + return false; + } + IndexReg = *Idx; + StoreShift = ElemShift; + } else { + StoreShift = Address->Shift; + auto IndexRegOpt = emitScaledLc0(Address->IndexFactor); + if (!IndexRegOpt) { + reject("unsupported_store_stride"); + return false; + } + IndexReg = *IndexRegOpt; + } + } + } else { + auto General = bindPtrGeneralForElem(Ptr, ElemBytes); + if (!General) { + reject("non_affine_store_address"); + return false; + } + BaseRi = General->first; + IndexReg = General->second; + StoreShift = ElemShift; + } + + OS << " " << StoreMnemonic << " " + << (IsFloat ? 
formatFloatSrc(StoreTok) + : formatIntSrc(StoreTok)) + << ", [ri" + << BaseRi << ", " << formatAddrExpr(LaneExpr) << ", " + << formatShiftedAddr(IndexReg, StoreShift) << "]\n"; + return true; + }; + + auto emitStoreValueToInvariantBaseGEP = + [&](GEPOperator *GEP, Value *BasePtr, StringRef StoreTok, + bool IsFloat, uint64_t ElemBytes) -> bool { + if (!GEP || !BasePtr || !isPowerOf2_64(ElemBytes) || ElemBytes == 0 || + ElemBytes > 8) + return false; + + Value *Index = nullptr; + bool ByteIndexed = false; + const unsigned NumIdx = GEP->getNumIndices(); + if (NumIdx == 1) { + Index = GEP->getOperand(1); + } else if (NumIdx == 2) { + auto *Z = dyn_cast(GEP->getOperand(1)); + if (!Z || !Z->isZero()) + return false; + Index = GEP->getOperand(2); + } else { + return false; + } + + auto IndexInfo = matchGEPIndexForElemBytes(GEP, ElemBytes); + if (!IndexInfo) + return false; + Index = IndexInfo->first; + ByteIndexed = IndexInfo->second; + + BasePtr = BasePtr->stripPointerCasts(); + if (!L->isLoopInvariant(BasePtr)) + return false; + + Value *BaseI64 = PB.CreatePtrToInt(BasePtr, I64Ty); + auto BaseOpt = bindI64(BaseI64); + if (!BaseOpt) + return false; + std::optional IdxExpr = + ByteIndexed ? emitElemIndexFromByteIndex(Index, ElemBytes) + : emitValue(Index); + if (!IdxExpr) + return false; + auto DeltaExpr = emitIndexDeltaFromLc0(*IdxExpr); + if (!DeltaExpr) + return false; + + const unsigned ElemShift = Log2_64(ElemBytes); + const std::string LaneExpr = + ElemShift == 0 ? "lc0" : ("lc0<<" + std::to_string(ElemShift)); + const char *StoreMnemonic = + (ElemBytes == 1) ? "v.sb.brg" + : (ElemBytes == 2) ? "v.sh.brg" + : (ElemBytes == 4) ? "v.sw.brg" + : "v.sd.brg"; + OS << " " << StoreMnemonic << " " + << (IsFloat ? 
formatFloatSrc(StoreTok) : formatIntSrc(StoreTok)) + << ", [ri" << *BaseOpt << ", " << formatAddrExpr(LaneExpr) << ", " + << formatShiftedAddr(*DeltaExpr, ElemShift) << "]\n"; + return true; + }; + + auto emitSelectBaseStoreSplit = [&](StoreInst *SI, bool IsFloat, + uint64_t ElemBytes) -> std::optional { + auto SelectGEP = + matchSelectStorePtr(SI->getPointerOperand(), ElemBytes); + if (!SelectGEP) + return std::nullopt; + + Value *Cond = SelectGEP->Cond; + Value *TrueV = SI->getValueOperand(); + Value *FalseV = SI->getValueOperand(); + if (auto *ValSel = dyn_cast(SI->getValueOperand())) { + if (ValSel->getCondition() != Cond) + return std::nullopt; + TrueV = ValSel->getTrueValue(); + FalseV = ValSel->getFalseValue(); + } + + auto SaveOneBind = ensureExecMaskSaveOneBind(); + if (!SaveOneBind) { + reject("exec_mask_bind_exhausted"); + return false; + } + + auto SaveReg = allocVec(); + if (!SaveReg) { + reject("vector_reg_exhausted"); + return false; + } - std::string SelTok = "zero"; - if (SelId != 0) { - auto Tok = emitValue(ConstantInt::get(I64Ty, SelId)); - if (!Tok) { - reject("ptr_phi_sel_emit_failed"); - return false; - } - SelTok = *Tok; - } + auto TrueTok = emitValue(TrueV); + auto FalseTok = emitValue(FalseV); + if (!TrueTok || !FalseTok) { + reject(unsupportedValueReason(SI->getValueOperand())); + return false; + } - auto Pred = allocVec(); - if (!Pred) { - reject("vector_reg_exhausted"); - return false; - } - OS << " v.cmp.eq " << Plan.SelReg << ", " << SelTok - << ", ->" << *Pred << "\n"; - // Reduce ops accumulate into the destination register; seed our - // scratch reduce destination before each use. 
- OS << " c.movr zero, ->t\n"; - OS << " v.rdor " << *Pred << ", ->t#1\n"; - OS << " b.ne t#1, zero, " << CaseLbl << "\n"; - } + std::string TrueLabel = freshAsmLabel("L_selptr_true"); + std::string FalseLabel = freshAsmLabel("L_selptr_false"); + std::string EndLabel = freshAsmLabel("L_selptr_end"); - OS << " j " << CaseLabels.front().first << "\n"; - for (auto &C : CaseLabels) { - const unsigned SelId = C.second; - if (SelId >= Plan.BaseRis.size()) { - reject("invalid_ptr_phi_plan"); - return false; - } - const unsigned Ri = Plan.BaseRis[SelId]; - OS << C.first << ":\n"; - OS << " v.sw.brg " << *Val << ", [ri" << Ri - << ", lc0<<2, " << *DeltaExpr << "<<2]\n"; - OS << " j " << EndLbl << "\n"; - } - OS << EndLbl << ":\n"; - return true; - } - } - } + auto emitExecMaskCompareLocal = [&](StringRef Mnemonic, StringRef Lhs, + StringRef Rhs) { + OS << " " << Mnemonic << " " << Lhs << ", " << Rhs << ", ->p\n"; + }; + auto emitBranchOnExecMaskLocal = [&](StringRef T, StringRef F) { + OS << " b.nz " << T << "\n"; + if (T != F) + OS << " j " << F << "\n"; + }; + auto emitPredicateToExecMaskLocal = + [&](Value *PredicateExpr) -> bool { + if (auto *Cmp = dyn_cast(PredicateExpr)) { + auto L = emitValue(Cmp->getOperand(0)); + auto R = emitValue(Cmp->getOperand(1)); + if (!L || !R) + return false; + if (Cmp->getOperand(0)->getType()->isIntegerTy(1) || + Cmp->getOperand(1)->getType()->isIntegerTy(1)) + return false; - auto Address = bindPtrStart(SI->getPointerOperand()); - unsigned BaseRi = 0; - std::string IndexReg; - unsigned StoreShift = 0; - if (Address) { - BaseRi = Address->BaseRi; - if (UseGroupedDims) { - auto Idx = emitGroupedIndexReg(Address->StepElems); - if (!Idx) { - reject("unsupported_store_stride"); + StringRef Mn; + std::string A = formatIntSrc(*L); + std::string B = formatIntSrc(*R); + switch (Cmp->getPredicate()) { + case CmpInst::ICMP_EQ: + Mn = "v.cmp.eq"; + break; + case CmpInst::ICMP_NE: + Mn = "v.cmp.ne"; + break; + case CmpInst::ICMP_SLT: + Mn = 
"v.cmp.lt"; + break; + case CmpInst::ICMP_SLE: + Mn = "v.cmp.ge"; + std::swap(A, B); + break; + case CmpInst::ICMP_SGT: + Mn = "v.cmp.lt"; + std::swap(A, B); + break; + case CmpInst::ICMP_SGE: + Mn = "v.cmp.ge"; + break; + case CmpInst::ICMP_ULT: + Mn = "v.cmp.ltu"; + break; + case CmpInst::ICMP_ULE: + Mn = "v.cmp.geu"; + std::swap(A, B); + break; + case CmpInst::ICMP_UGT: + Mn = "v.cmp.ltu"; + std::swap(A, B); + break; + case CmpInst::ICMP_UGE: + Mn = "v.cmp.geu"; + break; + default: return false; } - IndexReg = *Idx; - StoreShift = 2; - } else { - if (Address->StepElems < 0) { - auto Idx = emitGroupedIndexReg(Address->StepElems); - if (!Idx) { - reject("unsupported_store_stride"); - return false; - } - IndexReg = *Idx; - StoreShift = 2; - } else { - StoreShift = Address->Shift; - auto IndexRegOpt = emitScaledLc0(Address->IndexFactor); - if (!IndexRegOpt) { - reject("unsupported_store_stride"); - return false; - } - IndexReg = *IndexRegOpt; + + emitExecMaskCompareLocal(Mn, A, B); + return true; + } + + if (auto *FCmp = dyn_cast(PredicateExpr)) { + auto L = emitValue(FCmp->getOperand(0)); + auto R = emitValue(FCmp->getOperand(1)); + if (!L || !R) + return false; + + StringRef Mn; + std::string A = formatFloatSrc(*L); + std::string B = formatFloatSrc(*R); + switch (FCmp->getPredicate()) { + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: + Mn = "v.feq"; + break; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: + Mn = "v.fne"; + break; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: + Mn = "v.flt"; + break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: + Mn = "v.fge"; + std::swap(A, B); + break; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: + Mn = "v.flt"; + std::swap(A, B); + break; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: + Mn = "v.fge"; + break; + default: + return false; } + + emitExecMaskCompareLocal(Mn, A, B); + return true; } - } else { - auto General = bindPtrGeneral(SI->getPointerOperand()); - if (!General) { - 
reject("non_affine_store_address"); + + auto Pred = emitCondition(PredicateExpr); + if (!Pred) return false; - } - BaseRi = General->first; - IndexReg = General->second; - StoreShift = 2; + emitExecMaskCompareLocal("v.cmp.ne", formatMaskSrc(*Pred), "zero"); + return true; + }; + + OS << " v.psel p, ri" << *SaveOneBind << ", ->" + << formatWordDest(*SaveReg) << "\n"; + if (!emitPredicateToExecMaskLocal(Cond)) { + reject("unsupported_branch_condition"); + return false; + } + emitBranchOnExecMaskLocal(TrueLabel, FalseLabel); + + OS << TrueLabel << ":\n"; + if (!emitStoreValueToInvariantBaseGEP( + SelectGEP->TrueGEP, + SelectGEP->TrueBase, + *TrueTok, IsFloat, ElemBytes)) { + reject("non_affine_store_address"); + return false; + } + OS << " j " << EndLabel << "\n"; + + OS << FalseLabel << ":\n"; + if (!emitStoreValueToInvariantBaseGEP( + SelectGEP->FalseGEP, + SelectGEP->FalseBase, + *FalseTok, IsFloat, ElemBytes)) { + reject("non_affine_store_address"); + return false; } + OS << EndLabel << ":\n"; + OS << " v.cmp.ne " << formatIntSrc(*SaveReg) << ", zero, ->p\n"; + return true; + }; + + auto emitStoreInst = [&](StoreInst *SI) -> bool { + Type *StoreTy = SI->getValueOperand()->getType(); + const bool IsFloat = StoreTy == Type::getFloatTy(Ctx); + uint64_t ElemBytes = 0; + if (StoreTy->isIntegerTy(1) || StoreTy->isIntegerTy(8)) + ElemBytes = 1; + else if (StoreTy->isIntegerTy(16)) + ElemBytes = 2; + else if (IsFloat || StoreTy->isIntegerTy(32)) + ElemBytes = 4; + else if (StoreTy->isIntegerTy(64)) + ElemBytes = 8; + if (ElemBytes == 0) { + reject("non_float_store_value"); + return false; + } + + if (auto SelectStore = emitSelectBaseStoreSplit(SI, IsFloat, ElemBytes)) + return *SelectStore; + auto Val = emitValue(SI->getValueOperand()); if (!Val) { reject(unsupportedValueReason(SI->getValueOperand())); return false; } + if (auto *UpdateI = + dyn_cast(SI->getValueOperand())) { + auto RecIt = RecurrencePlansByUpdate.find(UpdateI); + if (RecIt != 
RecurrencePlansByUpdate.end()) { + for (unsigned RecIdx : RecIt->second) { + if (RecIdx >= RecurrencePlans.size()) { + reject("invalid_recurrence_plan"); + return false; + } + PendingRecurrenceValues[RecIdx] = *Val; + } + } + } + return emitStoreValueToPtr(SI->getPointerOperand(), *Val, IsFloat, + ElemBytes); + }; - // Preserve original per-iteration instruction order. - // v0.3 encoding rule: v.sw uses an index shift of (2+shamt). For - // contiguous stores we bind idx=zero, but the printed shift still - // must be >= 2 to satisfy the assembler's legality checks. - if (IndexReg == "zero" && StoreShift < 2) - StoreShift = 2; - OS << " v.sw.brg " << *Val << ", [ri" << BaseRi - << ", lc0<<2, " << IndexReg << "<<" << StoreShift << "]\n"; - return true; - }; - - DenseMap PendingRecurrenceValues; auto emitBodyInstructions = [&](BasicBlock *BB) -> bool { for (Instruction &I : *BB) { if (isa(I) || isa(I) || isa(I) || @@ -3882,6 +5476,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto isBodyBlock = [&](BasicBlock *BB) -> bool { if (!BB || !L->contains(BB)) return false; + if (IfConvertibleRegionBlocks.contains(BB)) + return false; // Header is always emitted first. Include the latch block as part // of the linearized body so we don't drop iteration-tail side // effects (stores/recurrence updates) that are commonly placed in @@ -3903,6 +5499,28 @@ class LinxISASIMTAutoVectorize : public FunctionPass { Nodes.push_back(BB); }; + auto forEachDiscoveredBodySucc = + [&](BasicBlock *BB, function_ref Fn) { + auto SplitIt = IfConvertibleSplits.find(BB); + if (SplitIt != IfConvertibleSplits.end()) { + Fn(SplitIt->second.MergeBB); + return; + } + auto *TI = BB ? 
BB->getTerminator() : nullptr; + if (!TI) + return; + if (auto *BI = dyn_cast(TI)) { + for (unsigned SI = 0; SI < BI->getNumSuccessors(); ++SI) + Fn(BI->getSuccessor(SI)); + return; + } + if (auto *SI = dyn_cast(TI)) { + Fn(SI->getDefaultDest()); + for (auto Case : SI->cases()) + Fn(Case.getCaseSuccessor()); + } + }; + // Discover all blocks reachable from Header within the "iteration" // CFG (excluding edges to Header/Latch). for (unsigned NI = 0; NI < Nodes.size(); ++NI) { @@ -3912,15 +5530,10 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("missing_terminator"); return false; } - if (auto *BI = dyn_cast(TI)) { - for (unsigned SI = 0; SI < BI->getNumSuccessors(); ++SI) - addNode(BI->getSuccessor(SI)); - continue; - } - if (auto *SI = dyn_cast(TI)) { - addNode(SI->getDefaultDest()); - for (auto Case : SI->cases()) - addNode(Case.getCaseSuccessor()); + if (isa(TI) || isa(TI)) { + forEachDiscoveredBodySucc(BB, [&](BasicBlock *Succ) { + addNode(Succ); + }); continue; } reject("unsupported_terminator"); @@ -3929,6 +5542,13 @@ class LinxISASIMTAutoVectorize : public FunctionPass { auto forEachBodySucc = [&](BasicBlock *BB, function_ref Fn) { + auto SplitIt = IfConvertibleSplits.find(BB); + if (SplitIt != IfConvertibleSplits.end()) { + BasicBlock *Succ = SplitIt->second.MergeBB; + if (NodeSet.count(Succ) && Succ != Header) + Fn(Succ); + return; + } auto *TI = BB ? 
BB->getTerminator() : nullptr; if (!TI) return; @@ -4173,30 +5793,153 @@ class LinxISASIMTAutoVectorize : public FunctionPass { T.starts_with("acc"); }; + auto emitExecMaskCompare = [&](StringRef Mnemonic, StringRef Lhs, + StringRef Rhs) { + OS << " " << Mnemonic << " " << Lhs << ", " << Rhs + << ", ->p\n"; + }; + + auto emitBranchOnExecMask = [&](StringRef TrueLabel, + StringRef FalseLabel) { + OS << " b.nz " << TrueLabel << "\n"; + if (TrueLabel != FalseLabel) + OS << " j " << FalseLabel << "\n"; + }; + + auto emitPredicateToExecMask = [&](Value *PredicateExpr) -> bool { + if (auto *Cmp = dyn_cast(PredicateExpr)) { + auto L = emitValue(Cmp->getOperand(0)); + auto R = emitValue(Cmp->getOperand(1)); + if (!L || !R) + return false; + if (!isVectorToken(*L) && !isVectorToken(*R)) + return false; + if (Cmp->getOperand(0)->getType()->isIntegerTy(1) || + Cmp->getOperand(1)->getType()->isIntegerTy(1)) { + return false; + } + + StringRef Mn; + std::string A = formatIntSrc(*L); + std::string B = formatIntSrc(*R); + switch (Cmp->getPredicate()) { + case CmpInst::ICMP_EQ: + Mn = "v.cmp.eq"; + break; + case CmpInst::ICMP_NE: + Mn = "v.cmp.ne"; + break; + case CmpInst::ICMP_SLT: + Mn = "v.cmp.lt"; + break; + case CmpInst::ICMP_SLE: + Mn = "v.cmp.ge"; + std::swap(A, B); + break; + case CmpInst::ICMP_SGT: + Mn = "v.cmp.lt"; + std::swap(A, B); + break; + case CmpInst::ICMP_SGE: + Mn = "v.cmp.ge"; + break; + case CmpInst::ICMP_ULT: + Mn = "v.cmp.ltu"; + break; + case CmpInst::ICMP_ULE: + Mn = "v.cmp.geu"; + std::swap(A, B); + break; + case CmpInst::ICMP_UGT: + Mn = "v.cmp.ltu"; + std::swap(A, B); + break; + case CmpInst::ICMP_UGE: + Mn = "v.cmp.geu"; + break; + default: + return false; + } + + emitExecMaskCompare(Mn, A, B); + return true; + } + + if (auto *FCmp = dyn_cast(PredicateExpr)) { + auto L = emitValue(FCmp->getOperand(0)); + auto R = emitValue(FCmp->getOperand(1)); + if (!L || !R) + return false; + + StringRef Mn; + std::string A = formatFloatSrc(*L); + std::string B = 
formatFloatSrc(*R); + switch (FCmp->getPredicate()) { + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: + Mn = "v.feq"; + break; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: + Mn = "v.fne"; + break; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: + Mn = "v.flt"; + break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: + Mn = "v.fge"; + std::swap(A, B); + break; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: + Mn = "v.flt"; + std::swap(A, B); + break; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: + Mn = "v.fge"; + break; + default: + return false; + } + + emitExecMaskCompare(Mn, A, B); + return true; + } + + auto Pred = emitCondition(PredicateExpr); + if (!Pred || !isVectorToken(*Pred)) + return false; + emitExecMaskCompare("v.cmp.ne", formatMaskSrc(*Pred), "zero"); + return true; + }; + auto emitCondBranch = [&](Value *Cond, StringRef TrueLabel, - StringRef FalseLabel) -> bool { + StringRef FalseLabel, + StringRef SavedMaskReg = StringRef()) -> bool { std::string Mnemonic = "b.ne"; std::string Lhs; std::string Rhs = "zero"; + bool BranchAlreadyEmitted = false; auto emitPredicatedBranch = [&](Value *PredicateExpr) -> bool { + if (emitPredicateToExecMask(PredicateExpr)) { + if (!SavedMaskReg.empty()) { + OS << " v.psel p, ri" << *ExecMaskSaveOneBind + << ", ->" << formatWordDest(SavedMaskReg) << "\n"; + } + emitBranchOnExecMask(TrueLabel, FalseLabel); + BranchAlreadyEmitted = true; + return true; + } auto Pred = emitCondition(PredicateExpr); if (!Pred) { reject("unsupported_branch_condition"); return false; } - if (isVectorToken(*Pred)) { - // SIMT inner-CF fallback: branch on per-group "any-active-lane" - // predicate using vector OR-reduction to a scalar queue register. - // Keep this in-body so B.EQ/B.NE carry the actual CFG edges. - // Reduce ops accumulate into the destination register; seed our - // scratch reduce destination before each use. 
- OS << " c.movr zero, ->t\n"; - OS << " v.rdor " << *Pred << ", ->t#1\n"; - Lhs = "t#1"; - } else { - Lhs = *Pred; - } + Lhs = *Pred; Mnemonic = "b.ne"; Rhs = "zero"; return true; @@ -4274,6 +6017,9 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } + if (BranchAlreadyEmitted) + return true; + OS << " " << Mnemonic << " " << Lhs << ", " << Rhs << ", " << TrueLabel << "\n"; if (TrueLabel != FalseLabel) @@ -4294,14 +6040,37 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("exit_phi_value_emit_failed"); return false; } - if (!emitStoreToInvariantBind(*Tok, BaseRi)) { + const ExitPhiPlan *Plan = nullptr; + for (const ExitPhiPlan &P : ExitPhiPlans) { + if (P.SlotBind == BaseRi) { + Plan = &P; + break; + } + } + const bool Stored = + (Plan && Plan->LocalWordBase) + ? emitStoreToSharedLocalWordBase( + *Tok, *Plan->LocalWordBase, + /*IsFloat=*/VIn->getType() == + Type::getFloatTy(Ctx)) + : emitStoreToInvariantBind( + *Tok, BaseRi, + /*IsFloat=*/VIn->getType() == + Type::getFloatTy(Ctx)); + if (!Stored) { reject("exit_phi_store_emit_failed"); return false; } } } - if (ActiveSlotBind && NeedsActiveReplay) { - if (!emitStoreToInvariantBind("zero", *ActiveSlotBind)) { + if (NeedsActiveReplay && + (ActiveSlotBind || ActiveSlotLocalWordBase)) { + const bool StoredActive = + ActiveSlotLocalWordBase + ? 
emitStoreToLocalWordBase("zero", + *ActiveSlotLocalWordBase) + : emitStoreToActiveBind("zero", *ActiveSlotBind); + if (!StoredActive) { reject("active_store_emit_failed"); return false; } @@ -4327,6 +6096,54 @@ class LinxISASIMTAutoVectorize : public FunctionPass { for (BasicBlock *BB : EmitOrder) { if (BB != Header) OS << Labels.lookup(BB) << ":\n"; + if (auto RestoreIt = ReplayMaskRestoreRegsByMerge.find(BB); + RestoreIt != ReplayMaskRestoreRegsByMerge.end()) { + for (const std::string &SavedMaskReg : RestoreIt->second) + emitExecMaskCompare("v.cmp.ne", formatIntSrc(SavedMaskReg), + "zero"); + } + auto StoreMergeIt = IfConvertibleStoreMerges.find(BB); + if (StoreMergeIt != IfConvertibleStoreMerges.end()) { + const IfConvertibleStoreMergePlan &Plan = StoreMergeIt->second; + auto *BI = + dyn_cast_or_null(Plan.BranchBB->getTerminator()); + if (!BI || !BI->isConditional() || BI->getNumSuccessors() != 2) { + reject("invalid_store_merge_plan"); + return false; + } + if (!Plan.TrueStore || !Plan.FalseStore) { + reject("invalid_store_merge_plan"); + return false; + } + if (Plan.TrueStore->getValueOperand()->getType() != + Type::getFloatTy(Ctx) || + Plan.FalseStore->getValueOperand()->getType() != + Type::getFloatTy(Ctx)) { + reject("non_float_store_value"); + return false; + } + auto Pred = emitCondition(BI->getCondition()); + auto TV = emitValue(Plan.TrueStore->getValueOperand()); + auto FV = emitValue(Plan.FalseStore->getValueOperand()); + if (!Pred || !TV || !FV) { + reject("store_merge_value_emit_failed"); + return false; + } + auto Dst = allocVec(); + if (!Dst) { + reject("vector_reg_exhausted"); + return false; + } + OS << " v.csel " << formatMaskSrc(*Pred) << ", " + << formatFloatSrc(*TV) << ", " << formatFloatSrc(*FV) + << ", ->" << formatWordDest(*Dst) << "\n"; + if (!emitStoreValueToPtr(Plan.TrueStore->getPointerOperand(), + *Dst, /*IsFloat=*/true, + /*ElemBytes=*/4)) { + reject("store_merge_emit_failed"); + return false; + } + } if (!emitBodyInstructions(BB)) 
return false; @@ -4339,6 +6156,22 @@ class LinxISASIMTAutoVectorize : public FunctionPass { } if (BI) { + auto SplitIt = IfConvertibleSplits.find(BB); + if (SplitIt != IfConvertibleSplits.end()) { + BasicBlock *MergeBB = SplitIt->second.MergeBB; + std::string MergeLabel = labelForSucc(MergeBB); + if (MergeLabel != EndLabel) { + auto CurIt = LabelIndex.find(BB); + auto MergeIt = LabelIndex.find(MergeBB); + if (CurIt != LabelIndex.end() && MergeIt != LabelIndex.end() && + MergeIt->second <= CurIt->second) { + reject("unsupported_inner_backedge"); + return false; + } + } + OS << " j " << MergeLabel << "\n"; + continue; + } if (!BI->isConditional()) { BasicBlock *Succ = BI->getSuccessor(0); std::string Target = targetLabelForSucc(BB, Succ); @@ -4401,7 +6234,12 @@ class LinxISASIMTAutoVectorize : public FunctionPass { OS << " j " << TrueLabel << "\n"; continue; } - if (!emitCondBranch(BI->getCondition(), TrueLabel, FalseLabel)) + StringRef SavedMaskReg; + if (auto MaskIt = ReplayMaskSaveRegByBranch.find(BB); + MaskIt != ReplayMaskSaveRegByBranch.end()) + SavedMaskReg = MaskIt->second; + if (!emitCondBranch(BI->getCondition(), TrueLabel, FalseLabel, + SavedMaskReg)) return false; continue; } @@ -4432,19 +6270,11 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } } - auto Pred = allocVec(); - if (!Pred) { - reject("vector_reg_exhausted"); - return false; - } - OS << " v.cmp.eq " << *CondTok << ", " << *CaseTok << ", ->" - << *Pred << "\n"; - // Reduce ops accumulate into the destination register; seed our - // scratch reduce destination before each use. 
- OS << " c.movr zero, ->t\n"; - OS << " v.rdor " << *Pred << ", ->t#1\n"; - OS << " b.ne t#1, zero, " << DestLabel << "\n"; - } + OS << " v.cmp.eq " << formatIntSrc(*CondTok) << ", " + << formatIntSrc(*CaseTok) + << ", ->p\n"; + OS << " b.nz " << DestLabel << "\n"; + } std::string DefaultLabel = targetLabelForSucc(BB, SI->getDefaultDest()); OS << " j " << DefaultLabel << "\n"; @@ -4486,8 +6316,9 @@ class LinxISASIMTAutoVectorize : public FunctionPass { } SelTok = *Tok; } - OS << " v.add zero, " << SelTok << ", ->" << Plan.SelReg - << "\n"; + OS << " v.add zero, " << formatIntSrc(SelTok) + << ", ->" << formatAssignedWordDest(Plan.SelReg) + << "\n"; } } @@ -4527,7 +6358,13 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("missing_phi_reg"); return false; } - OS << " v.add " << *SrcTok << ", zero, ->" << DIt->second + const bool IsFloat = + Phi->getType() == Type::getFloatTy(Ctx); + OS << " v.add " + << (IsFloat ? formatFloatSrc(*SrcTok) + : formatIntSrc(*SrcTok)) + << ", zero, ->" + << formatAssignedWordDest(DIt->second) << "\n"; } } @@ -4548,9 +6385,33 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return true; }; + if (ActiveSlotLocalWordBase) { + auto OneTok = emitValue(ConstantInt::get(I64Ty, 1)); + if (!OneTok || + !emitStoreToLocalWordBase(*OneTok, *ActiveSlotLocalWordBase)) { + reject("active_init_emit_failed"); + return false; + } + } + + for (const ExitPhiPlan &Plan : ExitPhiPlans) { + if (!Plan.LocalWordBase) + continue; + auto InitTok = emitLoadFromInvariantBind(Plan.SlotBind); + if (!InitTok || + !emitStoreToSharedLocalWordBase( + *InitTok, *Plan.LocalWordBase, + /*IsFloat=*/Plan.Phi && Plan.Phi->getType() == Type::getFloatTy(Ctx))) { + reject("exit_phi_store_emit_failed"); + return false; + } + } + const std::string AfterLabel = "L_after"; - if (ActiveSlotBind) { - auto ActiveTok = emitLoadFromInvariantBind(*ActiveSlotBind); + if (ActiveSlotBind || ActiveSlotLocalWordBase) { + auto ActiveTok = ActiveSlotLocalWordBase + ? 
emitLoadFromLocalWordBase(*ActiveSlotLocalWordBase) + : emitLoadFromActiveBind(*ActiveSlotBind); if (!ActiveTok) { reject("active_load_failed"); return false; @@ -4560,11 +6421,12 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("vector_reg_exhausted"); return false; } - OS << " v.cmp.eq " << *ActiveTok << ", zero, ->" << *Pred << "\n"; + OS << " v.cmp.eq " << formatIntSrc(*ActiveTok) + << ", zero, ->" << formatMaskDest(*Pred) << "\n"; // Reduce ops accumulate into the destination register; seed our scratch // reduce destination before each use. OS << " c.movr zero, ->t\n"; - OS << " v.rdor " << *Pred << ", ->t#1\n"; + OS << " v.rdor " << formatMaskSrc(*Pred) << ", ->t#1\n"; OS << " b.ne t#1, zero, " << AfterLabel << "\n"; } @@ -4590,13 +6452,29 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("recurrence_update_not_emitted"); return false; } - if (!emitStoreToInvariantBind(*UpdateVal, Plan.SlotBind)) { + if (Plan.LocalWordBase && + !emitStoreToLocalWordBase(*UpdateVal, + *Plan.LocalWordBase + 1u, + /*IsFloat=*/true)) { + reject("recurrence_store_emit_failed"); + return false; + } + if (!emitStoreToInvariantBind(*UpdateVal, Plan.SlotBind, + /*IsFloat=*/true)) { reject("recurrence_store_emit_failed"); return false; } continue; } - if (!emitStoreToInvariantBind(It->second, Plan.SlotBind)) { + if (Plan.LocalWordBase && + !emitStoreToLocalWordBase(It->second, + *Plan.LocalWordBase + 1u, + /*IsFloat=*/true)) { + reject("recurrence_store_emit_failed"); + return false; + } + if (!emitStoreToInvariantBind(It->second, Plan.SlotBind, + /*IsFloat=*/true)) { reject("recurrence_store_emit_failed"); return false; } @@ -4624,15 +6502,17 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("vector_reg_exhausted"); return false; } - OS << " v.fadd " << *Cur << ", " << *StepTok << ", ->" << *Next + OS << " v.fadd " << formatFloatSrc(*Cur) << ", " + << formatFloatSrc(*StepTok) << ", ->" << formatWordDest(*Next) << "\n"; - if 
(!emitStoreToInvariantBind(*Next, Plan.SlotBind)) { + if (!emitStoreToInvariantBind(*Next, Plan.SlotBind, /*IsFloat=*/true)) { reject("f32_induction_store_emit_failed"); return false; } } - if (ActiveSlotBind && NeedsActiveReplay && ActiveContinueCond) { + if ((ActiveSlotBind || ActiveSlotLocalWordBase) && NeedsActiveReplay && + ActiveContinueCond) { auto PredTok = emitCondition(ActiveContinueCond); if (!PredTok) { reject("active_cond_emit_failed"); @@ -4645,14 +6525,20 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("vector_reg_exhausted"); return false; } - OS << " v.cmp.eq " << PredName << ", zero, ->" << *Inv << "\n"; + OS << " v.cmp.eq " << formatMaskSrc(PredName) + << ", zero, ->" << formatMaskDest(*Inv) << "\n"; PredName = *Inv; } // Reduce ops accumulate into the destination register; seed our scratch // reduce destination before each use. OS << " c.movr zero, ->t\n"; - OS << " v.rdor " << PredName << ", ->t#1\n"; - if (!emitStoreToInvariantBind("t#1", *ActiveSlotBind)) { + OS << " v.rdor " << formatMaskSrc(PredName) << ", ->t#1\n"; + const bool StoredActive = + ActiveSlotLocalWordBase + ? 
emitStoreToLocalWordBase("t#1", + *ActiveSlotLocalWordBase) + : emitStoreToActiveBind("t#1", *ActiveSlotBind); + if (!StoredActive) { reject("active_store_emit_failed"); return false; } @@ -4689,6 +6575,15 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } Plan.SlotBind = *Bind; + if (UseGroupedDims) + Plan.LocalWordBase = reserveLocalWords(LaneCount * GroupCount); + + auto InitTok = emitValue(Plan.InitValue); + if (!InitTok) { + reject("unsupported_reduction_init"); + return false; + } + OS << " c.movr " << *InitTok << ", ->" << Plan.DstName << "\n"; std::optional Src; if (Plan.LaneMulL && Plan.LaneMulR) { @@ -4703,7 +6598,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("vector_reg_exhausted"); return false; } - OS << " v.fmul " << *Lhs << ", " << *Rhs << ", ->" << *Mul + OS << " v.fmul " << formatFloatSrc(*Lhs) << ", " + << formatFloatSrc(*Rhs) << ", ->" << formatWordDest(*Mul) << "\n"; Src = *Mul; } else { @@ -4714,10 +6610,19 @@ class LinxISASIMTAutoVectorize : public FunctionPass { return false; } - OS << " " << reductionMnemonic(Plan.Kind) << " " << *Src << ", ->" - << Plan.DstName << "\n"; - OS << " v.sw.brg " << Plan.DstName << ", [ri" << Plan.SlotBind - << ", lc0<<2, zero<<2]\n"; + OS << " " << reductionMnemonic(Plan.Kind) << " " + << formatFloatSrc(*Src) << ", ->" + << Plan.DstName << "\n"; + if (Plan.LocalWordBase) { + if (!emitStoreToLocalWordBase(Plan.DstName, *Plan.LocalWordBase)) { + reject("reduction_store_emit_failed"); + return false; + } + } else { + OS << " v.sw.brg " << Plan.DstName << ", [ri" << Plan.SlotBind + << ", " << formatAddrExpr("lc0<<2") << ", " + << formatAddrExpr("zero<<2") << "]\n"; + } } } @@ -4729,15 +6634,75 @@ class LinxISASIMTAutoVectorize : public FunctionPass { reject("unsupported_liveout_value"); return false; } - if (!emitStoreToInvariantBind(*Tok, Plan.SlotBind)) { + const bool Stored = + Plan.LocalWordBase + ? 
emitStoreToSharedLocalWordBase( + *Tok, *Plan.LocalWordBase, + /*IsFloat=*/Plan.Inst->getType() == + Type::getFloatTy(Ctx)) + : emitStoreToInvariantBind( + *Tok, Plan.SlotBind, + /*IsFloat=*/Plan.Inst->getType() == + Type::getFloatTy(Ctx)); + if (!Stored) { reject("liveout_store_emit_failed"); return false; } } - if (ActiveSlotBind) + if (ActiveSlotBind || ActiveSlotLocalWordBase) OS << AfterLabel << ":\n"; + + for (const ExitPhiPlan &Plan : ExitPhiPlans) { + if (!Plan.LocalWordBase) + continue; + auto Tok = emitLoadFromSharedLocalWordBase(*Plan.LocalWordBase); + if (!Tok || + !emitStoreToInvariantBind( + *Tok, Plan.SlotBind, + /*IsFloat=*/Plan.Phi && + Plan.Phi->getType() == Type::getFloatTy(Ctx))) { + reject("exit_phi_store_emit_failed"); + return false; + } + } + + for (const LiveOutPlan &Plan : LiveOutPlans) { + if (!Plan.LocalWordBase) + continue; + auto Tok = emitLoadFromSharedLocalWordBase(*Plan.LocalWordBase); + if (!Tok || + !emitStoreToInvariantBind( + *Tok, Plan.SlotBind, + /*IsFloat=*/Plan.Inst && + Plan.Inst->getType() == Type::getFloatTy(Ctx))) { + reject("liveout_store_emit_failed"); + return false; + } + } + + for (const ReductionPlan &Plan : ReductionPlans) { + if (!Plan.LocalWordBase) + continue; + auto Tok = emitLoadFromLocalWordBase(*Plan.LocalWordBase); + if (!Tok) { + reject("reduction_store_emit_failed"); + return false; + } + if (!emitStoreToInvariantBind(*Tok, Plan.SlotBind, + /*IsFloat=*/true)) { + reject("reduction_store_emit_failed"); + return false; + } + } OS << " C.BSTOP\n"; + F.removeFnAttr("linx-vblock-ts-bytes"); + if (LocalScratchWordCount != 0) { + const uint64_t ScratchBytes = LocalScratchWordCount * 4u; + const uint64_t RoundedBytes = + std::max(16u, PowerOf2Ceil(ScratchBytes)); + F.addFnAttr("linx-vblock-ts-bytes", std::to_string(RoundedBytes)); + } F.addFnAttr("linx-vblock-body-asm", OS.str()); // Decoupled body contract: @@ -4897,7 +6862,8 @@ class LinxISASIMTAutoVectorize : public FunctionPass { HasExtraPhi, 
RemarkLaneCount, RemarkGroupCount, RemarkForceScalarLane, RemarkHasRecurrence, RemarkHeaderKind, RemarkTouchesMemoryState, RemarkTripcountSource, - RemarkAddressModel); + RemarkAddressModel, layoutPolicyName(LinxSIMTAutoVecLayout), + RemarkLayoutKind, RemarkCFStrategy); } return Changed; } @@ -4931,3 +6897,19 @@ StringRef llvm::linxSIMTAutoVectorizeRemarksPath() { FunctionPass *llvm::createLinxISASIMTAutoVectorizePass() { return new LinxISASIMTAutoVectorize(); } + +PreservedAnalyses llvm::LinxISASIMTAutoVectorizePass::run( + Function &F, FunctionAnalysisManager &AM) { + (void)AM; + if (!linxSIMTAutoVectorizeEnabled() || F.isDeclaration() || + isTsvcAuxHelperName(F.getName()) || !F.getParent()) { + return PreservedAnalyses::all(); + } + + legacy::FunctionPassManager FPM(F.getParent()); + FPM.add(createLinxISASIMTAutoVectorizePass()); + FPM.doInitialization(); + bool Changed = FPM.run(F); + FPM.doFinalization(); + return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.h b/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.h index 02616ea0f1ea8..2ae68706a33bc 100644 --- a/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.h +++ b/llvm/lib/Target/LinxISA/LinxISASIMTAutoVectorize.h @@ -10,9 +10,11 @@ #define LLVM_LIB_TARGET_LINXISA_LINXISASIMTAUTOVECTORIZER_H #include "llvm/ADT/StringRef.h" +#include "llvm/IR/PassManager.h" namespace llvm { +class Function; class FunctionPass; bool linxSIMTAutoVectorizeEnabled(); @@ -21,6 +23,12 @@ StringRef linxSIMTAutoVectorizeRemarksPath(); FunctionPass *createLinxISASIMTAutoVectorizePass(); +class LinxISASIMTAutoVectorizePass + : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + } // namespace llvm #endif // LLVM_LIB_TARGET_LINXISA_LINXISASIMTAUTOVECTORIZER_H diff --git a/llvm/lib/Target/LinxISA/LinxISATargetMachine.cpp b/llvm/lib/Target/LinxISA/LinxISATargetMachine.cpp index 
73c93c2eb503f..6fdfe5f2226ed 100644 --- a/llvm/lib/Target/LinxISA/LinxISATargetMachine.cpp +++ b/llvm/lib/Target/LinxISA/LinxISATargetMachine.cpp @@ -8,6 +8,7 @@ #include "LinxISATargetMachine.h" #include "LinxISA.h" +#include "LinxISASIMTAutoVectorize.h" #include "LinxISAMachineFunctionInfo.h" #include "LinxISATargetTransformInfo.h" #include "TargetInfo/LinxISATargetInfo.h" @@ -15,7 +16,9 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/PassManager.h" #include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassBuilder.h" #include "llvm/Support/Compiler.h" using namespace llvm; @@ -127,3 +130,18 @@ TargetTransformInfo LinxISATargetMachine::getTargetTransformInfo(const Function &F) const { return TargetTransformInfo(std::make_unique(this, F)); } + +void LinxISATargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) { + PB.registerOptimizerLastEPCallback( + [this](ModulePassManager &MPM, OptimizationLevel Level, + ThinOrFullLTOPhase Phase) { + (void)this; + (void)Phase; + if (Level == OptimizationLevel::O0 || Level == OptimizationLevel::O1) + return; + + FunctionPassManager FPM; + FPM.addPass(LinxISASIMTAutoVectorizePass()); + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + }); +} diff --git a/llvm/lib/Target/LinxISA/LinxISATargetMachine.h b/llvm/lib/Target/LinxISA/LinxISATargetMachine.h index e5b7e24ff2186..d353a1458674d 100644 --- a/llvm/lib/Target/LinxISA/LinxISATargetMachine.h +++ b/llvm/lib/Target/LinxISA/LinxISATargetMachine.h @@ -18,6 +18,7 @@ namespace llvm { class Function; struct MachineFunctionInfo; +class PassBuilder; class TargetSubtargetInfo; class LinxISATargetMachine : public CodeGenTargetMachineImpl { @@ -46,6 +47,7 @@ class LinxISATargetMachine : public CodeGenTargetMachineImpl { const TargetSubtargetInfo *STI) const override; TargetTransformInfo getTargetTransformInfo(const Function &F) const override; + void 
registerPassBuilderCallbacks(PassBuilder &PB) override; }; } // namespace llvm diff --git a/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAInstPrinter.cpp b/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAInstPrinter.cpp index c5785b6a35cd3..58b96902dff2e 100644 --- a/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAInstPrinter.cpp +++ b/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAInstPrinter.cpp @@ -25,6 +25,10 @@ static StringRef reg5Name(unsigned Code) { static void printReg10Name(raw_ostream &OS, unsigned Code) { Code &= 0x3ffu; + if (Code == 92u) { + OS << "p"; + return; + } if (Code < 32u) { OS << reg5Name(Code); return; @@ -2312,7 +2316,7 @@ void LinxISAInstPrinter::printInst(const MCInst *MI, uint64_t Address, return; } - // Pretty printer for the common "SrcL, SrcR<" operand form. + // Pretty printer for the common "SrcP, SrcL, SrcR<" form. if (Form.mnemonic && (StringRef(Form.mnemonic).equals_insensitive("CSEL") || StringRef(Form.mnemonic).equals_insensitive("V.CSEL")) && diff --git a/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAMCCodeEmitter.cpp b/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAMCCodeEmitter.cpp index 63e9b98e97840..c323bbfba41a2 100644 --- a/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAMCCodeEmitter.cpp +++ b/llvm/lib/Target/LinxISA/MCTargetDesc/LinxISAMCCodeEmitter.cpp @@ -133,9 +133,13 @@ void LinxISAMCCodeEmitter::encodeInstruction(const MCInst &MI, bool PCRel = true; if (Name == "simm12" && Mnemonic.starts_with("B.")) { Kind = static_cast(LinxISA::FIXUP_LINX_B12_PCREL); - } else if (Name == "simm12" && Mnemonic.starts_with("C.BSTART")) { + } else if ((Name == "simm22" || Name == "label") && + Mnemonic.starts_with("B.")) { + Kind = static_cast(LinxISA::FIXUP_LINX_J22_PCREL); + } else if ((Name == "simm12" || Name == "label") && + Mnemonic.starts_with("C.BSTART")) { Kind = static_cast(LinxISA::FIXUP_LINX_CBSTART12_PCREL); - } else if (Name == "simm22" && Mnemonic == "J") { + } else if ((Name == "simm22" || Name == "label") && Mnemonic == "J") { Kind = 
static_cast(LinxISA::FIXUP_LINX_J22_PCREL); } else if (Name == "simm17" && Mnemonic.starts_with("BSTART.")) { if (hasPltVariant(Expr)) @@ -180,7 +184,11 @@ void LinxISAMCCodeEmitter::encodeInstruction(const MCInst &MI, Kind = static_cast(LinxISA::FIXUP_LINX_PCR17_STORE); } } else { - report_fatal_error("Linx: unsupported expression fixup"); + SmallString<128> Msg; + raw_svector_ostream OS(Msg); + OS << "Linx: unsupported expression fixup for mnemonic '" << Mnemonic + << "', field '" << Name << "'"; + report_fatal_error(OS.str()); } Fixups.push_back(MCFixup::create(/*Offset=*/0, Expr, Kind, /*PCRel=*/PCRel)); diff --git a/llvm/test/CodeGen/LinxISA/autovec_active_replay_break.ll b/llvm/test/CodeGen/LinxISA/autovec_active_replay_break.ll new file mode 100644 index 0000000000000..ba8dea0690e7f --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_active_replay_break.ll @@ -0,0 +1,37 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @search_store_index(ptr nocapture %a, ptr nocapture %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %cont ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %found = icmp sgt i32 %v, 0 + br i1 %found, label %break, label %cont + +break: + %iret = trunc i64 %i to i32 + store i32 %iret, ptr %out, align 4 + br label %exit + +cont: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: search_store_index: +; CHECK: FENTRY +; CHECK: C.BSTART.STD +; CHECK-NOT: BSTART.MSEQ + +; REMARK: "function":"search_store_index" +; REMARK: "status":"reject" +; REMARK: "reason":"unsupported_exit_side_effects" diff --git a/llvm/test/CodeGen/LinxISA/autovec_exit_side_effect_reject.ll 
b/llvm/test/CodeGen/LinxISA/autovec_exit_side_effect_reject.ll new file mode 100644 index 0000000000000..f6f0fb8a6bb34 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_exit_side_effect_reject.ll @@ -0,0 +1,33 @@ +; RUN: rm -f %t.reject.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.reject.remarks.json < %s > /dev/null +; RUN: FileCheck %s --check-prefix=REMARK-REJECT < %t.reject.remarks.json + +define void @search_store_index_exit_store_reject(ptr nocapture %a, ptr nocapture %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %cont ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %found = icmp sgt i32 %v, 0 + br i1 %found, label %break, label %cont + +break: + %iret = trunc i64 %i to i32 + store i32 %iret, ptr %out, align 4 + br label %exit + +cont: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; REMARK-REJECT: "function":"search_store_index_exit_store_reject" +; REMARK-REJECT: "status":"reject" +; REMARK-REJECT: "reason":"unsupported_exit_side_effects" +; REMARK-REJECT: "layout_policy":"grouped" diff --git a/llvm/test/CodeGen/LinxISA/autovec_generic_liveout.ll b/llvm/test/CodeGen/LinxISA/autovec_generic_liveout.ll new file mode 100644 index 0000000000000..c01b030820283 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_generic_liveout.ll @@ -0,0 +1,34 @@ +; RUN: llc -mtriple=linx64 -O2 \ +; RUN: --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq \ +; RUN: --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 \ +; RUN: < %s | FileCheck %s + +define void @copy_and_last_value_liveout(ptr nocapture readonly %a, + ptr nocapture writeonly %tmp, + ptr nocapture writeonly %out) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ 
%inc, %loop ] + %ptr = getelementptr inbounds float, ptr %a, i64 %i + %val = load float, ptr %ptr, align 4 + %dst = getelementptr inbounds float, ptr %tmp, i64 %i + store float %val, ptr %dst, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + store float %val, ptr %out, align 4 + ret void +} + +; CHECK-LABEL: copy_and_last_value_liveout: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lw.brg +; CHECK: v.sw.brg +; CHECK: v.sw.brg diff --git a/llvm/test/CodeGen/LinxISA/autovec_grouped_active_replay_reject.ll b/llvm/test/CodeGen/LinxISA/autovec_grouped_active_replay_reject.ll new file mode 100644 index 0000000000000..69d94dd92e7b9 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_grouped_active_replay_reject.ll @@ -0,0 +1,74 @@ +; RUN: rm -f %t.auto.remarks.json %t.grouped.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.auto.remarks.json < %s | FileCheck %s --check-prefix=AUTO +; RUN: FileCheck %s --check-prefix=REMARK-AUTO < %t.auto.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.grouped.remarks.json < %s | FileCheck %s --check-prefix=GROUPED +; RUN: FileCheck %s --check-prefix=REMARK-GROUPED < %t.grouped.remarks.json + +define void @search_store_index_grouped_boundary(ptr nocapture %a, ptr nocapture %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %cont ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %found = icmp sgt i32 %v, 0 + br i1 %found, label %break, label %cont + +break: + %iret = trunc i64 %i to i32 + br label %exit + +cont: + %inc = add nuw i64 %i, 1 + %done = 
icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + %res = phi i32 [ %iret, %break ], [ -1, %cont ] + store i32 %res, ptr %out, align 4 + ret void +} + +; AUTO-LABEL: search_store_index_grouped_boundary: +; AUTO: BSTART.MSEQ +; AUTO: B.TEXT +; AUTO: B.IOTI{{.*}}->t<0> +; AUTO: B.IOTI{{.*}}->u<5> +; AUTO: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; AUTO: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; AUTO: v.add{{[[:space:]]+}}lc0, lc1.uw<<5, ->vt#1 +; AUTO: v.sw.brg.local{{.*}}ri1, [ts, lc0<<2, lc1<<7] +; AUTO: v.lw.brg.local{{.*}}[ts, lc0<<2, lc1<<7] +; AUTO: v.rdor +; AUTO: b.ne +; AUTO: v.sw.brg.local{{.*}}[ts, lc0<<2, +; AUTO: v.sw.brg.local{{.*}}zero, [ts, lc0<<2, lc1<<7] +; AUTO: v.lw.brg.local{{.*}}[ts, lc0<<2, +; AUTO: v.sw.brg + +; REMARK-AUTO: "function":"search_store_index_grouped_boundary" +; REMARK-AUTO: "status":"lowered" +; REMARK-AUTO: "layout_policy":"auto" +; REMARK-AUTO: "layout_kind":"grouped-strip-mined" +; REMARK-AUTO: "cf_strategy":"active-replay" + +; GROUPED-LABEL: search_store_index_grouped_boundary: +; GROUPED: BSTART.MSEQ +; GROUPED: B.TEXT +; GROUPED: B.IOTI{{.*}}->t<0> +; GROUPED: B.IOTI{{.*}}->u<5> +; GROUPED: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; GROUPED: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; GROUPED: v.sw.brg.local{{.*}}ri1, [ts, lc0<<2, lc1<<7] +; GROUPED: v.lw.brg.local{{.*}}[ts, lc0<<2, lc1<<7] +; GROUPED: v.sw.brg.local{{.*}}[ts, lc0<<2, +; GROUPED: v.sw.brg.local{{.*}}zero, [ts, lc0<<2, lc1<<7] +; GROUPED: v.lw.brg.local{{.*}}[ts, lc0<<2, +; GROUPED: v.sw.brg + +; REMARK-GROUPED: "function":"search_store_index_grouped_boundary" +; REMARK-GROUPED: "status":"lowered" +; REMARK-GROUPED: "layout_policy":"grouped" +; REMARK-GROUPED: "layout_kind":"grouped-strip-mined" +; REMARK-GROUPED: "cf_strategy":"active-replay" diff --git a/llvm/test/CodeGen/LinxISA/autovec_grouped_exec_mask_save_restore_reject.ll b/llvm/test/CodeGen/LinxISA/autovec_grouped_exec_mask_save_restore_reject.ll new file mode 100644 index 
0000000000000..7ff9862196115 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_grouped_exec_mask_save_restore_reject.ll @@ -0,0 +1,151 @@ +; RUN: rm -f %t.auto.remarks.json %t.grouped.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.auto.remarks.json < %s | FileCheck %s --check-prefix=AUTO +; RUN: FileCheck %s --check-prefix=REMARK-AUTO < %t.auto.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.grouped.remarks.json < %s | FileCheck %s --check-prefix=GROUPED +; RUN: FileCheck %s --check-prefix=REMARK-GROUPED < %t.grouped.remarks.json + +define void @search_store_index_nested(ptr nocapture %a, ptr nocapture %b, ptr nocapture %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %cont ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %found = icmp sgt i32 %v, 0 + br i1 %found, label %break, label %work + +work: + %dst = getelementptr inbounds float, ptr %b, i64 %i + %small = icmp slt i32 %v, 5 + br i1 %small, label %then0, label %else0 + +then0: + store float 1.000000e+00, ptr %dst, align 4 + br label %cont + +else0: + store float 2.000000e+00, ptr %dst, align 4 + br label %cont + +break: + %iret = trunc i64 %i to i32 + br label %exit + +cont: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + %res = phi i32 [ %iret, %break ], [ -1, %cont ] + store i32 %res, ptr %out, align 4 + ret void +} + +define void @search_store_index_split_addrs(ptr nocapture %a, ptr nocapture %b, + ptr nocapture %c, ptr nocapture %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %cont ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %found = 
icmp sgt i32 %v, 0 + br i1 %found, label %break, label %work + +work: + %dst0 = getelementptr inbounds float, ptr %b, i64 %i + %dst1 = getelementptr inbounds float, ptr %c, i64 %i + %small = icmp slt i32 %v, 5 + br i1 %small, label %then0, label %else0 + +then0: + store float 1.000000e+00, ptr %dst0, align 4 + br label %cont + +else0: + store float 2.000000e+00, ptr %dst1, align 4 + br label %cont + +break: + %iret = trunc i64 %i to i32 + br label %exit + +cont: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + %res = phi i32 [ %iret, %break ], [ -1, %cont ] + store i32 %res, ptr %out, align 4 + ret void +} + +define void @store_split_addrs_raw(ptr nocapture readonly %a, + ptr nocapture writeonly %b, + ptr nocapture writeonly %c) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %cont ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %dst0 = getelementptr inbounds float, ptr %b, i64 %i + %dst1 = getelementptr inbounds float, ptr %c, i64 %i + %small = icmp slt i32 %v, 5 + br i1 %small, label %then0, label %else0 + +then0: + store float 1.000000e+00, ptr %dst0, align 4 + br label %cont + +else0: + store float 2.000000e+00, ptr %dst1, align 4 + br label %cont + +cont: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; AUTO-LABEL: search_store_index_nested: +; AUTO: BSTART.MSEQ +; AUTO: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; AUTO: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; AUTO: v.rdor +; AUTO: v.csel + +; REMARK-AUTO: "function":"search_store_index_nested" +; REMARK-AUTO: "status":"lowered" +; REMARK-AUTO: "layout_policy":"auto" +; REMARK-AUTO: "layout_kind":"grouped-strip-mined" +; REMARK-AUTO: "cf_strategy":"active-replay" + +; GROUPED-LABEL: search_store_index_split_addrs: +; GROUPED: BSTART.MSEQ +; GROUPED: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; GROUPED: 
C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; GROUPED: v.psel p, +; GROUPED: b.nz +; GROUPED: v.sw.brg +; GROUPED: v.sw.brg +; GROUPED: v.cmp.ne + +; REMARK-GROUPED: "function":"search_store_index_split_addrs" +; REMARK-GROUPED: "status":"lowered" +; REMARK-GROUPED: "layout_policy":"grouped" +; REMARK-GROUPED: "layout_kind":"grouped-strip-mined" +; REMARK-GROUPED: "cf_strategy":"active-replay" +; REMARK-GROUPED: "function":"store_split_addrs_raw" +; REMARK-GROUPED: "status":"reject" +; REMARK-GROUPED: "reason":"grouped_layout_requires_exec_mask_save_restore" +; REMARK-GROUPED: "layout_policy":"grouped" +; REMARK-GROUPED: "cf_strategy":"exec-mask-save-restore-required" diff --git a/llvm/test/CodeGen/LinxISA/autovec_grouped_near_canonical_split_addrs.ll b/llvm/test/CodeGen/LinxISA/autovec_grouped_near_canonical_split_addrs.ll new file mode 100644 index 0000000000000..92071a36066c9 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_grouped_near_canonical_split_addrs.ll @@ -0,0 +1,56 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @search_store_index_split_addrs_near_canonical(ptr nocapture readonly %a, + ptr nocapture writeonly %b, + ptr nocapture writeonly %c, + ptr nocapture writeonly %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %work ] + %slot = getelementptr inbounds i32, ptr %a, i64 %i + %v = load i32, ptr %slot, align 4 + %keep_going = icmp slt i32 %v, 11 + br i1 %keep_going, label %work, label %break + +work: + %small = icmp slt i32 %v, 5 + %dst = select i1 %small, ptr %b, ptr %c + %val = select i1 %small, float 1.000000e+00, float 2.000000e+00 + %dst.slot = getelementptr inbounds float, ptr %dst, i64 %i + store float %val, ptr %dst.slot, align 4 + %inc = add 
nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop, !llvm.loop !0 + +break: + %iret = trunc nuw nsw i64 %i to i32 + br label %exit + +exit: + %res = phi i32 [ %iret, %break ], [ -1, %work ] + store i32 %res, ptr %out, align 4 + ret void +} + +; CHECK-LABEL: search_store_index_split_addrs_near_canonical: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.psel p, +; CHECK: b.nz +; CHECK: v.sw.brg +; CHECK: v.sw.brg +; CHECK: v.cmp.ne + +; REMARK: "function":"search_store_index_split_addrs_near_canonical" +; REMARK: "status":"lowered" +; REMARK: "layout_policy":"grouped" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"active-replay" + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/CodeGen/LinxISA/autovec_ifconverted_diamond.ll b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_diamond.ll new file mode 100644 index 0000000000000..3cc8487707c7c --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_diamond.ll @@ -0,0 +1,106 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s --check-prefix=ASM +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_inner_diamond(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %merge ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %cond = fcmp ogt float %bv, 0.000000e+00 + br i1 %cond, label %then, label %else + +then: + %t = fadd float %bv, 1.000000e+00 + br label %merge + +else: + %e = fsub float 0.000000e+00, %bv + br label %merge + +merge: + %sel = phi float [ %t, %then ], [ %e, %else ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + store float %sel, ptr 
%ap, align 4 + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +define void @vector_nested_diamond(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %merge1 ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %gt0 = fcmp ogt float %bv, 0.000000e+00 + br i1 %gt0, label %then0, label %else0 + +then0: + %gt10 = fcmp ogt float %bv, 1.000000e+01 + br i1 %gt10, label %then1, label %else1 + +then1: + br label %merge0 + +else1: + br label %merge0 + +merge0: + %hi = phi float [ 1.000000e+00, %then1 ], [ 2.000000e+00, %else1 ] + br label %merge1 + +else0: + br label %merge1 + +merge1: + %sel = phi float [ %hi, %merge0 ], [ 3.000000e+00, %else0 ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + store float %sel, ptr %ap, align 4 + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; ASM-LABEL: vector_inner_diamond: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: B.IOR +; ASM: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; ASM: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb2}} +; ASM: v.lw.brg +; ASM: v.sw.brg +; ASM: C.BSTOP +; ASM-NOT: v.rdor + +; ASM-LABEL: vector_nested_diamond: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: B.IOR +; ASM: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; ASM: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb2}} +; ASM: v.lw.brg +; ASM: v.sw.brg +; ASM: C.BSTOP +; ASM-NOT: v.rdor + +; REMARK: "function":"vector_inner_diamond" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-diamond" + +; REMARK: "function":"vector_nested_diamond" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-diamond" diff --git a/llvm/test/CodeGen/LinxISA/autovec_ifconverted_ptr_sink.ll 
b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_ptr_sink.ll new file mode 100644 index 0000000000000..3fc02c64e5af2 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_ptr_sink.ll @@ -0,0 +1,50 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s --check-prefix=ASM +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_ptr_sink_diamond(ptr nocapture %a, ptr nocapture %c, + ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %merge ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %cond = fcmp ogt float %bv, 0.000000e+00 + br i1 %cond, label %then, label %else + +then: + br label %merge + +else: + br label %merge + +merge: + %dst.base = phi ptr [ %a, %then ], [ %c, %else ] + %out = select i1 %cond, float 4.000000e+00, float 2.000000e+00 + %dst = getelementptr inbounds float, ptr %dst.base, i64 %i + store float %out, ptr %dst, align 4 + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; ASM-LABEL: vector_ptr_sink_diamond: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}64,{{.*->lb1}} +; ASM: v.flt +; ASM: v.csel +; ASM: v.cmp.eq +; ASM: b.nz +; ASM: v.sw.brg +; ASM-NOT: v.rdor + +; REMARK: "function":"vector_ptr_sink_diamond" +; REMARK: "layout_kind":"scalar-replay" +; REMARK: "cf_strategy":"if-converted-diamond" diff --git a/llvm/test/CodeGen/LinxISA/autovec_ifconverted_select.ll b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_select.ll new file mode 100644 index 0000000000000..a96d69e75d8fa --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_select.ll @@ -0,0 +1,78 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 
--linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s --check-prefix=ASM +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_inner_select(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %ap = getelementptr inbounds float, ptr %a, i64 %i + %add = fadd float %bv, 1.000000e+00 + %neg = fsub float 0.000000e+00, %bv + %cond = fcmp ogt float %bv, 0.000000e+00 + %sel = select i1 %cond, float %add, float %neg + store float %sel, ptr %ap, align 4 + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +define void @vector_nested_select(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %ap = getelementptr inbounds float, ptr %a, i64 %i + %gt10 = fcmp ogt float %bv, 1.000000e+01 + %hi = select i1 %gt10, float 1.000000e+00, float 2.000000e+00 + %gt0 = fcmp ogt float %bv, 0.000000e+00 + %sel = select i1 %gt0, float %hi, float 3.000000e+00 + store float %sel, ptr %ap, align 4 + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; ASM-LABEL: vector_inner_select: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; ASM: v.flt +; ASM: v.csel +; ASM: v.sw.brg +; ASM-NOT: b.nz +; ASM-NOT: v.rdor + +; ASM-LABEL: vector_nested_select: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; ASM: v.flt +; ASM: v.csel +; ASM: v.csel +; ASM: v.sw.brg +; ASM-NOT: 
b.nz +; ASM-NOT: v.rdor + +; REMARK: "function":"vector_inner_select" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" + +; REMARK: "function":"vector_nested_select" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" diff --git a/llvm/test/CodeGen/LinxISA/autovec_ifconverted_store_diamond.ll b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_store_diamond.ll new file mode 100644 index 0000000000000..daa03ddf8ab2f --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_ifconverted_store_diamond.ll @@ -0,0 +1,48 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s --check-prefix=ASM +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_store_diamond(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %merge ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %ap = getelementptr inbounds float, ptr %a, i64 %i + %cond = fcmp ogt float %bv, 0.000000e+00 + br i1 %cond, label %then, label %else + +then: + %t = fadd float %bv, 1.000000e+00 + store float %t, ptr %ap, align 4 + br label %merge + +else: + %e = fsub float 0.000000e+00, %bv + store float %e, ptr %ap, align 4 + br label %merge + +merge: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; ASM-LABEL: vector_store_diamond: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; ASM: v.flt +; ASM: v.csel +; ASM: v.sw.brg +; ASM-NOT: v.rdor + +; REMARK: "function":"vector_store_diamond" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-diamond" diff --git 
a/llvm/test/CodeGen/LinxISA/autovec_inner_mask_branch.ll b/llvm/test/CodeGen/LinxISA/autovec_inner_mask_branch.ll new file mode 100644 index 0000000000000..5225b42008542 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_inner_mask_branch.ll @@ -0,0 +1,50 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_inner_if(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %join ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %ap = getelementptr inbounds float, ptr %a, i64 %i + %cond = fcmp ogt float %bv, 0.000000e+00 + br i1 %cond, label %then, label %else + +then: + %t = fadd float %bv, 1.000000e+00 + store float %t, ptr %ap, align 4 + br label %join + +else: + %e = fsub float 0.000000e+00, %bv + store float %e, ptr %ap, align 4 + br label %join + +join: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: vector_inner_if: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.flt +; CHECK: v.csel +; CHECK: v.sw.brg +; CHECK-NOT: b.nz +; CHECK-NOT: v.rdor + +; REMARK: "function":"vector_inner_if" +; REMARK: "single_block":true +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" diff --git a/llvm/test/CodeGen/LinxISA/autovec_invariant_store_value.ll b/llvm/test/CodeGen/LinxISA/autovec_invariant_store_value.ll new file mode 100644 index 0000000000000..674c158f6f7d3 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_invariant_store_value.ll @@ -0,0 +1,74 @@ +; RUN: llc -mtriple=linx64 -O2 
\ +; RUN: --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq \ +; RUN: --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 \ +; RUN: < %s | FileCheck %s + +define void @fill_i32(ptr nocapture writeonly %buf, i32 %value) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ptr = getelementptr inbounds i32, ptr %buf, i64 %i + store i32 %value, ptr %ptr, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @fill_i8(ptr nocapture writeonly %buf, i8 %value) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ptr = getelementptr inbounds i8, ptr %buf, i64 %i + store i8 %value, ptr %ptr, align 1 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @fill_i16(ptr nocapture writeonly %buf, i16 %value) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ptr = getelementptr inbounds i16, ptr %buf, i64 %i + store i16 %value, ptr %ptr, align 2 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: fill_i32: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.add zero, ri{{[0-9]+}}{{(\.sw)?}}, ->vt#{{[0-9]+}} +; CHECK: v.sw.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0<<2, vt#{{[0-9]+}}<<2] + +; CHECK-LABEL: fill_i8: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.add zero, ri{{[0-9]+}}{{(\.sw)?}}, ->vt#{{[0-9]+}} +; CHECK: v.sb.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0, vt#{{[0-9]+}}] + +; CHECK-LABEL: fill_i16: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; 
CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.add zero, ri{{[0-9]+}}{{(\.sw)?}}, ->vt#{{[0-9]+}} +; CHECK: v.sh.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0<<1, vt#{{[0-9]+}}<<1] diff --git a/llvm/test/CodeGen/LinxISA/autovec_layout_policy.ll b/llvm/test/CodeGen/LinxISA/autovec_layout_policy.ll new file mode 100644 index 0000000000000..6a166fc5926e6 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_layout_policy.ll @@ -0,0 +1,97 @@ +; RUN: rm -f %t.auto.remarks.json %t.reject.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.auto.remarks.json < %s | FileCheck %s --check-prefix=AUTO +; RUN: FileCheck %s --check-prefix=REMARK-AUTO < %t.auto.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-layout=scalar-replay --linx-simt-autovec-lanes=32 < %s | FileCheck %s --check-prefix=SCALAR +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.reject.remarks.json < %s > /dev/null +; RUN: FileCheck %s --check-prefix=REMARK-REJECT < %t.reject.remarks.json + +define void @single_group(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %sum = fadd float %av, %bv + %cp = getelementptr inbounds float, ptr %c, i64 %i + store float %sum, ptr %cp, align 4 + %inc = add nuw i64 %i, 1 + %cmp = icmp ult i64 %inc, 8 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +define void @strip_mined(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ 
%inc, %loop ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %sum = fadd float %av, %bv + %cp = getelementptr inbounds float, ptr %c, i64 %i + store float %sum, ptr %cp, align 4 + %inc = add nuw i64 %i, 1 + %cmp = icmp ult i64 %inc, 128 + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +define void @dynamic_tripcount(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c, i64 %n) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %sum = fadd float %av, %bv + %cp = getelementptr inbounds float, ptr %c, i64 %i + store float %sum, ptr %cp, align 4 + %inc = add nuw i64 %i, 1 + %cmp = icmp ult i64 %inc, %n + br i1 %cmp, label %loop, label %exit + +exit: + ret void +} + +; AUTO-LABEL: single_group: +; AUTO: BSTART.MSEQ +; AUTO: C.B.DIMI{{[[:space:]]+}}8,{{.*->lb0}} +; AUTO: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb1}} + +; AUTO-LABEL: strip_mined: +; AUTO: BSTART.MSEQ +; AUTO: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; AUTO: C.B.DIMI{{[[:space:]]+}}4,{{.*->lb1}} +; AUTO: v.add lc0, lc1{{(\.uw)?}}<<5 + +; SCALAR-LABEL: single_group: +; SCALAR: BSTART.MSEQ +; SCALAR: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb0}} +; SCALAR: C.B.DIMI{{[[:space:]]+}}8,{{.*->lb1}} + +; REMARK-AUTO: "function":"single_group" +; REMARK-AUTO: "layout_policy":"auto" +; REMARK-AUTO: "layout_kind":"grouped-single-group" +; REMARK-AUTO: "function":"strip_mined" +; REMARK-AUTO: "layout_policy":"auto" +; REMARK-AUTO: "layout_kind":"grouped-strip-mined" + +; REMARK-REJECT: "function":"dynamic_tripcount" +; REMARK-REJECT: "status":"reject" +; REMARK-REJECT: "reason":"grouped_layout_requires_static_tripcount" +; REMARK-REJECT: "layout_policy":"grouped" diff --git 
a/llvm/test/CodeGen/LinxISA/autovec_narrow_load_copy.ll b/llvm/test/CodeGen/LinxISA/autovec_narrow_load_copy.ll new file mode 100644 index 0000000000000..88badda4c25eb --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_narrow_load_copy.ll @@ -0,0 +1,164 @@ +; RUN: llc -mtriple=linx64 -O2 \ +; RUN: --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq \ +; RUN: --linx-simt-autovec-layout=grouped --linx-simt-autovec-lanes=32 \ +; RUN: < %s | FileCheck %s + +define void @copy_u8(ptr nocapture readonly %src, ptr nocapture writeonly %dst) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %src.ptr = getelementptr inbounds i8, ptr %src, i64 %i + %dst.ptr = getelementptr inbounds i8, ptr %dst, i64 %i + %val = load i8, ptr %src.ptr, align 1 + store i8 %val, ptr %dst.ptr, align 1 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @copy_u16(ptr nocapture readonly %src, ptr nocapture writeonly %dst) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %src.ptr = getelementptr inbounds i16, ptr %src, i64 %i + %dst.ptr = getelementptr inbounds i16, ptr %dst, i64 %i + %val = load i16, ptr %src.ptr, align 2 + store i16 %val, ptr %dst.ptr, align 2 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @widen_i8_to_i32(ptr nocapture readonly %src, ptr nocapture writeonly %dst) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %src.ptr = getelementptr inbounds i8, ptr %src, i64 %i + %dst.ptr = getelementptr inbounds i32, ptr %dst, i64 %i + %val = load i8, ptr %src.ptr, align 1 + %wide = sext i8 %val to i32 + store i32 %wide, ptr %dst.ptr, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + 
ret void +} + +define void @widen_i16_to_i32(ptr nocapture readonly %src, ptr nocapture writeonly %dst) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %src.ptr = getelementptr inbounds i16, ptr %src, i64 %i + %dst.ptr = getelementptr inbounds i32, ptr %dst, i64 %i + %val = load i16, ptr %src.ptr, align 2 + %wide = sext i16 %val to i32 + store i32 %wide, ptr %dst.ptr, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @sign_classify_i8(ptr nocapture readonly %src, ptr nocapture writeonly %dst) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %src.ptr = getelementptr inbounds i8, ptr %src, i64 %i + %dst.ptr = getelementptr inbounds i32, ptr %dst, i64 %i + %val = load i8, ptr %src.ptr, align 1 + %neg = icmp slt i8 %val, 0 + %out = select i1 %neg, i32 -1, i32 1 + store i32 %out, ptr %dst.ptr, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @sign_classify_i16(ptr nocapture readonly %src, ptr nocapture writeonly %dst) local_unnamed_addr { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %src.ptr = getelementptr inbounds i16, ptr %src, i64 %i + %dst.ptr = getelementptr inbounds i32, ptr %dst, i64 %i + %val = load i16, ptr %src.ptr, align 2 + %neg = icmp slt i16 %val, 0 + %out = select i1 %neg, i32 -1, i32 1 + store i32 %out, ptr %dst.ptr, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +; CHECK-LABEL: copy_u8: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lbu.brg [ri{{[0-9]+}}, lc0, vt#{{[0-9]+}}], ->vt#{{[0-9]+}} +; CHECK: v.sb.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0, 
vt#{{[0-9]+}}] + +; CHECK-LABEL: copy_u16: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lhu.brg [ri{{[0-9]+}}, lc0<<1, vt#{{[0-9]+}}<<1], ->vt#{{[0-9]+}} +; CHECK: v.sh.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0<<1, vt#{{[0-9]+}}<<1] + +; CHECK-LABEL: widen_i8_to_i32: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lb.brg [ri{{[0-9]+}}, lc0, vt#{{[0-9]+}}], ->vt#{{[0-9]+}} +; CHECK: v.sw.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0<<2, vt#{{[0-9]+}}<<2] + +; CHECK-LABEL: widen_i16_to_i32: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lh.brg [ri{{[0-9]+}}, lc0<<1, vt#{{[0-9]+}}<<1], ->vt#{{[0-9]+}} +; CHECK: v.sw.brg vt#{{[0-9]+}}, [ri{{[0-9]+}}, lc0<<2, vt#{{[0-9]+}}<<2] + +; CHECK-LABEL: sign_classify_i8: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lb.brg [ri{{[0-9]+}}, lc0, vt#{{[0-9]+}}], ->vt#{{[0-9]+}} +; CHECK: v.cmp.lt +; CHECK: v.csel +; CHECK: v.sw.brg + +; CHECK-LABEL: sign_classify_i16: +; CHECK: BSTART.MSEQ +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.lh.brg [ri{{[0-9]+}}, lc0<<1, vt#{{[0-9]+}}<<1], ->vt#{{[0-9]+}} +; CHECK: v.cmp.lt +; CHECK: v.csel +; CHECK: v.sw.brg diff --git a/llvm/test/CodeGen/LinxISA/autovec_nested_mask_branch.ll b/llvm/test/CodeGen/LinxISA/autovec_nested_mask_branch.ll new file mode 100644 index 0000000000000..54cff07f33713 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_nested_mask_branch.ll @@ -0,0 +1,58 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-lanes=32 --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s +; RUN: 
FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_nested_if(ptr nocapture %a, ptr nocapture %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %join ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %ap = getelementptr inbounds float, ptr %a, i64 %i + %cond0 = fcmp ogt float %bv, 0.000000e+00 + br i1 %cond0, label %then0, label %else0 + +then0: + %cond1 = fcmp ogt float %bv, 1.000000e+01 + br i1 %cond1, label %then1, label %else1 + +then1: + store float 1.000000e+00, ptr %ap, align 4 + br label %join + +else1: + store float 2.000000e+00, ptr %ap, align 4 + br label %join + +else0: + store float 3.000000e+00, ptr %ap, align 4 + br label %join + +join: + %inc = add nuw i64 %i, 1 + %done = icmp ult i64 %inc, 64 + br i1 %done, label %loop, label %exit + +exit: + ret void +} + +; CHECK-LABEL: vector_nested_if: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT +; CHECK: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; CHECK: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; CHECK: v.flt +; CHECK: v.csel +; CHECK: v.flt zero, +; CHECK: v.csel +; CHECK: v.sw.brg +; CHECK-NOT: b.nz +; CHECK-NOT: v.rdor + +; REMARK: "function":"vector_nested_if" +; REMARK: "single_block":true +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" diff --git a/llvm/test/CodeGen/LinxISA/autovec_nested_mask_branch.s b/llvm/test/CodeGen/LinxISA/autovec_nested_mask_branch.s new file mode 100644 index 0000000000000..d55efceb2f139 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_nested_mask_branch.s @@ -0,0 +1,64 @@ + .file "autovec_nested_mask_branch.ll" + .text + .globl vector_nested_if # -- Begin function vector_nested_if + .p2align 1 + .type vector_nested_if,@function +vector_nested_if: # @vector_nested_if +# %bb.0: # %entry +FENTRY [ra ~ ra], sp!, 8 +# %bb.1: # %entry +C.BSTART.STD +addi zero, 64, ->a2 +c.movi 1, ->a3 +lui 263168, ->a4 +lui 262144, ->a5 +lui 260096, ->a6 +lui 
266752, ->a7 +# %bb.3: # %entry +C.BSTART.STD +BSTART.MSEQ VS8 +B.TEXT .__linx_vblock_body.0 +B.IOR [a1,a7,a0],[] +B.IOR [a6,a0,a5],[] +B.IOR [a0,a4,a3],[] +B.IOR [a2],[] +C.B.DIMI 32, ->lb0 +C.B.DIMI 2, ->lb1 +C.B.DIMI 1, ->lb2 +# %bb.2: # %entry +FRET.STK [ra ~ ra], sp!, 8 +# %bb.4: +.__linx_vblock_body.0: # EH_LABEL +v.add lc0, lc1<<5, ->vt#1 +v.sub vt#1, lc0, ->vt#2 +v.lw.brg [ri0, lc0<<2, vt#2<<2], ->vt#3 +v.flt zero, vt#3, ->p +b.nz .__linx_vbody_vblock_body.0.L1 +j .__linx_vbody_vblock_body.0.L4 +.__linx_vbody_vblock_body.0.L1: # EH_LABEL +v.flt ri1, vt#3, ->p +b.nz .__linx_vbody_vblock_body.0.L2 +j .__linx_vbody_vblock_body.0.L3 +.__linx_vbody_vblock_body.0.L2: # EH_LABEL +v.sw.brg ri3, [ri2, lc0<<2, vt#2<<2] +j .__linx_vbody_vblock_body.0.L5 +.__linx_vbody_vblock_body.0.L3: # EH_LABEL +v.sw.brg ri5, [ri4, lc0<<2, vt#2<<2] +j .__linx_vbody_vblock_body.0.L5 +.__linx_vbody_vblock_body.0.L4: # EH_LABEL +v.sw.brg ri7, [ri6, lc0<<2, vt#2<<2] +j .__linx_vbody_vblock_body.0.L5 +.__linx_vbody_vblock_body.0.L5: # EH_LABEL +v.add vt#1, ri8, ->vt#4 +v.cmp.ltu vt#4, ri9, ->p +b.nz .__linx_vbody_vblock_body.0.L_end +j .__linx_vbody_vblock_body.0.L5_exit1 +.__linx_vbody_vblock_body.0.L5_exit1: # EH_LABEL +j .__linx_vbody_vblock_body.0.L_end +.__linx_vbody_vblock_body.0.L_end: # EH_LABEL +C.BSTOP +.__linx_vblock_body.0.end: # EH_LABEL +.Lfunc_end0: + .size vector_nested_if, .Lfunc_end0-vector_nested_if + # -- End function + .section ".note.GNU-stack","",@progbits diff --git a/llvm/test/CodeGen/LinxISA/autovec_reduction_liveout.ll b/llvm/test/CodeGen/LinxISA/autovec_reduction_liveout.ll index 2f1ffc441d0bb..b3dece9aa2b6b 100644 --- a/llvm/test/CodeGen/LinxISA/autovec_reduction_liveout.ll +++ b/llvm/test/CodeGen/LinxISA/autovec_reduction_liveout.ll @@ -25,7 +25,13 @@ exit: ; preds = %loop ; CHECK-LABEL: sum_liveout: ; CHECK: BSTART.MSEQ ; CHECK: B.IOR +; CHECK: B.IOTI +; CHECK: B.IOTI ; CHECK: C.B.DIMI 32, ->lb0 ; CHECK: C.B.DIMI 2, ->lb1 +; CHECK: v.lw.brg +; CHECK: c.movr 
; CHECK: v.rdfadd -; CHECK: v.sw.brg a{{[0-5]}}, [ri +; CHECK: v.sw.brg.local a{{[0-5]}}, [ts +; CHECK: v.lw.brg.local [ts +; CHECK: v.sw.brg diff --git a/llvm/test/CodeGen/LinxISA/autovec_tsvc_control_flow.ll b/llvm/test/CodeGen/LinxISA/autovec_tsvc_control_flow.ll new file mode 100644 index 0000000000000..6abca488851ea --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_tsvc_control_flow.ll @@ -0,0 +1,207 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-remarks=%t.remarks.json < %s > /dev/null +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +; TSVC control-flow inspired shapes. These are intentionally compiler-boundary +; checks, not grouped runtime positives. + +define void @masked_update_mul(ptr nocapture %a, + ptr nocapture readonly %b, + ptr nocapture readonly %c) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %cond = fcmp ogt float %bv, 0.0 + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %cp = getelementptr inbounds float, ptr %c, i64 %i + %cv = load float, ptr %cp, align 4 + %mul = fmul float %bv, %cv + %sum = fadd float %av, %mul + %out = select i1 %cond, float %sum, float %av + store float %out, ptr %ap, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @nested_dual_update(ptr nocapture %a, + ptr nocapture %b, + ptr nocapture %c, + ptr nocapture readonly %d, + ptr nocapture readonly %e, + i64 %x) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %join2 ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %cmp0 = fcmp ogt float %av, %bv 
+ br i1 %cmp0, label %then0, label %else0 + +then0: + %dp0 = getelementptr inbounds float, ptr %d, i64 %i + %dv0 = load float, ptr %dp0, align 4 + %mul0 = fmul float %bv, %dv0 + %an = fadd float %av, %mul0 + store float %an, ptr %ap, align 4 + %cmp1 = icmp sgt i64 64, 10 + br i1 %cmp1, label %then1, label %else1 + +then1: + %cp1 = getelementptr inbounds float, ptr %c, i64 %i + %cv1 = load float, ptr %cp1, align 4 + %mul1 = fmul float %dv0, %dv0 + %cn1 = fadd float %cv1, %mul1 + store float %cn1, ptr %cp1, align 4 + br label %join2 + +else1: + %ep1 = getelementptr inbounds float, ptr %e, i64 %i + %ev1 = load float, ptr %ep1, align 4 + %mul2 = fmul float %dv0, %ev1 + %cn2 = fadd float %mul2, 1.0 + %cp2 = getelementptr inbounds float, ptr %c, i64 %i + store float %cn2, ptr %cp2, align 4 + br label %join2 + +else0: + %ep0 = getelementptr inbounds float, ptr %e, i64 %i + %ev0 = load float, ptr %ep0, align 4 + %mul3 = fmul float %ev0, %ev0 + %bn = fadd float %av, %mul3 + store float %bn, ptr %bp, align 4 + %cmp2 = icmp sgt i64 %x, 0 + br i1 %cmp2, label %then2, label %else2 + +then2: + %dp2 = getelementptr inbounds float, ptr %d, i64 %i + %dv2 = load float, ptr %dp2, align 4 + %mul4 = fmul float %dv2, %dv2 + %cn3 = fadd float %av, %mul4 + %cp3 = getelementptr inbounds float, ptr %c, i64 %i + store float %cn3, ptr %cp3, align 4 + br label %join2 + +else2: + %cp4 = getelementptr inbounds float, ptr %c, i64 %i + %cv4 = load float, ptr %cp4, align 4 + %mul5 = fmul float %ev0, %ev0 + %cn4 = fadd float %cv4, %mul5 + store float %cn4, ptr %cp4, align 4 + br label %join2 + +join2: + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit2, label %loop + +exit2: + ret void +} + +define void @independent_conditional_dual_store(ptr nocapture %a, + ptr nocapture %b, + ptr nocapture readonly %c, + ptr nocapture readonly %d, + ptr nocapture readonly %e, + float %t) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ep = 
getelementptr inbounds float, ptr %e, i64 %i + %ev = load float, ptr %ep, align 4 + %cond = fcmp oge float %ev, %t + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %cp = getelementptr inbounds float, ptr %c, i64 %i + %cv = load float, ptr %cp, align 4 + %dp = getelementptr inbounds float, ptr %d, i64 %i + %dv = load float, ptr %dp, align 4 + %cmul = fmul float %cv, %dv + %an = fadd float %av, %cmul + %outa = select i1 %cond, float %an, float %av + store float %outa, ptr %ap, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %csq = fmul float %cv, %cv + %bn = fadd float %bv, %csq + %outb = select i1 %cond, float %bn, float %bv + store float %outb, ptr %bp, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit3, label %loop + +exit3: + ret void +} + +define void @dependent_conditional_update(ptr nocapture %a, + ptr nocapture %b, + ptr nocapture %c, + ptr nocapture readonly %d, + ptr nocapture readonly %e) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %cp = getelementptr inbounds float, ptr %c, i64 %i + %cv = load float, ptr %cp, align 4 + %ep = getelementptr inbounds float, ptr %e, i64 %i + %ev = load float, ptr %ep, align 4 + %dp = getelementptr inbounds float, ptr %d, i64 %i + %dv = load float, ptr %dp, align 4 + %mul0 = fmul float %ev, %dv + %newa = fadd float %cv, %mul0 + %ap = getelementptr inbounds float, ptr %a, i64 %i + store float %newa, ptr %ap, align 4 + %cond = fcmp ogt float %newa, 0.0 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %newb = fadd float %newa, %bv + %outa = select i1 %cond, float %newa, float %mul0 + %outb = select i1 %cond, float %newb, float %bv + store float %outa, ptr %ap, align 4 + store float %outb, ptr %bp, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit4, label %loop + 
+exit4: + ret void +} + +; REMARK: "function":"masked_update_mul" +; REMARK: "status":"reject" +; REMARK: "reason":"unsupported_value_expr:select" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" +; REMARK: "function":"nested_dual_update" +; REMARK: "status":"reject" +; REMARK: "reason":"unsupported_value_expr:fadd" +; REMARK: "layout_kind":"scalar-replay" +; REMARK: "cf_strategy":"exec-mask-save-restore-required" +; REMARK: "function":"independent_conditional_dual_store" +; REMARK: "status":"reject" +; REMARK: "reason":"unsupported_value_expr:select" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" +; REMARK: "function":"dependent_conditional_update" +; REMARK: "status":"reject" +; REMARK: "reason":"unsupported_value_expr:select" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" diff --git a/llvm/test/CodeGen/LinxISA/autovec_tsvc_extra.ll b/llvm/test/CodeGen/LinxISA/autovec_tsvc_extra.ll new file mode 100644 index 0000000000000..9d062243ea456 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/autovec_tsvc_extra.ll @@ -0,0 +1,179 @@ +; RUN: rm -f %t.remarks.json +; RUN: llc -mtriple=linx64 -O2 --linx-simt-autovec=1 --linx-simt-autovec-mode=mseq --linx-simt-autovec-remarks=%t.remarks.json < %s | FileCheck %s --check-prefix=ASM +; RUN: FileCheck %s --check-prefix=REMARK < %t.remarks.json + +define void @vector_shift_half_index(ptr nocapture readonly %b, + ptr nocapture readonly %c, + ptr nocapture writeonly %a) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %bp = getelementptr inbounds i32, ptr %b, i64 %i + %bv = load i32, ptr %bp, align 4 + %half = lshr i64 %i, 1 + %cp = getelementptr inbounds i32, ptr %c, i64 %half + %cv = load i32, ptr %cp, align 4 + %sum = add nsw i32 %bv, %cv + %ap = getelementptr inbounds i32, ptr %a, i64 %i + store i32 %sum, ptr %ap, align 4 + %inc = add nuw nsw 
i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @vector_min_select_store(ptr nocapture readonly %a, + ptr nocapture readonly %b, + ptr nocapture writeonly %out) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ap = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %gt = fcmp ogt float %av, %bv + %minv = select i1 %gt, float %bv, float %av + %op = getelementptr inbounds float, ptr %out, i64 %i + store float %minv, ptr %op, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 64 + br i1 %done, label %exit, label %loop + +exit: + ret void +} + +define void @vector_shifted_out_store(ptr nocapture %a, + ptr nocapture readonly %b) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ap0 = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap0, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %sum = fadd float %av, %bv + %outi = add nuw nsw i64 %i, 32 + %ap1 = getelementptr inbounds float, ptr %a, i64 %outi + store float %sum, ptr %ap1, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp eq i64 %inc, 32 + br i1 %done, label %exit1, label %loop + +exit1: + ret void +} + +define void @vector_shifted_out_param(ptr nocapture %a, + ptr nocapture readonly %b, + i64 %m) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %inc, %loop ] + %ap0 = getelementptr inbounds float, ptr %a, i64 %i + %av = load float, ptr %ap0, align 4 + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %sum = fadd float %av, %bv + %outi = add nuw nsw i64 %i, %m + %ap1 = getelementptr inbounds float, ptr %a, i64 %outi + store float %sum, ptr %ap1, align 4 + %inc = add nuw nsw i64 %i, 1 + %done = icmp ult i64 
%inc, %m + br i1 %done, label %loop, label %exit2 + +exit2: + ret void +} + +define void @vector_stride_inc(ptr nocapture %a, + ptr nocapture readonly %b, + i64 %incv) { +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %next, %loop ] + %bp = getelementptr inbounds float, ptr %b, i64 %i + %bv = load float, ptr %bp, align 4 + %ap = getelementptr inbounds float, ptr %a, i64 %i + store float %bv, ptr %ap, align 4 + %next = add nuw nsw i64 %i, %incv + %done = icmp ult i64 %next, 64 + br i1 %done, label %loop, label %exit3 + +exit3: + ret void +} + +; REMARK: "function":"vector_shift_half_index" +; REMARK: "status":"reject" +; REMARK: "reason":"non_float_store_value" +; REMARK: "layout_kind":"scalar-replay" +; REMARK: "cf_strategy":"straight-line-single-block" +; REMARK: "function":"vector_min_select_store" +; REMARK: "status":"lowered" +; REMARK: "reason":"lowered_vblock_mseq_affine" +; REMARK: "layout_kind":"grouped-strip-mined" +; REMARK: "cf_strategy":"if-converted-single-block" +; REMARK: "function":"vector_shifted_out_store" +; REMARK: "status":"lowered" +; REMARK: "layout_kind":"grouped-single-group" +; REMARK: "cf_strategy":"straight-line-single-block" +; REMARK: "function":"vector_shifted_out_param" +; REMARK: "status":"lowered" +; REMARK: "layout_kind":"scalar-replay" +; REMARK: "cf_strategy":"straight-line-single-block" +; REMARK: "function":"vector_stride_inc" +; REMARK: "status":"lowered" +; REMARK: "layout_kind":"scalar-replay" +; REMARK: "cf_strategy":"straight-line-single-block" + +; ASM-LABEL: vector_shift_half_index: +; ASM: C.BSTART.STD +; ASM-NOT: BSTART.MSEQ +; ASM: addw +; ASM: c.swi + +; ASM-LABEL: vector_min_select_store: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}2,{{.*->lb1}} +; ASM: v.flt +; ASM: v.csel +; ASM: v.sw.brg +; ASM-NOT: v.rdor + +; ASM-LABEL: vector_shifted_out_store: +; ASM: BSTART.MSEQ +; ASM: B.TEXT +; ASM: 
C.B.DIMI{{[[:space:]]+}}32,{{.*->lb0}} +; ASM: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb1}} +; ASM: v.fadd +; ASM: v.sw.brg + +; ASM-LABEL: vector_shifted_out_param: +; ASM: BSTART.MSEQ +; ASM: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb0}} +; ASM: B.DIM +; ASM: v.fadd +; ASM: v.sw.brg + +; ASM-LABEL: vector_stride_inc: +; ASM: BSTART.MSEQ +; ASM: C.B.DIMI{{[[:space:]]+}}1,{{.*->lb0}} +; ASM: B.DIM{{[[:space:]]+}}a3,{{[[:space:]]+}}0,{{[[:space:]]+}}->lb1 +; ASM: v.mul +; ASM: v.sw.brg diff --git a/llvm/test/CodeGen/LinxISA/blockify-setc-imm-liveout.mir b/llvm/test/CodeGen/LinxISA/blockify-setc-imm-liveout.mir new file mode 100644 index 0000000000000..c6829ba30fabf --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/blockify-setc-imm-liveout.mir @@ -0,0 +1,90 @@ +# RUN: llc -mtriple=linx64 -run-pass=linx-blockify -o - %s | FileCheck %s +# +# Ensure the local `ADDWrr dst, src, zero` + `SETC_*I dst, imm` fold does not +# erase the defining copy when `dst` remains live-out to successor blocks. +# +# CHECK-LABEL: name: setc_imm_addw_liveout +# CHECK: $r7 = ADDWrr +# CHECK-NOT: SETC_GEI $t1, 2 +# CHECK: SETC_GEI $r7, 2 +# CHECK: bb.1.entry: +# CHECK: liveins: $r2, $r3, $r4, $r5, $r6, $r7 + +--- | + define dso_local void @setc_imm_addw_liveout(ptr %cur, ptr %valp, ptr %callback) { + entry: + ret void + } +... 
+--- +name: setc_imm_addw_liveout +alignment: 2 +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +isSSA: false +noVRegs: true +liveins: + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } + - { reg: '$r4', virtual-reg: '' } +frameInfo: + stackSize: 16 + adjustsStack: true + hasCalls: false + isCalleeSavedInfoValid: true +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -8, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '$r10', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -16, size: 8, alignment: 8, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +body: | + bb.0.entry: + successors: %bb.1(0x40000000), %bb.4(0x40000000) + liveins: $r2, $r3, $r4 + + FENTRY 10, 10, 16, implicit-def $r1 + $r5 = ADDIri $r2, 0 + $r2 = nuw ADDIri $r5, 80 + $r6 = LWI $r5, 10 + $r7 = ADDWrr killed $r6, $r0 + $r8 = ADDIri $r0, 1 + $r6 = ADDIri $r0, 0 + BLT $r8, $r7, %bb.4 + + bb.1.entry: + successors: %bb.3(0x40000000), %bb.2(0x40000000) + liveins: $r2, $r3, $r4, $r5, $r6, $r7 + + BEQ $r7, $r6, %bb.3 + JUMP %bb.2 + + bb.3: + successors: %bb.2(0x80000000) + liveins: $r1 + + FRET_STK 10, 10, 16, implicit-def $r1, implicit-def $r10, implicit $r1 + + bb.4.entry: + successors: %bb.2(0x40000000), %bb.5(0x40000000) + liveins: $r2, $r3, $r4, $r5, $r6, $r7 + + $r8 = ADDIri $r0, 2 + BEQ $r7, $r8, %bb.2 + JUMP %bb.5 + + bb.2: + liveins: $r1 + + FRET_STK 10, 10, 16, implicit-def $r1, implicit-def $r10, implicit $r1 + + bb.5: + liveins: $r1 + + FRET_STK 10, 10, 16, implicit-def $r1, implicit-def $r10, implicit $r1 +... 
diff --git a/llvm/test/CodeGen/LinxISA/dynamic-stackalloc.ll b/llvm/test/CodeGen/LinxISA/dynamic-stackalloc.ll index 18a8cc373f96c..22da5c8af6f29 100644 --- a/llvm/test/CodeGen/LinxISA/dynamic-stackalloc.ll +++ b/llvm/test/CodeGen/LinxISA/dynamic-stackalloc.ll @@ -9,6 +9,10 @@ entry: } ; CHECK-LABEL: f: +; CHECK: FENTRY +; CHECK: c.movr sp, ->s7 ; CHECK: addi ; CHECK: sub sp, ; CHECK: ->sp +; CHECK: c.movr s7, ->sp +; CHECK: FRET.STK diff --git a/llvm/test/CodeGen/LinxISA/stacksave-expand.ll b/llvm/test/CodeGen/LinxISA/stacksave-expand.ll index ade07fa3567ae..6e31025f5d64c 100644 --- a/llvm/test/CodeGen/LinxISA/stacksave-expand.ll +++ b/llvm/test/CodeGen/LinxISA/stacksave-expand.ll @@ -6,6 +6,8 @@ declare void @llvm.stackrestore(ptr) define i64 @stacksave_roundtrip(i64 %n) { ; CHECK-LABEL: stacksave_roundtrip: ; CHECK: FENTRY +; CHECK: c.movr sp, ->s7 +; CHECK: c.movr s7, ->sp ; CHECK: FRET.STK entry: %sp = call ptr @llvm.stacksave() diff --git a/llvm/test/CodeGen/LinxISA/vblock-launch-reject-archived-base.ll b/llvm/test/CodeGen/LinxISA/vblock-launch-reject-archived-base.ll new file mode 100644 index 0000000000000..62d3552105df7 --- /dev/null +++ b/llvm/test/CodeGen/LinxISA/vblock-launch-reject-archived-base.ll @@ -0,0 +1,17 @@ +; RUN: not llc -mtriple=linx64 < %s 2>&1 | FileCheck %s + +declare void @llvm.linx.vblock.launch(i32, ptr, i64, i64, i64, i32, + i64, i64, i64, i64, i64, i64, + i64, i64, i64, i64, i64, i64) + +define void @reject_archived_raw_base() #0 { +entry: + call void @llvm.linx.vblock.launch(i32 0, ptr null, i64 8, i64 1, i64 1, i32 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) + ret void +} + +attributes #0 = { "linx-vblock-body-asm"=" v.lwi.u.local [to1, lc0.uh<<2, 8], ->vn.w\0A C.BSTOP\0A" } + +; CHECK: archived raw vector operand name is not allowed in canonical v0.4; use TA/TB/TC/TD/TO/TS diff --git a/llvm/test/CodeGen/LinxISA/vblock-launch.ll b/llvm/test/CodeGen/LinxISA/vblock-launch.ll index 
d2320e3ca129a..364c8bb7eb55a 100644 --- a/llvm/test/CodeGen/LinxISA/vblock-launch.ll +++ b/llvm/test/CodeGen/LinxISA/vblock-launch.ll @@ -36,6 +36,38 @@ entry: ret void } +define void @vseq_local_scratch() #1 { +entry: + call void @llvm.linx.vblock.launch(i32 0, ptr null, i64 8, i64 1, i64 1, i32 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) + ret void +} + +define void @vseq_typed_body() #2 { +entry: + call void @llvm.linx.vblock.launch(i32 0, ptr null, i64 8, i64 2, i64 1, i32 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) + ret void +} + +define void @vseq_local_scratch_dword() #3 { +entry: + call void @llvm.linx.vblock.launch(i32 0, ptr null, i64 8, i64 1, i64 1, i32 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) + ret void +} + +define void @vseq_reuse_body() #4 { +entry: + call void @llvm.linx.vblock.launch(i32 0, ptr null, i64 8, i64 1, i64 1, i32 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, + i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) + ret void +} + ; CHECK-LABEL: vseq: ; CHECK: BSTART.MSEQ ; CHECK-NEXT: B.TEXT {{\.__linx_vblock_body\.[0-9]+}} @@ -44,6 +76,7 @@ entry: ; CHECK: C.B.DIMI{{[[:space:]]+}}4,{{.*->lb2}} ; CHECK: {{^\.__linx_vblock_body\.[0-9]+:}} ; CHECK: C.BSTOP +; CHECK: {{^\.__linx_vblock_body\.[0-9]+\.end:}} ; CHECK-LABEL: vseq_rdc: ; CHECK: BSTART.MSEQ @@ -51,6 +84,7 @@ entry: ; CHECK: {{^\.__linx_vblock_body\.[0-9]+:}} ; CHECK: v.rdadd vt#1, ->a0 ; CHECK: C.BSTOP +; CHECK: {{^\.__linx_vblock_body\.[0-9]+\.end:}} ; CHECK-LABEL: vseq_tile: ; CHECK: BSTART.VSEQ @@ -60,4 +94,44 @@ entry: ; CHECK: BSTART.VPAR ; CHECK: B.TEXT {{\.__linx_vblock_body\.[0-9]+}} +; CHECK-LABEL: vseq_local_scratch: +; CHECK: BSTART.MSEQ +; CHECK-NEXT: B.TEXT {{\.__linx_vblock_body\.[0-9]+}} +; CHECK: B.IOTI{{[[:space:]]+\[\], last[[:space:]]+}}->t<0> +; CHECK: B.IOTI{{[[:space:]]+\[\], last[[:space:]]+}}->u<2> +; CHECK: 
{{^\.__linx_vblock_body\.[0-9]+:}} +; CHECK: v.swi.u.local zero, [ts, lc0<<2, 0] +; CHECK: v.lwi.u.local [ts, lc0<<2, 4], ->vt#1 +; CHECK: C.BSTOP + +; CHECK-LABEL: vseq_typed_body: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT {{\.__linx_vblock_body\.[0-9]+}} +; CHECK: {{^\.__linx_vblock_body\.[0-9]+:}} +; CHECK: v.add lc0, lc1.uw<<5, ->vt#1 +; CHECK: v.sub vt#1, lc0.uw, ->vt#2 +; CHECK: v.lw.brg [ri1, lc0<<2, vt#2<<2], ->vt#3 +; CHECK: C.BSTOP + +; CHECK-LABEL: vseq_local_scratch_dword: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT {{\.__linx_vblock_body\.[0-9]+}} +; CHECK: {{^\.__linx_vblock_body\.[0-9]+:}} +; CHECK: v.sdi.u.local zero, [ts, lc0<<3, 8] +; CHECK: v.ldi.u.local [ts, lc0<<3, 16], ->vn#1 +; CHECK: C.BSTOP + +; CHECK-LABEL: vseq_reuse_body: +; CHECK: BSTART.MSEQ +; CHECK: B.TEXT {{\.__linx_vblock_body\.[0-9]+}} +; CHECK: {{^\.__linx_vblock_body\.[0-9]+:}} +; CHECK: v.lwi.u.local [ts, lc0<<2, 8], ->vt#1 +; CHECK: v.add vt#1, lc0.uw, ->vu#1 +; CHECK: v.swi.u.local vu#1, [ts, lc0<<2, 12] +; CHECK: C.BSTOP + attributes #0 = { "linx-vblock-body-asm"=" v.rdadd vt#1.sw, ->a0\0A C.BSTOP\0A" } +attributes #1 = { "linx-vblock-body-asm"=" v.swi.u.local zero, [ts, lc0.uh<<2, 0]\0A v.lwi.u.local [ts, lc0.uh<<2, 4], ->vt.w\0A C.BSTOP\0A" "linx-vblock-ts-bytes"="64" } +attributes #2 = { "linx-vblock-body-asm"=" v.add lc0.uh, lc1.uh<<5, ->vt.w\0A v.sub vt#1.sw, lc0.uh, ->vt.w\0A v.lw.brg [ri1, lc0.uh<<2, vt#2.sw<<2], ->vt#3.w\0A C.BSTOP\0A" } +attributes #3 = { "linx-vblock-body-asm"=" v.sdi.u.local zero, [ts, lc0.uh<<3, 8]\0A v.ldi.u.local [ts, lc0.uh<<3, 16], ->vn.d\0A C.BSTOP\0A" "linx-vblock-ts-bytes"="64" } +attributes #4 = { "linx-vblock-body-asm"=" v.lwi.u.local [ts, lc0.uh<<2, 8], ->vt.w\0A v.add vt#1.reuse.sw, lc0.uh, ->vu.w\0A v.swi.u.local vu#1.reuse.uw, [ts, lc0.uh<<2, 12]\0A C.BSTOP\0A" "linx-vblock-ts-bytes"="64" } diff --git a/llvm/test/MC/LinxISA/body-local-label-branches.s b/llvm/test/MC/LinxISA/body-local-label-branches.s new file mode 100644 index 
0000000000000..dbf3052c3a3bd --- /dev/null +++ b/llvm/test/MC/LinxISA/body-local-label-branches.s @@ -0,0 +1,27 @@ +# RUN: llvm-mc -triple=linx64 -filetype=obj %s -o - | llvm-objdump -d --triple=linx64 - | FileCheck %s + + .text + .globl body_local_label_branches + .type body_local_label_branches,@function +body_local_label_branches: + b.nz .Ltake0 + j .Ljoin0 +.Ltake0: + b.z .Ljoin0 + j .Ljoin1 +.Ljoin0: + j .Ldone +.Ljoin1: + j .Ldone +.Ldone: + C.BSTOP + .size body_local_label_branches, .-body_local_label_branches + +# CHECK-LABEL: : +# CHECK: b.nz +# CHECK: j +# CHECK: b.z +# CHECK: j +# CHECK: j +# CHECK: j +# CHECK: C.BSTOP diff --git a/llvm/test/MC/LinxISA/simt-p-save-restore-gap.s b/llvm/test/MC/LinxISA/simt-p-save-restore-gap.s new file mode 100644 index 0000000000000..6decd610e0a33 --- /dev/null +++ b/llvm/test/MC/LinxISA/simt-p-save-restore-gap.s @@ -0,0 +1,22 @@ +# RUN: split-file %s %t +# RUN: not llvm-mc -triple=linx64 -filetype=obj %t/scalar_add_p.s -o /dev/null 2>&1 | FileCheck %s --check-prefix=SCALAR +# RUN: not llvm-mc -triple=linx64 -filetype=obj %t/legacy_l_add_p.s -o /dev/null 2>&1 | FileCheck %s --check-prefix=LEGACYL + +# SCALAR: error: register operand does not fit field width +# SCALAR: add p, zero, ->t +# SCALAR: error: destination register does not fit field width +# SCALAR: add t, zero, ->p + +# LEGACYL: error: legacy 'L.*' mnemonics are not allowed in canonical v0.4 +# LEGACYL: l.add p, zero, ->t + +#--- scalar_add_p.s + .text +scalar_add_p: + add p, zero, ->t + add t, zero, ->p + +#--- legacy_l_add_p.s + .text +legacy_l_add_p: + l.add p, zero, ->t diff --git a/llvm/test/MC/LinxISA/simt-p-save-restore-roundtrip.s b/llvm/test/MC/LinxISA/simt-p-save-restore-roundtrip.s new file mode 100644 index 0000000000000..43f06f2d23ad1 --- /dev/null +++ b/llvm/test/MC/LinxISA/simt-p-save-restore-roundtrip.s @@ -0,0 +1,30 @@ +# RUN: llvm-mc -triple=linx64 -filetype=obj %s -o - | llvm-objdump -d --triple=linx64 - | FileCheck %s + + .text + .globl 
test_simt_p_save_restore_roundtrip + .type test_simt_p_save_restore_roundtrip,@function +test_simt_p_save_restore_roundtrip: + BSTART.MSEQ 0 + B.TEXT .body + B.IOTI [], last ->t<1KB> + C.B.DIMI 8, ->lb0 + C.B.DIMI 1, ->lb1 + C.BSTART +.body: + v.cmp.lt lc0.uh, ri0, ->p + v.psel p, ri1, ->vt.w + v.cmp.ne vt#1.sw, zero, ->p + C.BSTOP + .size test_simt_p_save_restore_roundtrip, .-test_simt_p_save_restore_roundtrip + +# CHECK-LABEL: : +# CHECK: BSTART.MSEQ +# CHECK: B.TEXT +# CHECK: B.IOTI{{[[:space:]]+}}[], last{{[[:space:]]+}}->t<1KB> +# CHECK: C.B.DIMI{{[[:space:]]+}}8, {{[[:space:]]*}}->lb0 +# CHECK: C.B.DIMI{{[[:space:]]+}}1, {{[[:space:]]*}}->lb1 +# CHECK: C.BSTART +# CHECK: v.cmp.lt{{[[:space:]]+}}lc0, ri0,{{[[:space:]]*}}->p +# CHECK: v.psel{{[[:space:]]+}}p, ri1,{{[[:space:]]*}}->vt +# CHECK: v.cmp.ne{{[[:space:]]+}}vt#1, zero,{{[[:space:]]*}}->p +# CHECK: C.BSTOP diff --git a/llvm/test/MC/LinxISA/v03-vector-reuse-roundtrip.s b/llvm/test/MC/LinxISA/v03-vector-reuse-roundtrip.s new file mode 100644 index 0000000000000..3bf324751ec01 --- /dev/null +++ b/llvm/test/MC/LinxISA/v03-vector-reuse-roundtrip.s @@ -0,0 +1,32 @@ +# RUN: llvm-mc -triple=linx64 -filetype=obj %s -o - | llvm-objdump -d --triple=linx64 - | FileCheck %s + + .text + .globl test_v03_vector_reuse_roundtrip + .type test_v03_vector_reuse_roundtrip,@function +test_v03_vector_reuse_roundtrip: + BSTART.MSEQ 0 + B.TEXT .body + B.IOTI [], last ->t<4KB> + B.IOTI [], last ->u<2KB> + C.B.DIMI 8, ->lb0 + C.B.DIMI 1, ->lb1 + C.BSTART +.body: + v.lwi.u.local [ts, lc0<<2, 8], ->vt.w + v.add vt#1.reuse.sw, lc0.uh, ->vu.w + v.swi.u.local vu#1.reuse.uw, [ts, lc0<<2, 12] + C.BSTOP + .size test_v03_vector_reuse_roundtrip, .-test_v03_vector_reuse_roundtrip + +# CHECK-LABEL: : +# CHECK: BSTART.MSEQ +# CHECK: B.TEXT +# CHECK: B.IOTI{{[[:space:]]+}}[], last{{[[:space:]]+}}->t<4KB> +# CHECK: B.IOTI{{[[:space:]]+}}[], last{{[[:space:]]+}}->u<2KB> +# CHECK: C.B.DIMI{{[[:space:]]+}}8, {{[[:space:]]*}}->lb0 +# CHECK: 
C.B.DIMI{{[[:space:]]+}}1, {{[[:space:]]*}}->lb1 +# CHECK: C.BSTART +# CHECK: v.lwi.u.local{{[[:space:]]+}}[ts, lc0<<2, 8], ->vt +# CHECK: v.add{{[[:space:]]+}}vt#1, lc0,{{[[:space:]]*}}->vu +# CHECK: v.swi.u.local{{[[:space:]]+}}vu#1, [ts, lc0<<2, 12] +# CHECK: C.BSTOP