//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // /// \file /// This pass compute turns all control flow pseudo instructions into native one /// computing their address on the fly ; it also sets STACK_SIZE info. //===----------------------------------------------------------------------===// #include "llvm/Support/Debug.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "R600Defines.h" #include "R600InstrInfo.h" #include "R600MachineFunctionInfo.h" #include "R600RegisterInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "r600cf" namespace { struct CFStack { enum StackItem { ENTRY = 0, SUB_ENTRY = 1, FIRST_NON_WQM_PUSH = 2, FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 }; const AMDGPUSubtarget *ST; std::vector<StackItem> BranchStack; std::vector<StackItem> LoopStack; unsigned MaxStackSize; unsigned CurrentEntries; unsigned CurrentSubEntries; CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), CurrentEntries(0), CurrentSubEntries(0) { } unsigned getLoopDepth(); bool branchStackContains(CFStack::StackItem); bool requiresWorkAroundForInst(unsigned Opcode); unsigned getSubEntrySize(CFStack::StackItem Item); void updateMaxStackSize(); void pushBranch(unsigned Opcode, bool isWQM = false); void pushLoop(); void popBranch(); void popLoop(); }; unsigned CFStack::getLoopDepth() { return LoopStack.size(); } bool CFStack::branchStackContains(CFStack::StackItem Item) { for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(), E = BranchStack.end(); I != E; ++I) { if (*I == Item) return true; } return false; } bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() && getLoopDepth() > 1) return true; if (!ST->hasCFAluBug()) return false; switch(Opcode) { default: return false; case AMDGPU::CF_ALU_PUSH_BEFORE: case AMDGPU::CF_ALU_ELSE_AFTER: case AMDGPU::CF_ALU_BREAK: case AMDGPU::CF_ALU_CONTINUE: if (CurrentSubEntries == 0) return false; if (ST->getWavefrontSize() == 64) { // We are being conservative here. We only require this work-around if // CurrentSubEntries > 3 && // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) // // We have to be conservative, because we don't know for certain that // our stack allocation algorithm for Evergreen/NI is correct. Applying this // work-around when CurrentSubEntries > 3 allows us to over-allocate stack // resources without any problems. return CurrentSubEntries > 3; } else { assert(ST->getWavefrontSize() == 32); // We are being conservative here. We only require the work-around if // CurrentSubEntries > 7 && // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) // See the comment on the wavefront size == 64 case for why we are // being conservative. return CurrentSubEntries > 7; } } } unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { switch(Item) { default: return 0; case CFStack::FIRST_NON_WQM_PUSH: assert(!ST->hasCaymanISA()); if (ST->getGeneration() <= AMDGPUSubtarget::R700) { // +1 For the push operation. // +2 Extra space required. return 3; } else { // Some documentation says that this is not necessary on Evergreen, // but experimentation has show that we need to allocate 1 extra // sub-entry for the first non-WQM push. // +1 For the push operation. // +1 Extra space required. return 2; } case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); // +1 For the push operation. // +1 Extra space required. return 2; case CFStack::SUB_ENTRY: return 1; } } void CFStack::updateMaxStackSize() { unsigned CurrentStackSize = CurrentEntries + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); MaxStackSize = std::max(CurrentStackSize, MaxStackSize); } void CFStack::pushBranch(unsigned Opcode, bool isWQM) { CFStack::StackItem Item = CFStack::ENTRY; switch(Opcode) { case AMDGPU::CF_PUSH_EG: case AMDGPU::CF_ALU_PUSH_BEFORE: if (!isWQM) { if (!ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI // See comment in // CFStack::getSubEntrySize() else if (CurrentEntries > 0 && ST->getGeneration() > AMDGPUSubtarget::EVERGREEN && !ST->hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; else Item = CFStack::SUB_ENTRY; } else Item = CFStack::ENTRY; break; } BranchStack.push_back(Item); if (Item == CFStack::ENTRY) CurrentEntries++; else CurrentSubEntries += getSubEntrySize(Item); updateMaxStackSize(); } void CFStack::pushLoop() { LoopStack.push_back(CFStack::ENTRY); CurrentEntries++; updateMaxStackSize(); } void CFStack::popBranch() { CFStack::StackItem Top = BranchStack.back(); if (Top == CFStack::ENTRY) CurrentEntries--; else CurrentSubEntries-= getSubEntrySize(Top); BranchStack.pop_back(); } void CFStack::popLoop() { CurrentEntries--; LoopStack.pop_back(); } class R600ControlFlowFinalizer : public MachineFunctionPass { private: typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile; enum ControlFlowInstruction { CF_TC, CF_VC, CF_CALL_FS, CF_WHILE_LOOP, CF_END_LOOP, CF_LOOP_BREAK, CF_LOOP_CONTINUE, CF_JUMP, CF_ELSE, CF_POP, CF_END }; static char ID; const R600InstrInfo *TII; const R600RegisterInfo *TRI; unsigned MaxFetchInst; const AMDGPUSubtarget *ST; bool IsTrivialInst(MachineInstr *MI) const { switch (MI->getOpcode()) { case AMDGPU::KILL: case AMDGPU::RETURN: return true; default: return false; } } const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const { unsigned Opcode = 0; bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN); switch (CFI) { case CF_TC: Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600; break; case CF_VC: Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600; break; case CF_CALL_FS: Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600; break; case CF_WHILE_LOOP: Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600; break; case CF_END_LOOP: Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600; break; case CF_LOOP_BREAK: Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600; break; case CF_LOOP_CONTINUE: Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600; break; case CF_JUMP: Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600; break; case CF_ELSE: Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600; break; case CF_POP: Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600; break; case CF_END: if (ST->hasCaymanISA()) { Opcode = AMDGPU::CF_END_CM; break; } Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600; break; } assert (Opcode && "No opcode selected"); return TII->get(Opcode); } bool isCompatibleWithClause(const MachineInstr *MI, std::set<unsigned> &DstRegs) const { unsigned DstMI, SrcMI; for (MachineInstr::const_mop_iterator I = MI->operands_begin(), E = MI->operands_end(); I != E; ++I) { const MachineOperand &MO = *I; if (!MO.isReg()) continue; if (MO.isDef()) { unsigned Reg = MO.getReg(); if (AMDGPU::R600_Reg128RegClass.contains(Reg)) DstMI = Reg; else DstMI = TRI->getMatchingSuperReg(Reg, TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), &AMDGPU::R600_Reg128RegClass); } if (MO.isUse()) { unsigned Reg = MO.getReg(); if (AMDGPU::R600_Reg128RegClass.contains(Reg)) SrcMI = Reg; else SrcMI = TRI->getMatchingSuperReg(Reg, TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)), &AMDGPU::R600_Reg128RegClass); } } if ((DstRegs.find(SrcMI) == DstRegs.end())) { DstRegs.insert(DstMI); return true; } else return false; } ClauseFile MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) const { MachineBasicBlock::iterator ClauseHead = I; std::vector<MachineInstr *> ClauseContent; unsigned AluInstCount = 0; bool IsTex = TII->usesTextureCache(ClauseHead); std::set<unsigned> DstRegs; for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) { if (IsTrivialInst(I)) continue; if (AluInstCount >= MaxFetchInst) break; if ((IsTex && !TII->usesTextureCache(I)) || (!IsTex && !TII->usesVertexCache(I))) break; if (!isCompatibleWithClause(I, DstRegs)) break; AluInstCount ++; ClauseContent.push_back(I); } MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), getHWInstrDesc(IsTex?CF_TC:CF_VC)) .addImm(0) // ADDR .addImm(AluInstCount - 1); // COUNT return ClauseFile(MIb, std::move(ClauseContent)); } void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const { static const unsigned LiteralRegs[] = { AMDGPU::ALU_LITERAL_X, AMDGPU::ALU_LITERAL_Y, AMDGPU::ALU_LITERAL_Z, AMDGPU::ALU_LITERAL_W }; const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs = TII->getSrcs(MI); for (unsigned i = 0, e = Srcs.size(); i < e; ++i) { if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X) continue; int64_t Imm = Srcs[i].second; std::vector<int64_t>::iterator It = std::find(Lits.begin(), Lits.end(), Imm); if (It != Lits.end()) { unsigned Index = It - Lits.begin(); Srcs[i].first->setReg(LiteralRegs[Index]); } else { assert(Lits.size() < 4 && "Too many literals in Instruction Group"); Srcs[i].first->setReg(LiteralRegs[Lits.size()]); Lits.push_back(Imm); } } } MachineBasicBlock::iterator insertLiterals( MachineBasicBlock::iterator InsertPos, const std::vector<unsigned> &Literals) const { MachineBasicBlock *MBB = InsertPos->getParent(); for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { unsigned LiteralPair0 = Literals[i]; unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0; InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(), TII->get(AMDGPU::LITERALS)) .addImm(LiteralPair0) .addImm(LiteralPair1); } return InsertPos; } ClauseFile MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I) const { MachineBasicBlock::iterator ClauseHead = I; std::vector<MachineInstr *> ClauseContent; I++; for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) { if (IsTrivialInst(I)) { ++I; continue; } if (!I->isBundle() && !TII->isALUInstr(I->getOpcode())) break; std::vector<int64_t> Literals; if (I->isBundle()) { MachineInstr *DeleteMI = I; MachineBasicBlock::instr_iterator BI = I.getInstrIterator(); while (++BI != E && BI->isBundledWithPred()) { BI->unbundleFromPred(); for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) { MachineOperand &MO = BI->getOperand(i); if (MO.isReg() && MO.isInternalRead()) MO.setIsInternalRead(false); } getLiteral(&*BI, Literals); ClauseContent.push_back(&*BI); } I = BI; DeleteMI->eraseFromParent(); } else { getLiteral(I, Literals); ClauseContent.push_back(I); I++; } for (unsigned i = 0, e = Literals.size(); i < e; i+=2) { unsigned literal0 = Literals[i]; unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0; MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::LITERALS)) .addImm(literal0) .addImm(literal2); ClauseContent.push_back(MILit); } } assert(ClauseContent.size() < 128 && "ALU clause is too big"); ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); return ClauseFile(ClauseHead, std::move(ClauseContent)); } void EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { CounterPropagateAddr(Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE)) .addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } CfCount += 2 * Clause.second.size(); } void EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { Clause.first->getOperand(0).setImm(0); CounterPropagateAddr(Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) .addImm(CfCount); for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) { BB->splice(InsertPos, BB, Clause.second[i]); } CfCount += Clause.second.size(); } void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const { MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm()); } void CounterPropagateAddr(const std::set<MachineInstr *> &MIs, unsigned Addr) const { for (MachineInstr *MI : MIs) { CounterPropagateAddr(MI, Addr); } } public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {} bool runOnMachineFunction(MachineFunction &MF) override { ST = &MF.getSubtarget<AMDGPUSubtarget>(); MaxFetchInst = ST->getTexVTXClauseSize(); TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo()); TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); CFStack CFStack(ST, MFI->getShaderType()); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack; std::vector<MachineInstr * > IfThenElseStack; if (MFI->getShaderType() == ShaderType::VERTEX) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; } std::vector<ClauseFile> FetchClauses, AluClauses; std::vector<MachineInstr *> LastAlu(1); std::vector<MachineInstr *> ToPopAfter; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) { DEBUG(dbgs() << CfCount << ":"; I->dump();); FetchClauses.push_back(MakeFetchClause(MBB, I)); CfCount++; LastAlu.back() = nullptr; continue; } MachineBasicBlock::iterator MI = I; if (MI->getOpcode() != AMDGPU::ENDIF) LastAlu.back() = nullptr; if (MI->getOpcode() == AMDGPU::CF_ALU) LastAlu.back() = MI; I++; bool RequiresWorkAround = CFStack.requiresWorkAroundForInst(MI->getOpcode()); switch (MI->getOpcode()) { case AMDGPU::CF_ALU_PUSH_BEFORE: if (RequiresWorkAround) { DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); MI->setDesc(TII->get(AMDGPU::CF_ALU)); CfCount++; CFStack.pushBranch(AMDGPU::CF_PUSH_EG); } else CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); case AMDGPU::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; break; case AMDGPU::WHILELOOP: { CFStack.pushLoop(); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) .addImm(1); std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount, std::set<MachineInstr *>()); Pair.second.insert(MIb); LoopStack.push_back(std::move(Pair)); MI->eraseFromParent(); CfCount++; break; } case AMDGPU::ENDLOOP: { CFStack.popLoop(); std::pair<unsigned, std::set<MachineInstr *> > Pair = std::move(LoopStack.back()); LoopStack.pop_back(); CounterPropagateAddr(Pair.second, CfCount); BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP)) .addImm(Pair.first + 1); MI->eraseFromParent(); CfCount++; break; } case AMDGPU::IF_PREDICATE_SET: { LastAlu.push_back(nullptr); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP)) .addImm(0) .addImm(0); IfThenElseStack.push_back(MIb); DEBUG(dbgs() << CfCount << ":"; MIb->dump();); MI->eraseFromParent(); CfCount++; break; } case AMDGPU::ELSE: { MachineInstr * JumpInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); CounterPropagateAddr(JumpInst, CfCount); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_ELSE)) .addImm(0) .addImm(0); DEBUG(dbgs() << CfCount << ":"; MIb->dump();); IfThenElseStack.push_back(MIb); MI->eraseFromParent(); CfCount++; break; } case AMDGPU::ENDIF: { CFStack.popBranch(); if (LastAlu.back()) { ToPopAfter.push_back(LastAlu.back()); } else { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP)) .addImm(CfCount + 1) .addImm(1); (void)MIb; DEBUG(dbgs() << CfCount << ":"; MIb->dump();); CfCount++; } MachineInstr *IfOrElseInst = IfThenElseStack.back(); IfThenElseStack.pop_back(); CounterPropagateAddr(IfOrElseInst, CfCount); IfOrElseInst->getOperand(1).setImm(1); LastAlu.pop_back(); MI->eraseFromParent(); break; } case AMDGPU::BREAK: { CfCount ++; MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_BREAK)) .addImm(0); LoopStack.back().second.insert(MIb); MI->eraseFromParent(); break; } case AMDGPU::CONTINUE: { MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_LOOP_CONTINUE)) .addImm(0); LoopStack.back().second.insert(MIb); MI->eraseFromParent(); CfCount++; break; } case AMDGPU::RETURN: { BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END)); CfCount++; MI->eraseFromParent(); if (CfCount % 2) { BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD)); CfCount++; } for (unsigned i = 0, e = FetchClauses.size(); i < e; i++) EmitFetchClause(I, FetchClauses[i], CfCount); for (unsigned i = 0, e = AluClauses.size(); i < e; i++) EmitALUClause(I, AluClauses[i], CfCount); } default: if (TII->isExport(MI->getOpcode())) { DEBUG(dbgs() << CfCount << ":"; MI->dump();); CfCount++; } break; } } for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) { MachineInstr *Alu = ToPopAfter[i]; BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu), TII->get(AMDGPU::CF_ALU_POP_AFTER)) .addImm(Alu->getOperand(0).getImm()) .addImm(Alu->getOperand(1).getImm()) .addImm(Alu->getOperand(2).getImm()) .addImm(Alu->getOperand(3).getImm()) .addImm(Alu->getOperand(4).getImm()) .addImm(Alu->getOperand(5).getImm()) .addImm(Alu->getOperand(6).getImm()) .addImm(Alu->getOperand(7).getImm()) .addImm(Alu->getOperand(8).getImm()); Alu->eraseFromParent(); } MFI->StackSize = CFStack.MaxStackSize; } return false; } const char *getPassName() const override { return "R600 Control Flow Finalizer Pass"; } }; char R600ControlFlowFinalizer::ID = 0; } // end anonymous namespace llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) { return new R600ControlFlowFinalizer(TM); }