// Copyright (c) 2018 Google LLC. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "source/opt/loop_fission.h" #include <set> #include "source/opt/register_pressure.h" // Implement loop fission with an optional parameter to split only // if the register pressure in a given loop meets a certain criteria. This is // controlled via the constructors of LoopFissionPass. // // 1 - Build a list of loops to be split, these are top level loops (loops // without child loops themselves) which meet the register pressure criteria, as // determined by the ShouldSplitLoop method of LoopFissionPass. // // 2 - For each loop in the list, group each instruction into a set of related // instructions by traversing each instructions users and operands recursively. // We stop if we encounter an instruction we have seen before or an instruction // which we don't consider relevent (i.e OpLoopMerge). We then group these // groups into two different sets, one for the first loop and one for the // second. // // 3 - We then run CanPerformSplit to check that it would be legal to split a // loop using those two sets. We check that we haven't altered the relative // order load/stores appear in the binary and that we aren't breaking any // dependency between load/stores by splitting them into two loops. We also // check that none of the OpBranch instructions are dependent on a load as we // leave control flow structure intact and move only instructions in the body so // we want to avoid any loads with side affects or aliasing. // // 4 - We then split the loop by calling SplitLoop. This function clones the // loop and attaches it to the preheader and connects the new loops merge block // to the current loop header block. We then use the two sets built in step 2 to // remove instructions from each loop. If an instruction appears in the first // set it is removed from the second loop and vice versa. // // 5 - If the multiple split passes flag is set we check if each of the loops // still meet the register pressure criteria. If they do then we add them to the // list of loops to be split (created in step one) to allow for loops to be // split multiple times. // namespace spvtools { namespace opt { class LoopFissionImpl { public: LoopFissionImpl(IRContext* context, Loop* loop) : context_(context), loop_(loop), load_used_in_condition_(false) {} // Group each instruction in the loop into sets of instructions related by // their usedef chains. An instruction which uses another will appear in the // same set. Then merge those sets into just two sets. Returns false if there // was one or less sets created. bool GroupInstructionsByUseDef(); // Check if the sets built by GroupInstructionsByUseDef violate any data // dependence rules. bool CanPerformSplit(); // Split the loop and return a pointer to the new loop. Loop* SplitLoop(); // Checks if |inst| is safe to move. We can only move instructions which don't // have any side effects and OpLoads and OpStores. bool MovableInstruction(const Instruction& inst) const; private: // Traverse the def use chain of |inst| and add the users and uses of |inst| // which are in the same loop to the |returned_set|. void TraverseUseDef(Instruction* inst, std::set<Instruction*>* returned_set, bool ignore_phi_users = false, bool report_loads = false); // We group the instructions in the block into two different groups, the // instructions to be kept in the original loop and the ones to be cloned into // the new loop. As the cloned loop is attached to the preheader it will be // the first loop and the second loop will be the original. std::set<Instruction*> cloned_loop_instructions_; std::set<Instruction*> original_loop_instructions_; // We need a set of all the instructions to be seen so we can break any // recursion and also so we can ignore certain instructions by preemptively // adding them to this set. std::set<Instruction*> seen_instructions_; // A map of instructions to their relative position in the function. std::map<Instruction*, size_t> instruction_order_; IRContext* context_; Loop* loop_; // This is set to true by TraverseUseDef when traversing the instructions // related to the loop condition and any if conditions should any of those // instructions be a load. bool load_used_in_condition_; }; bool LoopFissionImpl::MovableInstruction(const Instruction& inst) const { return inst.opcode() == SpvOp::SpvOpLoad || inst.opcode() == SpvOp::SpvOpStore || inst.opcode() == SpvOp::SpvOpSelectionMerge || inst.opcode() == SpvOp::SpvOpPhi || inst.IsOpcodeCodeMotionSafe(); } void LoopFissionImpl::TraverseUseDef(Instruction* inst, std::set<Instruction*>* returned_set, bool ignore_phi_users, bool report_loads) { assert(returned_set && "Set to be returned cannot be null."); analysis::DefUseManager* def_use = context_->get_def_use_mgr(); std::set<Instruction*>& inst_set = *returned_set; // We create this functor to traverse the use def chain to build the // grouping of related instructions. The lambda captures the std::function // to allow it to recurse. std::function<void(Instruction*)> traverser_functor; traverser_functor = [this, def_use, &inst_set, &traverser_functor, ignore_phi_users, report_loads](Instruction* user) { // If we've seen the instruction before or it is not inside the loop end the // traversal. if (!user || seen_instructions_.count(user) != 0 || !context_->get_instr_block(user) || !loop_->IsInsideLoop(context_->get_instr_block(user))) { return; } // Don't include labels or loop merge instructions in the instruction sets. // Including them would mean we group instructions related only by using the // same labels (i.e phis). We already preempt the inclusion of // OpSelectionMerge by adding related instructions to the seen_instructions_ // set. if (user->opcode() == SpvOp::SpvOpLoopMerge || user->opcode() == SpvOp::SpvOpLabel) return; // If the |report_loads| flag is set, set the class field // load_used_in_condition_ to false. This is used to check that none of the // condition checks in the loop rely on loads. if (user->opcode() == SpvOp::SpvOpLoad && report_loads) { load_used_in_condition_ = true; } // Add the instruction to the set of instructions already seen, this breaks // recursion and allows us to ignore certain instructions. seen_instructions_.insert(user); inst_set.insert(user); // Wrapper functor to traverse the operands of each instruction. auto traverse_operand = [&traverser_functor, def_use](const uint32_t* id) { traverser_functor(def_use->GetDef(*id)); }; user->ForEachInOperand(traverse_operand); // For the first traversal we want to ignore the users of the phi. if (ignore_phi_users && user->opcode() == SpvOp::SpvOpPhi) return; // Traverse each user with this lambda. def_use->ForEachUser(user, traverser_functor); // Wrapper functor for the use traversal. auto traverse_use = [&traverser_functor](Instruction* use, uint32_t) { traverser_functor(use); }; def_use->ForEachUse(user, traverse_use); }; // We start the traversal of the use def graph by invoking the above // lambda with the |inst| parameter. traverser_functor(inst); } bool LoopFissionImpl::GroupInstructionsByUseDef() { std::vector<std::set<Instruction*>> sets{}; // We want to ignore all the instructions stemming from the loop condition // instruction. BasicBlock* condition_block = loop_->FindConditionBlock(); if (!condition_block) return false; Instruction* condition = &*condition_block->tail(); // We iterate over the blocks via iterating over all the blocks in the // function, we do this so we are iterating in the same order which the blocks // appear in the binary. Function& function = *loop_->GetHeaderBlock()->GetParent(); // Create a temporary set to ignore certain groups of instructions within the // loop. We don't want any instructions related to control flow to be removed // from either loop only instructions within the control flow bodies. std::set<Instruction*> instructions_to_ignore{}; TraverseUseDef(condition, &instructions_to_ignore, true, true); // Traverse control flow instructions to ensure they are added to the // seen_instructions_ set and will be ignored when it it called with actual // sets. for (BasicBlock& block : function) { if (!loop_->IsInsideLoop(block.id())) continue; for (Instruction& inst : block) { // Ignore all instructions related to control flow. if (inst.opcode() == SpvOp::SpvOpSelectionMerge || inst.IsBranch()) { TraverseUseDef(&inst, &instructions_to_ignore, true, true); } } } // Traverse the instructions and generate the sets, automatically ignoring any // instructions in instructions_to_ignore. for (BasicBlock& block : function) { if (!loop_->IsInsideLoop(block.id()) || loop_->GetHeaderBlock()->id() == block.id()) continue; for (Instruction& inst : block) { // Record the order that each load/store is seen. if (inst.opcode() == SpvOp::SpvOpLoad || inst.opcode() == SpvOp::SpvOpStore) { instruction_order_[&inst] = instruction_order_.size(); } // Ignore instructions already seen in a traversal. if (seen_instructions_.count(&inst) != 0) { continue; } // Build the set. std::set<Instruction*> inst_set{}; TraverseUseDef(&inst, &inst_set); if (!inst_set.empty()) sets.push_back(std::move(inst_set)); } } // If we have one or zero sets return false to indicate that due to // insufficient instructions we couldn't split the loop into two groups and // thus the loop can't be split any further. if (sets.size() < 2) { return false; } // Merge the loop sets into two different sets. In CanPerformSplit we will // validate that we don't break the relative ordering of loads/stores by doing // this. for (size_t index = 0; index < sets.size() / 2; ++index) { cloned_loop_instructions_.insert(sets[index].begin(), sets[index].end()); } for (size_t index = sets.size() / 2; index < sets.size(); ++index) { original_loop_instructions_.insert(sets[index].begin(), sets[index].end()); } return true; } bool LoopFissionImpl::CanPerformSplit() { // Return false if any of the condition instructions in the loop depend on a // load. if (load_used_in_condition_) { return false; } // Build a list of all parent loops of this loop. Loop dependence analysis // needs this structure. std::vector<const Loop*> loops; Loop* parent_loop = loop_; while (parent_loop) { loops.push_back(parent_loop); parent_loop = parent_loop->GetParent(); } LoopDependenceAnalysis analysis{context_, loops}; // A list of all the stores in the cloned loop. std::vector<Instruction*> set_one_stores{}; // A list of all the loads in the cloned loop. std::vector<Instruction*> set_one_loads{}; // Populate the above lists. for (Instruction* inst : cloned_loop_instructions_) { if (inst->opcode() == SpvOp::SpvOpStore) { set_one_stores.push_back(inst); } else if (inst->opcode() == SpvOp::SpvOpLoad) { set_one_loads.push_back(inst); } // If we find any instruction which we can't move (such as a barrier), // return false. if (!MovableInstruction(*inst)) return false; } // We need to calculate the depth of the loop to create the loop dependency // distance vectors. const size_t loop_depth = loop_->GetDepth(); // Check the dependencies between loads in the cloned loop and stores in the // original and vice versa. for (Instruction* inst : original_loop_instructions_) { // If we find any instruction which we can't move (such as a barrier), // return false. if (!MovableInstruction(*inst)) return false; // Look at the dependency between the loads in the original and stores in // the cloned loops. if (inst->opcode() == SpvOp::SpvOpLoad) { for (Instruction* store : set_one_stores) { DistanceVector vec{loop_depth}; // If the store actually should appear after the load, return false. // This means the store has been placed in the wrong grouping. if (instruction_order_[store] > instruction_order_[inst]) { return false; } // If not independent check the distance vector. if (!analysis.GetDependence(store, inst, &vec)) { for (DistanceEntry& entry : vec.GetEntries()) { // A distance greater than zero means that the store in the cloned // loop has a dependency on the load in the original loop. if (entry.distance > 0) return false; } } } } else if (inst->opcode() == SpvOp::SpvOpStore) { for (Instruction* load : set_one_loads) { DistanceVector vec{loop_depth}; // If the load actually should appear after the store, return false. if (instruction_order_[load] > instruction_order_[inst]) { return false; } // If not independent check the distance vector. if (!analysis.GetDependence(inst, load, &vec)) { for (DistanceEntry& entry : vec.GetEntries()) { // A distance less than zero means the load in the cloned loop is // dependent on the store instruction in the original loop. if (entry.distance < 0) return false; } } } } } return true; } Loop* LoopFissionImpl::SplitLoop() { // Clone the loop. LoopUtils util{context_, loop_}; LoopUtils::LoopCloningResult clone_results; Loop* cloned_loop = util.CloneAndAttachLoopToHeader(&clone_results); // Update the OpLoopMerge in the cloned loop. cloned_loop->UpdateLoopMergeInst(); // Add the loop_ to the module. // TODO(1841): Handle failure to create pre-header. Function::iterator it = util.GetFunction()->FindBlock(loop_->GetOrCreatePreHeaderBlock()->id()); util.GetFunction()->AddBasicBlocks(clone_results.cloned_bb_.begin(), clone_results.cloned_bb_.end(), ++it); loop_->SetPreHeaderBlock(cloned_loop->GetMergeBlock()); std::vector<Instruction*> instructions_to_kill{}; // Kill all the instructions which should appear in the cloned loop but not in // the original loop. for (uint32_t id : loop_->GetBlocks()) { BasicBlock* block = context_->cfg()->block(id); for (Instruction& inst : *block) { // If the instruction appears in the cloned loop instruction group, kill // it. if (cloned_loop_instructions_.count(&inst) == 1 && original_loop_instructions_.count(&inst) == 0) { instructions_to_kill.push_back(&inst); if (inst.opcode() == SpvOp::SpvOpPhi) { context_->ReplaceAllUsesWith( inst.result_id(), clone_results.value_map_[inst.result_id()]); } } } } // Kill all instructions which should appear in the original loop and not in // the cloned loop. for (uint32_t id : cloned_loop->GetBlocks()) { BasicBlock* block = context_->cfg()->block(id); for (Instruction& inst : *block) { Instruction* old_inst = clone_results.ptr_map_[&inst]; // If the instruction belongs to the original loop instruction group, kill // it. if (cloned_loop_instructions_.count(old_inst) == 0 && original_loop_instructions_.count(old_inst) == 1) { instructions_to_kill.push_back(&inst); } } } for (Instruction* i : instructions_to_kill) { context_->KillInst(i); } return cloned_loop; } LoopFissionPass::LoopFissionPass(const size_t register_threshold_to_split, bool split_multiple_times) : split_multiple_times_(split_multiple_times) { // Split if the number of registers in the loop exceeds // |register_threshold_to_split|. split_criteria_ = [register_threshold_to_split]( const RegisterLiveness::RegionRegisterLiveness& liveness) { return liveness.used_registers_ > register_threshold_to_split; }; } LoopFissionPass::LoopFissionPass() : split_multiple_times_(false) { // Split by default. split_criteria_ = [](const RegisterLiveness::RegionRegisterLiveness&) { return true; }; } bool LoopFissionPass::ShouldSplitLoop(const Loop& loop, IRContext* c) { LivenessAnalysis* analysis = c->GetLivenessAnalysis(); RegisterLiveness::RegionRegisterLiveness liveness{}; Function* function = loop.GetHeaderBlock()->GetParent(); analysis->Get(function)->ComputeLoopRegisterPressure(loop, &liveness); return split_criteria_(liveness); } Pass::Status LoopFissionPass::Process() { bool changed = false; for (Function& f : *context()->module()) { // We collect all the inner most loops in the function and run the loop // splitting util on each. The reason we do this is to allow us to iterate // over each, as creating new loops will invalidate the the loop iterator. std::vector<Loop*> inner_most_loops{}; LoopDescriptor& loop_descriptor = *context()->GetLoopDescriptor(&f); for (Loop& loop : loop_descriptor) { if (!loop.HasChildren() && ShouldSplitLoop(loop, context())) { inner_most_loops.push_back(&loop); } } // List of new loops which meet the criteria to be split again. std::vector<Loop*> new_loops_to_split{}; while (!inner_most_loops.empty()) { for (Loop* loop : inner_most_loops) { LoopFissionImpl impl{context(), loop}; // Group the instructions in the loop into two different sets of related // instructions. If we can't group the instructions into the two sets // then we can't split the loop any further. if (!impl.GroupInstructionsByUseDef()) { continue; } if (impl.CanPerformSplit()) { Loop* second_loop = impl.SplitLoop(); changed = true; context()->InvalidateAnalysesExceptFor( IRContext::kAnalysisLoopAnalysis); // If the newly created loop meets the criteria to be split, split it // again. if (ShouldSplitLoop(*second_loop, context())) new_loops_to_split.push_back(second_loop); // If the original loop (now split) still meets the criteria to be // split, split it again. if (ShouldSplitLoop(*loop, context())) new_loops_to_split.push_back(loop); } } // If the split multiple times flag has been set add the new loops which // meet the splitting criteria into the list of loops to be split on the // next iteration. if (split_multiple_times_) { inner_most_loops = std::move(new_loops_to_split); } else { break; } } } return changed ? Pass::Status::SuccessWithChange : Pass::Status::SuccessWithoutChange; } } // namespace opt } // namespace spvtools