HELLO·Android
系统源代码
IT资讯
技术文章
我的收藏
注册
登录
-
我收藏的文章
创建代码块
我的代码块
我的账号
Pie
|
9.0.0_r8
下载
查看原文件
收藏
根目录
external
swiftshader
third_party
subzero
src
IceTargetLoweringARM32.cpp
//===- subzero/src/IceTargetLoweringARM32.cpp - ARM32 lowering ------------===//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringARM32 class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//
#include "IceTargetLoweringARM32.h"

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstARM32.def"
#include "IceInstARM32.h"
#include "IceInstVarIter.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceRegistersARM32.h"
#include "IceTargetLoweringARM32.def"
#include "IceUtils.h"
#include "llvm/Support/MathExtras.h"

#include <algorithm>
#include <array>
#include <utility>

namespace ARM32 { std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) { return ::Ice::ARM32::TargetARM32::create(Func); } std::unique_ptr<::Ice::TargetDataLowering> createTargetDataLowering(::Ice::GlobalContext *Ctx) { return ::Ice::ARM32::TargetDataARM32::create(Ctx); } std::unique_ptr<::Ice::TargetHeaderLowering> createTargetHeaderLowering(::Ice::GlobalContext *Ctx) { return ::Ice::ARM32::TargetHeaderARM32::create(Ctx); } void staticInit(::Ice::GlobalContext *Ctx) { ::Ice::ARM32::TargetARM32::staticInit(Ctx); if (Ice::getFlags().getUseNonsfi()) { // In nonsfi, we need to reference the _GLOBAL_OFFSET_TABLE_ for accessing // globals. The GOT is an external symbol (i.e., it is not defined in the // pexe) so we need to register it as such so that ELF emission won't barf // on an "unknown" symbol. The GOT is added to the External symbols list // here because staticInit() is invoked in a single-thread context. Ctx->getConstantExternSym(Ctx->getGlobalString(::Ice::GlobalOffsetTable)); } } bool shouldBePooled(const ::Ice::Constant *C) { return ::Ice::ARM32::TargetARM32::shouldBePooled(C); } ::Ice::Type getPointerType() { return ::Ice::ARM32::TargetARM32::getPointerType(); } } // end of namespace ARM32 namespace Ice { namespace ARM32 { namespace { /// SizeOf is used to obtain the size of an initializer list as a constexpr /// expression. This is only needed until our C++ library is updated to /// C++ 14 -- which defines constexpr members to std::initializer_list. class SizeOf { SizeOf(const SizeOf &) = delete; SizeOf &operator=(const SizeOf &) = delete; public: constexpr SizeOf() : Size(0) {} template
explicit constexpr SizeOf(T...) : Size(__length
::value) {} constexpr SizeT size() const { return Size; } private: template
struct __length { static constexpr std::size_t value = 1 + __length
::value; }; template
struct __length
{ static constexpr std::size_t value = 1; }; const std::size_t Size; }; } // end of anonymous namespace // Defines the RegARM32::Table table with register information. RegARM32::RegTableType RegARM32::RegTable[RegARM32::Reg_NUM] = { #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ { \ name, encode, cc_arg, scratch, preserved, stackptr, frameptr, isGPR, \ isInt, isI64Pair, isFP32, isFP64, isVec128, \ (SizeOf alias_init).size(), alias_init \ } \ , REGARM32_TABLE #undef X }; namespace { // The following table summarizes the logic for lowering the icmp instruction // for i32 and narrower types. Each icmp condition has a clear mapping to an // ARM32 conditional move instruction. const struct TableIcmp32_ { CondARM32::Cond Mapping; } TableIcmp32[] = { #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \ { CondARM32::C_32 } \ , ICMPARM32_TABLE #undef X }; // The following table summarizes the logic for lowering the icmp instruction // for the i64 type. Two conditional moves are needed for setting to 1 or 0. // The operands may need to be swapped, and there is a slight difference for // signed vs unsigned (comparing hi vs lo first, and using cmp vs sbc). const struct TableIcmp64_ { bool IsSigned; bool Swapped; CondARM32::Cond C1, C2; } TableIcmp64[] = { #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \ { is_signed, swapped64, CondARM32::C1_64, CondARM32::C2_64 } \ , ICMPARM32_TABLE #undef X }; CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) { assert(Cond < llvm::array_lengthof(TableIcmp32)); return TableIcmp32[Cond].Mapping; } // In some cases, there are x-macros tables for both high-level and low-level // instructions/operands that use the same enum key value. The tables are kept // separate to maintain a proper separation between abstraction layers. 
There // is a risk that the tables could get out of sync if enum values are reordered // or if entries are added or deleted. The following anonymous namespaces use // static_asserts to ensure everything is kept in sync. // Validate the enum values in ICMPARM32_TABLE. namespace { // Define a temporary set of enum values based on low-level table entries. enum _icmp_ll_enum { #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \ _icmp_ll_##val, ICMPARM32_TABLE #undef X _num }; // Define a set of constants based on high-level table entries. #define X(tag, reverse, str) \ static constexpr int _icmp_hl_##tag = InstIcmp::tag; ICEINSTICMP_TABLE #undef X // Define a set of constants based on low-level table entries, and ensure the // table entry keys are consistent. #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \ static_assert( \ _icmp_ll_##val == _icmp_hl_##val, \ "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #val); ICMPARM32_TABLE #undef X // Repeat the static asserts with respect to the high-level table entries in // case the high-level table has extra entries. #define X(tag, reverse, str) \ static_assert( \ _icmp_hl_##tag == _icmp_ll_##tag, \ "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #tag); ICEINSTICMP_TABLE #undef X } // end of anonymous namespace // Stack alignment const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16; // Value is in bytes. Return Value adjusted to the next highest multiple of the // stack alignment. uint32_t applyStackAlignment(uint32_t Value) { return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES); } // Value is in bytes. Return Value adjusted to the next highest multiple of the // stack alignment required for the given type. uint32_t applyStackAlignmentTy(uint32_t Value, Type Ty) { // Use natural alignment, except that normally (non-NaCl) ARM only aligns // vectors to 8 bytes. // TODO(jvoung): Check this ... 
size_t typeAlignInBytes = typeWidthInBytes(Ty); if (isVectorType(Ty)) typeAlignInBytes = 8; return Utils::applyAlignment(Value, typeAlignInBytes); } // Conservatively check if at compile time we know that the operand is // definitely a non-zero integer. bool isGuaranteedNonzeroInt(const Operand *Op) { if (auto *Const = llvm::dyn_cast_or_null
(Op)) { return Const->getValue() != 0; } return false; } } // end of anonymous namespace TargetARM32Features::TargetARM32Features(const ClFlags &Flags) { static_assert( (ARM32InstructionSet::End - ARM32InstructionSet::Begin) == (TargetInstructionSet::ARM32InstructionSet_End - TargetInstructionSet::ARM32InstructionSet_Begin), "ARM32InstructionSet range different from TargetInstructionSet"); if (Flags.getTargetInstructionSet() != TargetInstructionSet::BaseInstructionSet) { InstructionSet = static_cast
( (Flags.getTargetInstructionSet() - TargetInstructionSet::ARM32InstructionSet_Begin) + ARM32InstructionSet::Begin); } } namespace { constexpr SizeT NumGPRArgs = #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ +(((cc_arg) > 0) ? 1 : 0) REGARM32_GPR_TABLE #undef X ; std::array
GPRArgInitializer; constexpr SizeT NumI64Args = #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ +(((cc_arg) > 0) ? 1 : 0) REGARM32_I64PAIR_TABLE #undef X ; std::array
I64ArgInitializer; constexpr SizeT NumFP32Args = #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ +(((cc_arg) > 0) ? 1 : 0) REGARM32_FP32_TABLE #undef X ; std::array
FP32ArgInitializer; constexpr SizeT NumFP64Args = #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ +(((cc_arg) > 0) ? 1 : 0) REGARM32_FP64_TABLE #undef X ; std::array
FP64ArgInitializer; constexpr SizeT NumVec128Args = #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ +(((cc_arg > 0)) ? 1 : 0) REGARM32_VEC128_TABLE #undef X ; std::array
Vec128ArgInitializer; const char *getRegClassName(RegClass C) { auto ClassNum = static_cast
(C); assert(ClassNum < RegARM32::RCARM32_NUM); switch (ClassNum) { default: assert(C < RC_Target); return regClassString(C); // Add handling of new register classes below. case RegARM32::RCARM32_QtoS: return "QtoS"; } } } // end of anonymous namespace TargetARM32::TargetARM32(Cfg *Func) : TargetLowering(Func), NeedSandboxing(SandboxingType == ST_NaCl), CPUFeatures(getFlags()) {} void TargetARM32::staticInit(GlobalContext *Ctx) { RegNumT::setLimit(RegARM32::Reg_NUM); // Limit this size (or do all bitsets need to be the same width)??? SmallBitVector IntegerRegisters(RegARM32::Reg_NUM); SmallBitVector I64PairRegisters(RegARM32::Reg_NUM); SmallBitVector Float32Registers(RegARM32::Reg_NUM); SmallBitVector Float64Registers(RegARM32::Reg_NUM); SmallBitVector VectorRegisters(RegARM32::Reg_NUM); SmallBitVector QtoSRegisters(RegARM32::Reg_NUM); SmallBitVector InvalidRegisters(RegARM32::Reg_NUM); const unsigned EncodedReg_q8 = RegARM32::RegTable[RegARM32::Reg_q8].Encoding; for (int i = 0; i < RegARM32::Reg_NUM; ++i) { const auto &Entry = RegARM32::RegTable[i]; IntegerRegisters[i] = Entry.IsInt; I64PairRegisters[i] = Entry.IsI64Pair; Float32Registers[i] = Entry.IsFP32; Float64Registers[i] = Entry.IsFP64; VectorRegisters[i] = Entry.IsVec128; RegisterAliases[i].resize(RegARM32::Reg_NUM); // TODO(eholk): It would be better to store a QtoS flag in the // IceRegistersARM32 table than to compare their encodings here. 
QtoSRegisters[i] = Entry.IsVec128 && Entry.Encoding < EncodedReg_q8; for (int j = 0; j < Entry.NumAliases; ++j) { assert(i == j || !RegisterAliases[i][Entry.Aliases[j]]); RegisterAliases[i].set(Entry.Aliases[j]); } assert(RegisterAliases[i][i]); if (Entry.CCArg <= 0) { continue; } const auto RegNum = RegNumT::fromInt(i); if (Entry.IsGPR) { GPRArgInitializer[Entry.CCArg - 1] = RegNum; } else if (Entry.IsI64Pair) { I64ArgInitializer[Entry.CCArg - 1] = RegNum; } else if (Entry.IsFP32) { FP32ArgInitializer[Entry.CCArg - 1] = RegNum; } else if (Entry.IsFP64) { FP64ArgInitializer[Entry.CCArg - 1] = RegNum; } else if (Entry.IsVec128) { Vec128ArgInitializer[Entry.CCArg - 1] = RegNum; } } TypeToRegisterSet[IceType_void] = InvalidRegisters; TypeToRegisterSet[IceType_i1] = IntegerRegisters; TypeToRegisterSet[IceType_i8] = IntegerRegisters; TypeToRegisterSet[IceType_i16] = IntegerRegisters; TypeToRegisterSet[IceType_i32] = IntegerRegisters; TypeToRegisterSet[IceType_i64] = I64PairRegisters; TypeToRegisterSet[IceType_f32] = Float32Registers; TypeToRegisterSet[IceType_f64] = Float64Registers; TypeToRegisterSet[IceType_v4i1] = VectorRegisters; TypeToRegisterSet[IceType_v8i1] = VectorRegisters; TypeToRegisterSet[IceType_v16i1] = VectorRegisters; TypeToRegisterSet[IceType_v16i8] = VectorRegisters; TypeToRegisterSet[IceType_v8i16] = VectorRegisters; TypeToRegisterSet[IceType_v4i32] = VectorRegisters; TypeToRegisterSet[IceType_v4f32] = VectorRegisters; TypeToRegisterSet[RegARM32::RCARM32_QtoS] = QtoSRegisters; for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i) TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i]; filterTypeToRegisterSet(Ctx, RegARM32::Reg_NUM, TypeToRegisterSet, llvm::array_lengthof(TypeToRegisterSet), [](RegNumT RegNum) -> std::string { // This function simply removes ", " from the // register name. 
std::string Name = RegARM32::getRegName(RegNum); constexpr const char RegSeparator[] = ", "; constexpr size_t RegSeparatorWidth = llvm::array_lengthof(RegSeparator) - 1; for (size_t Pos = Name.find(RegSeparator); Pos != std::string::npos; Pos = Name.find(RegSeparator)) { Name.replace(Pos, RegSeparatorWidth, ""); } return Name; }, getRegClassName); } namespace { void copyRegAllocFromInfWeightVariable64On32(const VarList &Vars) { for (Variable *Var : Vars) { auto *Var64 = llvm::dyn_cast
(Var); if (!Var64) { // This is not the variable we are looking for. continue; } // only allow infinite-weight i64 temporaries to be register allocated. assert(!Var64->hasReg() || Var64->mustHaveReg()); if (!Var64->hasReg()) { continue; } const auto FirstReg = RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(Var->getRegNum())); // This assumes little endian. Variable *Lo = Var64->getLo(); Variable *Hi = Var64->getHi(); assert(Lo->hasReg() == Hi->hasReg()); if (Lo->hasReg()) { continue; } Lo->setRegNum(FirstReg); Lo->setMustHaveReg(); Hi->setRegNum(RegNumT::fixme(FirstReg + 1)); Hi->setMustHaveReg(); } } } // end of anonymous namespace uint32_t TargetARM32::getCallStackArgumentsSizeBytes(const InstCall *Call) { TargetARM32::CallingConv CC; RegNumT DummyReg; size_t OutArgsSizeBytes = 0; for (SizeT i = 0, NumArgs = Call->getNumArgs(); i < NumArgs; ++i) { Operand *Arg = legalizeUndef(Call->getArg(i)); const Type Ty = Arg->getType(); if (isScalarIntegerType(Ty)) { if (CC.argInGPR(Ty, &DummyReg)) { continue; } } else { if (CC.argInVFP(Ty, &DummyReg)) { continue; } } OutArgsSizeBytes = applyStackAlignmentTy(OutArgsSizeBytes, Ty); OutArgsSizeBytes += typeWidthInBytesOnStack(Ty); } return applyStackAlignment(OutArgsSizeBytes); } void TargetARM32::genTargetHelperCallFor(Inst *Instr) { constexpr bool NoTailCall = false; constexpr bool IsTargetHelperCall = true; switch (Instr->getKind()) { default: return; case Inst::Arithmetic: { Variable *Dest = Instr->getDest(); const Type DestTy = Dest->getType(); const InstArithmetic::OpKind Op = llvm::cast
(Instr)->getOp(); if (isVectorType(DestTy)) { switch (Op) { default: break; case InstArithmetic::Fdiv: case InstArithmetic::Frem: case InstArithmetic::Sdiv: case InstArithmetic::Srem: case InstArithmetic::Udiv: case InstArithmetic::Urem: scalarizeArithmetic(Op, Dest, Instr->getSrc(0), Instr->getSrc(1)); Instr->setDeleted(); return; } } switch (DestTy) { default: return; case IceType_i64: { // Technically, ARM has its own aeabi routines, but we can use the // non-aeabi routine as well. LLVM uses __aeabi_ldivmod for div, but uses // the more standard __moddi3 for rem. RuntimeHelper HelperID = RuntimeHelper::H_Num; switch (Op) { default: return; case InstArithmetic::Udiv: HelperID = RuntimeHelper::H_udiv_i64; break; case InstArithmetic::Sdiv: HelperID = RuntimeHelper::H_sdiv_i64; break; case InstArithmetic::Urem: HelperID = RuntimeHelper::H_urem_i64; break; case InstArithmetic::Srem: HelperID = RuntimeHelper::H_srem_i64; break; } Operand *TargetHelper = Ctx->getRuntimeHelperFunc(HelperID); ARM32HelpersPreamble[TargetHelper] = &TargetARM32::preambleDivRem; constexpr SizeT MaxArgs = 2; auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(Instr->getSrc(0)); Call->addArg(Instr->getSrc(1)); Instr->setDeleted(); return; } case IceType_i32: case IceType_i16: case IceType_i8: { const bool HasHWDiv = hasCPUFeature(TargetARM32Features::HWDivArm); InstCast::OpKind CastKind; RuntimeHelper HelperID = RuntimeHelper::H_Num; switch (Op) { default: return; case InstArithmetic::Udiv: HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_udiv_i32; CastKind = InstCast::Zext; break; case InstArithmetic::Sdiv: HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_sdiv_i32; CastKind = InstCast::Sext; break; case InstArithmetic::Urem: HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_urem_i32; CastKind = InstCast::Zext; break; case InstArithmetic::Srem: HelperID = HasHWDiv ? RuntimeHelper::H_Num : RuntimeHelper::H_srem_i32; CastKind = InstCast::Sext; break; } if (HelperID == RuntimeHelper::H_Num) { // HelperID should only ever be undefined when the processor does not // have a hardware divider. If any other helpers are ever introduced, // the following assert will have to be modified. assert(HasHWDiv); return; } Operand *Src0 = Instr->getSrc(0); Operand *Src1 = Instr->getSrc(1); if (DestTy != IceType_i32) { // Src0 and Src1 have to be zero-, or signed-extended to i32. For Src0, // we just insert a InstCast right before the call to the helper. Variable *Src0_32 = Func->makeVariable(IceType_i32); Context.insert
(CastKind, Src0_32, Src0); Src0 = Src0_32; // For extending Src1, we will just insert an InstCast if Src1 is not a // Constant. If it is, then we extend it here, and not during program // runtime. This allows preambleDivRem to optimize-out the div-by-0 // check. if (auto *C = llvm::dyn_cast
(Src1)) { const int32_t ShAmt = (DestTy == IceType_i16) ? 16 : 24; int32_t NewC = C->getValue(); if (CastKind == InstCast::Zext) { NewC &= ~(0x80000000l >> ShAmt); } else { NewC = (NewC << ShAmt) >> ShAmt; } Src1 = Ctx->getConstantInt32(NewC); } else { Variable *Src1_32 = Func->makeVariable(IceType_i32); Context.insert
(CastKind, Src1_32, Src1); Src1 = Src1_32; } } Operand *TargetHelper = Ctx->getRuntimeHelperFunc(HelperID); ARM32HelpersPreamble[TargetHelper] = &TargetARM32::preambleDivRem; constexpr SizeT MaxArgs = 2; auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); assert(Src0->getType() == IceType_i32); Call->addArg(Src0); assert(Src1->getType() == IceType_i32); Call->addArg(Src1); Instr->setDeleted(); return; } case IceType_f64: case IceType_f32: { if (Op != InstArithmetic::Frem) { return; } constexpr SizeT MaxArgs = 2; Operand *TargetHelper = Ctx->getRuntimeHelperFunc( DestTy == IceType_f32 ? RuntimeHelper::H_frem_f32 : RuntimeHelper::H_frem_f64); auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(Instr->getSrc(0)); Call->addArg(Instr->getSrc(1)); Instr->setDeleted(); return; } } llvm::report_fatal_error("Control flow should never have reached here."); } case Inst::Cast: { Variable *Dest = Instr->getDest(); Operand *Src0 = Instr->getSrc(0); const Type DestTy = Dest->getType(); const Type SrcTy = Src0->getType(); auto *CastInstr = llvm::cast
(Instr); const InstCast::OpKind CastKind = CastInstr->getCastKind(); switch (CastKind) { default: return; case InstCast::Fptosi: case InstCast::Fptoui: { if (DestTy != IceType_i64) { return; } const bool DestIsSigned = CastKind == InstCast::Fptosi; const bool Src0IsF32 = isFloat32Asserting32Or64(SrcTy); Operand *TargetHelper = Ctx->getRuntimeHelperFunc( Src0IsF32 ? (DestIsSigned ? RuntimeHelper::H_fptosi_f32_i64 : RuntimeHelper::H_fptoui_f32_i64) : (DestIsSigned ? RuntimeHelper::H_fptosi_f64_i64 : RuntimeHelper::H_fptoui_f64_i64)); static constexpr SizeT MaxArgs = 1; auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(Src0); Instr->setDeleted(); return; } case InstCast::Sitofp: case InstCast::Uitofp: { if (SrcTy != IceType_i64) { return; } const bool SourceIsSigned = CastKind == InstCast::Sitofp; const bool DestIsF32 = isFloat32Asserting32Or64(Dest->getType()); Operand *TargetHelper = Ctx->getRuntimeHelperFunc( DestIsF32 ? (SourceIsSigned ? RuntimeHelper::H_sitofp_i64_f32 : RuntimeHelper::H_uitofp_i64_f32) : (SourceIsSigned ? RuntimeHelper::H_sitofp_i64_f64 : RuntimeHelper::H_uitofp_i64_f64)); static constexpr SizeT MaxArgs = 1; auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(Src0); Instr->setDeleted(); return; } case InstCast::Bitcast: { if (DestTy == SrcTy) { return; } Variable *CallDest = Dest; RuntimeHelper HelperID = RuntimeHelper::H_Num; switch (DestTy) { default: return; case IceType_i8: assert(SrcTy == IceType_v8i1); HelperID = RuntimeHelper::H_bitcast_8xi1_i8; CallDest = Func->makeVariable(IceType_i32); break; case IceType_i16: assert(SrcTy == IceType_v16i1); HelperID = RuntimeHelper::H_bitcast_16xi1_i16; CallDest = Func->makeVariable(IceType_i32); break; case IceType_v8i1: { assert(SrcTy == IceType_i8); HelperID = RuntimeHelper::H_bitcast_i8_8xi1; Variable *Src0AsI32 = Func->makeVariable(stackSlotType()); // Arguments to functions are required to be at least 32 bits wide. Context.insert
(InstCast::Zext, Src0AsI32, Src0); Src0 = Src0AsI32; } break; case IceType_v16i1: { assert(SrcTy == IceType_i16); HelperID = RuntimeHelper::H_bitcast_i16_16xi1; Variable *Src0AsI32 = Func->makeVariable(stackSlotType()); // Arguments to functions are required to be at least 32 bits wide. Context.insert
(InstCast::Zext, Src0AsI32, Src0); Src0 = Src0AsI32; } break; } constexpr SizeT MaxSrcs = 1; InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs); Call->addArg(Src0); Context.insert(Call); // The PNaCl ABI disallows i8/i16 return types, so truncate the helper // call result to the appropriate type as necessary. if (CallDest->getType() != Dest->getType()) Context.insert
(InstCast::Trunc, Dest, CallDest); Instr->setDeleted(); return; } case InstCast::Trunc: { if (DestTy == SrcTy) { return; } if (!isVectorType(SrcTy)) { return; } assert(typeNumElements(DestTy) == typeNumElements(SrcTy)); assert(typeElementType(DestTy) == IceType_i1); assert(isVectorIntegerType(SrcTy)); return; } case InstCast::Sext: case InstCast::Zext: { if (DestTy == SrcTy) { return; } if (!isVectorType(DestTy)) { return; } assert(typeNumElements(DestTy) == typeNumElements(SrcTy)); assert(typeElementType(SrcTy) == IceType_i1); assert(isVectorIntegerType(DestTy)); return; } } llvm::report_fatal_error("Control flow should never have reached here."); } case Inst::IntrinsicCall: { Variable *Dest = Instr->getDest(); auto *IntrinsicCall = llvm::cast
(Instr); Intrinsics::IntrinsicID ID = IntrinsicCall->getIntrinsicInfo().ID; switch (ID) { default: return; case Intrinsics::Ctpop: { Operand *Src0 = IntrinsicCall->getArg(0); Operand *TargetHelper = Ctx->getRuntimeHelperFunc(isInt32Asserting32Or64(Src0->getType()) ? RuntimeHelper::H_call_ctpop_i32 : RuntimeHelper::H_call_ctpop_i64); static constexpr SizeT MaxArgs = 1; auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(Src0); Instr->setDeleted(); if (Src0->getType() == IceType_i64) { ARM32HelpersPostamble[TargetHelper] = &TargetARM32::postambleCtpop64; } return; } case Intrinsics::Longjmp: { static constexpr SizeT MaxArgs = 2; static constexpr Variable *NoDest = nullptr; Operand *TargetHelper = Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_longjmp); auto *Call = Context.insert
(MaxArgs, NoDest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(IntrinsicCall->getArg(0)); Call->addArg(IntrinsicCall->getArg(1)); Instr->setDeleted(); return; } case Intrinsics::Memcpy: { // In the future, we could potentially emit an inline memcpy/memset, etc. // for intrinsic calls w/ a known length. static constexpr SizeT MaxArgs = 3; static constexpr Variable *NoDest = nullptr; Operand *TargetHelper = Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memcpy); auto *Call = Context.insert
(MaxArgs, NoDest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(IntrinsicCall->getArg(0)); Call->addArg(IntrinsicCall->getArg(1)); Call->addArg(IntrinsicCall->getArg(2)); Instr->setDeleted(); return; } case Intrinsics::Memmove: { static constexpr SizeT MaxArgs = 3; static constexpr Variable *NoDest = nullptr; Operand *TargetHelper = Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memmove); auto *Call = Context.insert
(MaxArgs, NoDest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(IntrinsicCall->getArg(0)); Call->addArg(IntrinsicCall->getArg(1)); Call->addArg(IntrinsicCall->getArg(2)); Instr->setDeleted(); return; } case Intrinsics::Memset: { // The value operand needs to be extended to a stack slot size because the // PNaCl ABI requires arguments to be at least 32 bits wide. Operand *ValOp = IntrinsicCall->getArg(1); assert(ValOp->getType() == IceType_i8); Variable *ValExt = Func->makeVariable(stackSlotType()); Context.insert
(InstCast::Zext, ValExt, ValOp); // Technically, ARM has its own __aeabi_memset, but we can use plain // memset too. The value and size argument need to be flipped if we ever // decide to use __aeabi_memset. static constexpr SizeT MaxArgs = 3; static constexpr Variable *NoDest = nullptr; Operand *TargetHelper = Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_memset); auto *Call = Context.insert
(MaxArgs, NoDest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(IntrinsicCall->getArg(0)); Call->addArg(ValExt); Call->addArg(IntrinsicCall->getArg(2)); Instr->setDeleted(); return; } case Intrinsics::NaClReadTP: { if (SandboxingType == ST_NaCl) { return; } static constexpr SizeT MaxArgs = 0; Operand *TargetHelper = SandboxingType == ST_Nonsfi ? Ctx->getConstantExternSym( Ctx->getGlobalString("__aeabi_read_tp")) : Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_read_tp); Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Instr->setDeleted(); return; } case Intrinsics::Setjmp: { static constexpr SizeT MaxArgs = 1; Operand *TargetHelper = Ctx->getRuntimeHelperFunc(RuntimeHelper::H_call_setjmp); auto *Call = Context.insert
(MaxArgs, Dest, TargetHelper, NoTailCall, IsTargetHelperCall); Call->addArg(IntrinsicCall->getArg(0)); Instr->setDeleted(); return; } } llvm::report_fatal_error("Control flow should never have reached here."); } } } void TargetARM32::findMaxStackOutArgsSize() { // MinNeededOutArgsBytes should be updated if the Target ever creates a // high-level InstCall that requires more stack bytes. constexpr size_t MinNeededOutArgsBytes = 0; MaxOutArgsSizeBytes = MinNeededOutArgsBytes; for (CfgNode *Node : Func->getNodes()) { Context.init(Node); while (!Context.atEnd()) { PostIncrLoweringContext PostIncrement(Context); Inst *CurInstr = iteratorToInst(Context.getCur()); if (auto *Call = llvm::dyn_cast
(CurInstr)) { SizeT OutArgsSizeBytes = getCallStackArgumentsSizeBytes(Call); MaxOutArgsSizeBytes = std::max(MaxOutArgsSizeBytes, OutArgsSizeBytes); } } } } void TargetARM32::createGotPtr() { if (SandboxingType != ST_Nonsfi) { return; } GotPtr = Func->makeVariable(IceType_i32); } void TargetARM32::insertGotPtrInitPlaceholder() { if (SandboxingType != ST_Nonsfi) { return; } assert(GotPtr != nullptr); // We add the two placeholder instructions here. The first fakedefs T, an // infinite-weight temporary, while the second fakedefs the GotPtr "using" T. // This is needed because the GotPtr initialization, if needed, will require // a register: // // movw reg, _GLOBAL_OFFSET_TABLE_ - 16 - . // movt reg, _GLOBAL_OFFSET_TABLE_ - 12 - . // add reg, pc, reg // mov GotPtr, reg // // If GotPtr is not used, then both these pseudo-instructions are dce'd. Variable *T = makeReg(IceType_i32); Context.insert
(T); Context.insert
(GotPtr, T); } GlobalString TargetARM32::createGotoffRelocation(const ConstantRelocatable *CR) { GlobalString CRName = CR->getName(); GlobalString CRGotoffName = Ctx->getGlobalString("GOTOFF$" + Func->getFunctionName() + "$" + CRName); if (KnownGotoffs.count(CRGotoffName) == 0) { constexpr bool SuppressMangling = true; auto *Global = VariableDeclaration::create(Func->getGlobalPool(), SuppressMangling); Global->setIsConstant(true); Global->setName(CRName); Func->getGlobalPool()->willNotBeEmitted(Global); auto *Gotoff = VariableDeclaration::create(Func->getGlobalPool(), SuppressMangling); constexpr auto GotFixup = R_ARM_GOTOFF32; Gotoff->setIsConstant(true); Gotoff->addInitializer(VariableDeclaration::RelocInitializer::create( Func->getGlobalPool(), Global, {RelocOffset::create(Ctx, 0)}, GotFixup)); Gotoff->setName(CRGotoffName); Func->addGlobal(Gotoff); KnownGotoffs.emplace(CRGotoffName); } return CRGotoffName; } void TargetARM32::materializeGotAddr(CfgNode *Node) { if (SandboxingType != ST_Nonsfi) { return; } // At first, we try to find the // GotPtr = def T // pseudo-instruction that we placed for defining the got ptr. That // instruction is not just a place-holder for defining the GotPtr (thus // keeping liveness consistent), but it is also located at a point where it is // safe to materialize the got addr -- i.e., before loading parameters to // registers, but after moving register parameters from their home location. InstFakeDef *DefGotPtr = nullptr; for (auto &Inst : Node->getInsts()) { auto *FakeDef = llvm::dyn_cast
(&Inst); if (FakeDef != nullptr && FakeDef->getDest() == GotPtr) { DefGotPtr = FakeDef; break; } } if (DefGotPtr == nullptr || DefGotPtr->isDeleted()) { return; } // The got addr needs to be materialized at the same point where DefGotPtr // lives. Context.setInsertPoint(instToIterator(DefGotPtr)); assert(DefGotPtr->getSrcSize() == 1); auto *T = llvm::cast
(DefGotPtr->getSrc(0)); loadNamedConstantRelocatablePIC(Ctx->getGlobalString(GlobalOffsetTable), T, [this, T](Variable *PC) { _add(T, PC, T); }); _mov(GotPtr, T); DefGotPtr->setDeleted(); } void TargetARM32::loadNamedConstantRelocatablePIC( GlobalString Name, Variable *Register, std::function
Finish) { assert(SandboxingType == ST_Nonsfi); // We makeReg() here instead of getPhysicalRegister() because the latter ends // up creating multi-blocks temporaries that liveness fails to validate. auto *PC = makeReg(IceType_i32, RegARM32::Reg_pc); auto *AddPcReloc = RelocOffset::create(Ctx); AddPcReloc->setSubtract(true); auto *AddPcLabel = InstARM32Label::create(Func, this); AddPcLabel->setRelocOffset(AddPcReloc); auto *MovwReloc = RelocOffset::create(Ctx); auto *MovwLabel = InstARM32Label::create(Func, this); MovwLabel->setRelocOffset(MovwReloc); auto *MovtReloc = RelocOffset::create(Ctx); auto *MovtLabel = InstARM32Label::create(Func, this); MovtLabel->setRelocOffset(MovtReloc); // The EmitString for these constant relocatables have hardcoded offsets // attached to them. This could be dangerous if, e.g., we ever implemented // instruction scheduling but llvm-mc currently does not support // // movw reg, #:lower16:(Symbol - Label - Number) // movt reg, #:upper16:(Symbol - Label - Number) // // relocations. static constexpr RelocOffsetT PcOffset = -8; auto *CRLower = Ctx->getConstantSymWithEmitString( PcOffset, {MovwReloc, AddPcReloc}, Name, Name + " -16"); auto *CRUpper = Ctx->getConstantSymWithEmitString( PcOffset, {MovtReloc, AddPcReloc}, Name, Name + " -12"); Context.insert(MovwLabel); _movw(Register, CRLower); Context.insert(MovtLabel); _movt(Register, CRUpper); // PC = fake-def to keep liveness consistent. Context.insert
(PC); Context.insert(AddPcLabel); Finish(PC); } void TargetARM32::translateO2() { TimerMarker T(TimerStack::TT_O2, Func); // TODO(stichnot): share passes with other targets? // https://code.google.com/p/nativeclient/issues/detail?id=4094 if (SandboxingType == ST_Nonsfi) { createGotPtr(); } genTargetHelperCalls(); findMaxStackOutArgsSize(); // Do not merge Alloca instructions, and lay out the stack. static constexpr bool SortAndCombineAllocas = true; Func->processAllocas(SortAndCombineAllocas); Func->dump("After Alloca processing"); if (!getFlags().getEnablePhiEdgeSplit()) { // Lower Phi instructions. Func->placePhiLoads(); if (Func->hasError()) return; Func->placePhiStores(); if (Func->hasError()) return; Func->deletePhis(); if (Func->hasError()) return; Func->dump("After Phi lowering"); } // Address mode optimization. Func->getVMetadata()->init(VMK_SingleDefs); Func->doAddressOpt(); Func->materializeVectorShuffles(); // Argument lowering Func->doArgLowering(); // Target lowering. This requires liveness analysis for some parts of the // lowering decisions, such as compare/branch fusing. If non-lightweight // liveness analysis is used, the instructions need to be renumbered first. // TODO: This renumbering should only be necessary if we're actually // calculating live intervals, which we only do for register allocation. Func->renumberInstructions(); if (Func->hasError()) return; // TODO: It should be sufficient to use the fastest liveness calculation, // i.e. livenessLightweight(). However, for some reason that slows down the // rest of the translation. Investigate. Func->liveness(Liveness_Basic); if (Func->hasError()) return; Func->dump("After ARM32 address mode opt"); if (SandboxingType == ST_Nonsfi) { insertGotPtrInitPlaceholder(); } Func->genCode(); if (Func->hasError()) return; Func->dump("After ARM32 codegen"); // Register allocation. This requires instruction renumbering and full // liveness analysis. 
Func->renumberInstructions(); if (Func->hasError()) return; Func->liveness(Liveness_Intervals); if (Func->hasError()) return; // The post-codegen dump is done here, after liveness analysis and associated // cleanup, to make the dump cleaner and more useful. Func->dump("After initial ARM32 codegen"); // Validate the live range computations. The expensive validation call is // deliberately only made when assertions are enabled. assert(Func->validateLiveness()); Func->getVMetadata()->init(VMK_All); regAlloc(RAK_Global); if (Func->hasError()) return; copyRegAllocFromInfWeightVariable64On32(Func->getVariables()); Func->dump("After linear scan regalloc"); if (getFlags().getEnablePhiEdgeSplit()) { Func->advancedPhiLowering(); Func->dump("After advanced Phi lowering"); } ForbidTemporaryWithoutReg _(this); // Stack frame mapping. Func->genFrame(); if (Func->hasError()) return; Func->dump("After stack frame mapping"); postLowerLegalization(); if (Func->hasError()) return; Func->dump("After postLowerLegalization"); Func->contractEmptyNodes(); Func->reorderNodes(); // Branch optimization. This needs to be done just before code emission. In // particular, no transformations that insert or reorder CfgNodes should be // done after branch optimization. We go ahead and do it before nop insertion // to reduce the amount of work needed for searching for opportunities. Func->doBranchOpt(); Func->dump("After branch optimization"); // Nop insertion if (getFlags().getShouldDoNopInsertion()) { Func->doNopInsertion(); } } void TargetARM32::translateOm1() { TimerMarker T(TimerStack::TT_Om1, Func); // TODO(stichnot): share passes with other targets? if (SandboxingType == ST_Nonsfi) { createGotPtr(); } genTargetHelperCalls(); findMaxStackOutArgsSize(); // Do not merge Alloca instructions, and lay out the stack. 
static constexpr bool DontSortAndCombineAllocas = false; Func->processAllocas(DontSortAndCombineAllocas); Func->dump("After Alloca processing"); Func->placePhiLoads(); if (Func->hasError()) return; Func->placePhiStores(); if (Func->hasError()) return; Func->deletePhis(); if (Func->hasError()) return; Func->dump("After Phi lowering"); Func->doArgLowering(); if (SandboxingType == ST_Nonsfi) { insertGotPtrInitPlaceholder(); } Func->genCode(); if (Func->hasError()) return; Func->dump("After initial ARM32 codegen"); regAlloc(RAK_InfOnly); if (Func->hasError()) return; copyRegAllocFromInfWeightVariable64On32(Func->getVariables()); Func->dump("After regalloc of infinite-weight variables"); ForbidTemporaryWithoutReg _(this); Func->genFrame(); if (Func->hasError()) return; Func->dump("After stack frame mapping"); postLowerLegalization(); if (Func->hasError()) return; Func->dump("After postLowerLegalization"); // Nop insertion if (getFlags().getShouldDoNopInsertion()) { Func->doNopInsertion(); } } uint32_t TargetARM32::getStackAlignment() const { return ARM32_STACK_ALIGNMENT_BYTES; } bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) { if (auto *Br = llvm::dyn_cast
(I)) { return Br->optimizeBranch(NextNode); } return false; } const char *TargetARM32::getRegName(RegNumT RegNum, Type Ty) const { (void)Ty; return RegARM32::getRegName(RegNum); } Variable *TargetARM32::getPhysicalRegister(RegNumT RegNum, Type Ty) { static const Type DefaultType[] = { #define X(val, encode, name, cc_arg, scratch, preserved, stackptr, frameptr, \ isGPR, isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init) \ (isFP32) \ ? IceType_f32 \ : ((isFP64) ? IceType_f64 : ((isVec128 ? IceType_v4i32 : IceType_i32))), REGARM32_TABLE #undef X }; if (Ty == IceType_void) { assert(unsigned(RegNum) < llvm::array_lengthof(DefaultType)); Ty = DefaultType[RegNum]; } if (PhysicalRegisters[Ty].empty()) PhysicalRegisters[Ty].resize(RegARM32::Reg_NUM); assert(unsigned(RegNum) < PhysicalRegisters[Ty].size()); Variable *Reg = PhysicalRegisters[Ty][RegNum]; if (Reg == nullptr) { Reg = Func->makeVariable(Ty); Reg->setRegNum(RegNum); PhysicalRegisters[Ty][RegNum] = Reg; // Specially mark a named physical register as an "argument" so that it is // considered live upon function entry. Otherwise it's possible to get // liveness validation errors for saving callee-save registers. Func->addImplicitArg(Reg); // Don't bother tracking the live range of a named physical register. 
Reg->setIgnoreLiveness(); } return Reg; } void TargetARM32::emitJumpTable(const Cfg *Func, const InstJumpTable *JumpTable) const { (void)Func; (void)JumpTable; UnimplementedError(getFlags()); } void TargetARM32::emitVariable(const Variable *Var) const { if (!BuildDefs::dump()) return; Ostream &Str = Ctx->getStrEmit(); if (Var->hasReg()) { Str << getRegName(Var->getRegNum(), Var->getType()); return; } if (Var->mustHaveReg()) { llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() + ") has no register assigned - function " + Func->getFunctionName()); } assert(!Var->isRematerializable()); int32_t Offset = Var->getStackOffset(); auto BaseRegNum = Var->getBaseRegNum(); if (BaseRegNum.hasNoValue()) { BaseRegNum = getFrameOrStackReg(); } const Type VarTy = Var->getType(); Str << "[" << getRegName(BaseRegNum, VarTy); if (Offset != 0) { Str << ", #" << Offset; } Str << "]"; } TargetARM32::CallingConv::CallingConv() : GPRegsUsed(RegARM32::Reg_NUM), GPRArgs(GPRArgInitializer.rbegin(), GPRArgInitializer.rend()), I64Args(I64ArgInitializer.rbegin(), I64ArgInitializer.rend()), VFPRegsUsed(RegARM32::Reg_NUM), FP32Args(FP32ArgInitializer.rbegin(), FP32ArgInitializer.rend()), FP64Args(FP64ArgInitializer.rbegin(), FP64ArgInitializer.rend()), Vec128Args(Vec128ArgInitializer.rbegin(), Vec128ArgInitializer.rend()) {} bool TargetARM32::CallingConv::argInGPR(Type Ty, RegNumT *Reg) { CfgVector
*Source; switch (Ty) { default: { assert(isScalarIntegerType(Ty)); Source = &GPRArgs; } break; case IceType_i64: { Source = &I64Args; } break; } discardUnavailableGPRsAndTheirAliases(Source); if (Source->empty()) { GPRegsUsed.set(); return false; } *Reg = Source->back(); // Note that we don't Source->pop_back() here. This is intentional. Notice how // we mark all of Reg's aliases as Used. So, for the next argument, // Source->back() is marked as unavailable, and it is thus implicitly popped // from the stack. GPRegsUsed |= RegisterAliases[*Reg]; return true; } // GPR are not packed when passing parameters. Thus, a function foo(i32, i64, // i32) will have the first argument in r0, the second in r1-r2, and the third // on the stack. To model this behavior, whenever we pop a register from Regs, // we remove all of its aliases from the pool of available GPRs. This has the // effect of computing the "closure" on the GPR registers. void TargetARM32::CallingConv::discardUnavailableGPRsAndTheirAliases( CfgVector
*Regs) { while (!Regs->empty() && GPRegsUsed[Regs->back()]) { GPRegsUsed |= RegisterAliases[Regs->back()]; Regs->pop_back(); } } bool TargetARM32::CallingConv::argInVFP(Type Ty, RegNumT *Reg) { CfgVector
*Source; switch (Ty) { default: { assert(isVectorType(Ty)); Source = &Vec128Args; } break; case IceType_f32: { Source = &FP32Args; } break; case IceType_f64: { Source = &FP64Args; } break; } discardUnavailableVFPRegs(Source); if (Source->empty()) { VFPRegsUsed.set(); return false; } *Reg = Source->back(); VFPRegsUsed |= RegisterAliases[*Reg]; return true; } // Arguments in VFP registers are not packed, so we don't mark the popped // registers' aliases as unavailable. void TargetARM32::CallingConv::discardUnavailableVFPRegs( CfgVector
*Regs) { while (!Regs->empty() && VFPRegsUsed[Regs->back()]) { Regs->pop_back(); } } void TargetARM32::lowerArguments() { VarList &Args = Func->getArgs(); TargetARM32::CallingConv CC; // For each register argument, replace Arg in the argument list with the home // register. Then generate an instruction in the prolog to copy the home // register to the assigned location of Arg. Context.init(Func->getEntryNode()); Context.setInsertPoint(Context.getCur()); for (SizeT I = 0, E = Args.size(); I < E; ++I) { Variable *Arg = Args[I]; Type Ty = Arg->getType(); RegNumT RegNum; if (isScalarIntegerType(Ty)) { if (!CC.argInGPR(Ty, &RegNum)) { continue; } } else { if (!CC.argInVFP(Ty, &RegNum)) { continue; } } Variable *RegisterArg = Func->makeVariable(Ty); if (BuildDefs::dump()) { RegisterArg->setName(Func, "home_reg:" + Arg->getName()); } RegisterArg->setIsArg(); Arg->setIsArg(false); Args[I] = RegisterArg; switch (Ty) { default: { RegisterArg->setRegNum(RegNum); } break; case IceType_i64: { auto *RegisterArg64 = llvm::cast
(RegisterArg); RegisterArg64->initHiLo(Func); RegisterArg64->getLo()->setRegNum( RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(RegNum))); RegisterArg64->getHi()->setRegNum( RegNumT::fixme(RegARM32::getI64PairSecondGPRNum(RegNum))); } break; } Context.insert
(Arg, RegisterArg); } } // Helper function for addProlog(). // // This assumes Arg is an argument passed on the stack. This sets the frame // offset for Arg and updates InArgsSizeBytes according to Arg's width. For an // I64 arg that has been split into Lo and Hi components, it calls itself // recursively on the components, taking care to handle Lo first because of the // little-endian architecture. Lastly, this function generates an instruction // to copy Arg into its assigned register if applicable. void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr, size_t BasicFrameOffset, size_t *InArgsSizeBytes) { const Type Ty = Arg->getType(); *InArgsSizeBytes = applyStackAlignmentTy(*InArgsSizeBytes, Ty); if (auto *Arg64On32 = llvm::dyn_cast
(Arg)) { Variable *const Lo = Arg64On32->getLo(); Variable *const Hi = Arg64On32->getHi(); finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes); finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes); return; } assert(Ty != IceType_i64); const int32_t ArgStackOffset = BasicFrameOffset + *InArgsSizeBytes; *InArgsSizeBytes += typeWidthInBytesOnStack(Ty); if (!Arg->hasReg()) { Arg->setStackOffset(ArgStackOffset); return; } // If the argument variable has been assigned a register, we need to copy the // value from the stack slot. Variable *Parameter = Func->makeVariable(Ty); Parameter->setMustNotHaveReg(); Parameter->setStackOffset(ArgStackOffset); _mov(Arg, Parameter); } Type TargetARM32::stackSlotType() { return IceType_i32; } void TargetARM32::addProlog(CfgNode *Node) { // Stack frame layout: // // +------------------------+ // | 1. preserved registers | // +------------------------+ // | 2. padding | // +------------------------+ <--- FramePointer (if used) // | 3. global spill area | // +------------------------+ // | 4. padding | // +------------------------+ // | 5. local spill area | // +------------------------+ // | 6. padding | // +------------------------+ // | 7. allocas (variable) | // +------------------------+ // | 8. padding | // +------------------------+ // | 9. out args | // +------------------------+ <--- StackPointer // // The following variables record the size in bytes of the given areas: // * PreservedRegsSizeBytes: area 1 // * SpillAreaPaddingBytes: area 2 // * GlobalsSize: area 3 // * GlobalsAndSubsequentPaddingSize: areas 3 - 4 // * LocalsSpillAreaSize: area 5 // * SpillAreaSizeBytes: areas 2 - 6, and 9 // * MaxOutArgsSizeBytes: area 9 // // Determine stack frame offsets for each Variable without a register // assignment. This can be done as one variable per stack slot. 
Or, do // coalescing by running the register allocator again with an infinite set of // registers (as a side effect, this gives variables a second chance at // physical register assignment). // // A middle ground approach is to leverage sparsity and allocate one block of // space on the frame for globals (variables with multi-block lifetime), and // one block to share for locals (single-block lifetime). Context.init(Node); Context.setInsertPoint(Context.getCur()); SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None); RegsUsed = SmallBitVector(CalleeSaves.size()); VarList SortedSpilledVariables; size_t GlobalsSize = 0; // If there is a separate locals area, this represents that area. Otherwise // it counts any variable not counted by GlobalsSize. SpillAreaSizeBytes = 0; // If there is a separate locals area, this specifies the alignment for it. uint32_t LocalsSlotsAlignmentBytes = 0; // The entire spill locations area gets aligned to largest natural alignment // of the variables that have a spill slot. uint32_t SpillAreaAlignmentBytes = 0; // For now, we don't have target-specific variables that need special // treatment (no stack-slot-linked SpillVariable type). std::function
TargetVarHook = [](Variable *Var) { static constexpr bool AssignStackSlot = false; static constexpr bool DontAssignStackSlot = !AssignStackSlot; if (llvm::isa
(Var)) { return DontAssignStackSlot; } return AssignStackSlot; }; // Compute the list of spilled variables and bounds for GlobalsSize, etc. getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize, &SpillAreaSizeBytes, &SpillAreaAlignmentBytes, &LocalsSlotsAlignmentBytes, TargetVarHook); uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes; SpillAreaSizeBytes += GlobalsSize; // Add push instructions for preserved registers. On ARM, "push" can push a // whole list of GPRs via a bitmask (0-15). Unlike x86, ARM also has // callee-saved float/vector registers. // // The "vpush" instruction can handle a whole list of float/vector registers, // but it only handles contiguous sequences of registers by specifying the // start and the length. PreservedGPRs.reserve(CalleeSaves.size()); PreservedSRegs.reserve(CalleeSaves.size()); // Consider FP and LR as callee-save / used as needed. if (UsesFramePointer) { if (RegsUsed[RegARM32::Reg_fp]) { llvm::report_fatal_error("Frame pointer has been used."); } CalleeSaves[RegARM32::Reg_fp] = true; RegsUsed[RegARM32::Reg_fp] = true; } if (!MaybeLeafFunc) { CalleeSaves[RegARM32::Reg_lr] = true; RegsUsed[RegARM32::Reg_lr] = true; } // Make two passes over the used registers. The first pass records all the // used registers -- and their aliases. Then, we figure out which GPRs and // VFP S registers should be saved. We don't bother saving D/Q registers // because their uses are recorded as S regs uses. SmallBitVector ToPreserve(RegARM32::Reg_NUM); for (SizeT i = 0; i < CalleeSaves.size(); ++i) { if (NeedSandboxing && i == RegARM32::Reg_r9) { // r9 is never updated in sandboxed code. continue; } if (CalleeSaves[i] && RegsUsed[i]) { ToPreserve |= RegisterAliases[i]; } } uint32_t NumCallee = 0; size_t PreservedRegsSizeBytes = 0; // RegClasses is a tuple of // //
// // We use this tuple to figure out which register we should push/pop during // prolog/epilog. using RegClassType = std::tuple
; const RegClassType RegClasses[] = { RegClassType(RegARM32::Reg_GPR_First, RegARM32::Reg_GPR_Last, &PreservedGPRs), RegClassType(RegARM32::Reg_SREG_First, RegARM32::Reg_SREG_Last, &PreservedSRegs)}; for (const auto &RegClass : RegClasses) { const uint32_t FirstRegInClass = std::get<0>(RegClass); const uint32_t LastRegInClass = std::get<1>(RegClass); VarList *const PreservedRegsInClass = std::get<2>(RegClass); for (uint32_t Reg = FirstRegInClass; Reg <= LastRegInClass; ++Reg) { if (!ToPreserve[Reg]) { continue; } ++NumCallee; Variable *PhysicalRegister = getPhysicalRegister(RegNumT::fromInt(Reg)); PreservedRegsSizeBytes += typeWidthInBytesOnStack(PhysicalRegister->getType()); PreservedRegsInClass->push_back(PhysicalRegister); } } Ctx->statsUpdateRegistersSaved(NumCallee); if (!PreservedSRegs.empty()) _push(PreservedSRegs); if (!PreservedGPRs.empty()) _push(PreservedGPRs); // Generate "mov FP, SP" if needed. if (UsesFramePointer) { Variable *FP = getPhysicalRegister(RegARM32::Reg_fp); Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); _mov(FP, SP); // Keep FP live for late-stage liveness analysis (e.g. asm-verbose mode). Context.insert
(FP); } // Align the variables area. SpillAreaPaddingBytes is the size of the region // after the preserved registers and before the spill areas. // LocalsSlotsPaddingBytes is the amount of padding between the globals and // locals area if they are separate. assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES); assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes); uint32_t SpillAreaPaddingBytes = 0; uint32_t LocalsSlotsPaddingBytes = 0; alignStackSpillAreas(PreservedRegsSizeBytes, SpillAreaAlignmentBytes, GlobalsSize, LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes, &LocalsSlotsPaddingBytes); SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes; uint32_t GlobalsAndSubsequentPaddingSize = GlobalsSize + LocalsSlotsPaddingBytes; // Adds the out args space to the stack, and align SP if necessary. if (!NeedsStackAlignment) { SpillAreaSizeBytes += MaxOutArgsSizeBytes; } else { uint32_t StackOffset = PreservedRegsSizeBytes; uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes); StackSize = applyStackAlignment(StackSize + MaxOutArgsSizeBytes); SpillAreaSizeBytes = StackSize - StackOffset; } // Combine fixed alloca with SpillAreaSize. SpillAreaSizeBytes += FixedAllocaSizeBytes; // Generate "sub sp, SpillAreaSizeBytes" if (SpillAreaSizeBytes) { // Use the scratch register if needed to legalize the immediate. Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes), Legal_Reg | Legal_Flex, getReservedTmpReg()); Sandboxer(this).sub_sp(SubAmount); if (FixedAllocaAlignBytes > ARM32_STACK_ALIGNMENT_BYTES) { Sandboxer(this).align_sp(FixedAllocaAlignBytes); } } Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes); // Fill in stack offsets for stack args, and copy args into registers for // those that were register-allocated. Args are pushed right to left, so // Arg[0] is closest to the stack/frame pointer. 
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg()); size_t BasicFrameOffset = PreservedRegsSizeBytes; if (!UsesFramePointer) BasicFrameOffset += SpillAreaSizeBytes; materializeGotAddr(Node); const VarList &Args = Func->getArgs(); size_t InArgsSizeBytes = 0; TargetARM32::CallingConv CC; for (Variable *Arg : Args) { RegNumT DummyReg; const Type Ty = Arg->getType(); // Skip arguments passed in registers. if (isScalarIntegerType(Ty)) { if (CC.argInGPR(Ty, &DummyReg)) { continue; } } else { if (CC.argInVFP(Ty, &DummyReg)) { continue; } } finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, &InArgsSizeBytes); } // Fill in stack offsets for locals. assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes, SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize, UsesFramePointer); this->HasComputedFrame = true; if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) { OstreamLocker _(Func->getContext()); Ostream &Str = Func->getContext()->getStrDump(); Str << "Stack layout:\n"; uint32_t SPAdjustmentPaddingSize = SpillAreaSizeBytes - LocalsSpillAreaSize - GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes - MaxOutArgsSizeBytes; Str << " in-args = " << InArgsSizeBytes << " bytes\n" << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n" << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n" << " globals spill area = " << GlobalsSize << " bytes\n" << " globals-locals spill areas intermediate padding = " << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n" << " locals spill area = " << LocalsSpillAreaSize << " bytes\n" << " SP alignment padding = " << SPAdjustmentPaddingSize << " bytes\n"; Str << "Stack details:\n" << " SP adjustment = " << SpillAreaSizeBytes << " bytes\n" << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n" << " outgoing args size = " << MaxOutArgsSizeBytes << " bytes\n" << " locals spill area alignment = " << LocalsSlotsAlignmentBytes << " bytes\n" << " is FP based = " << 
UsesFramePointer << "\n"; } } void TargetARM32::addEpilog(CfgNode *Node) { InstList &Insts = Node->getInsts(); InstList::reverse_iterator RI, E; for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) { if (llvm::isa
(*RI)) break; } if (RI == E) return; // Convert the reverse_iterator position into its corresponding (forward) // iterator position. InstList::iterator InsertPoint = reverseToForwardIterator(RI); --InsertPoint; Context.init(Node); Context.setInsertPoint(InsertPoint); Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); if (UsesFramePointer) { Variable *FP = getPhysicalRegister(RegARM32::Reg_fp); // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake // use of SP before the assignment of SP=FP keeps previous SP adjustments // from being dead-code eliminated. Context.insert
(SP); Sandboxer(this).reset_sp(FP); } else { // add SP, SpillAreaSizeBytes if (SpillAreaSizeBytes) { // Use the scratch register if needed to legalize the immediate. Operand *AddAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes), Legal_Reg | Legal_Flex, getReservedTmpReg()); Sandboxer(this).add_sp(AddAmount); } } if (!PreservedGPRs.empty()) _pop(PreservedGPRs); if (!PreservedSRegs.empty()) _pop(PreservedSRegs); if (!getFlags().getUseSandboxing()) return; // Change the original ret instruction into a sandboxed return sequence. // // bundle_lock // bic lr, #0xc000000f // bx lr // bundle_unlock // // This isn't just aligning to the getBundleAlignLog2Bytes(). It needs to // restrict to the lower 1GB as well. Variable *LR = getPhysicalRegister(RegARM32::Reg_lr); Variable *RetValue = nullptr; if (RI->getSrcSize()) RetValue = llvm::cast
(RI->getSrc(0)); Sandboxer(this).ret(LR, RetValue); RI->setDeleted(); } bool TargetARM32::isLegalMemOffset(Type Ty, int32_t Offset) const { constexpr bool ZeroExt = false; return OperandARM32Mem::canHoldOffset(Ty, ZeroExt, Offset); } Variable *TargetARM32::PostLoweringLegalizer::newBaseRegister( Variable *Base, int32_t Offset, RegNumT ScratchRegNum) { // Legalize will likely need a movw/movt combination, but if the top bits are // all 0 from negating the offset and subtracting, we could use that instead. const bool ShouldSub = Offset != 0 && (-Offset & 0xFFFF0000) == 0; Variable *ScratchReg = Target->makeReg(IceType_i32, ScratchRegNum); if (ShouldSub) { Operand *OffsetVal = Target->legalize(Target->Ctx->getConstantInt32(-Offset), Legal_Reg | Legal_Flex, ScratchRegNum); Target->_sub(ScratchReg, Base, OffsetVal); } else { Operand *OffsetVal = Target->legalize(Target->Ctx->getConstantInt32(Offset), Legal_Reg | Legal_Flex, ScratchRegNum); Target->_add(ScratchReg, Base, OffsetVal); } if (ScratchRegNum == Target->getReservedTmpReg()) { const bool BaseIsStackOrFramePtr = Base->getRegNum() == Target->getFrameOrStackReg(); // There is currently no code path that would trigger this assertion, so we // leave this assertion here in case it is ever violated. This is not a // fatal error (thus the use of assert() and not llvm::report_fatal_error) // as the program compiled by subzero will still work correctly. assert(BaseIsStackOrFramePtr); // Side-effect: updates TempBase to reflect the new Temporary. if (BaseIsStackOrFramePtr) { TempBaseReg = ScratchReg; TempBaseOffset = Offset; } else { TempBaseReg = nullptr; TempBaseOffset = 0; } } return ScratchReg; } OperandARM32Mem *TargetARM32::PostLoweringLegalizer::createMemOperand( Type Ty, Variable *Base, int32_t Offset, bool AllowOffsets) { assert(!Base->isRematerializable()); if (Offset == 0 || (AllowOffsets && Target->isLegalMemOffset(Ty, Offset))) { return OperandARM32Mem::create( Target->Func, Ty, Base, llvm::cast
(Target->Ctx->getConstantInt32(Offset)), OperandARM32Mem::Offset); } if (!AllowOffsets || TempBaseReg == nullptr) { newBaseRegister(Base, Offset, Target->getReservedTmpReg()); } int32_t OffsetDiff = Offset - TempBaseOffset; assert(AllowOffsets || OffsetDiff == 0); if (!Target->isLegalMemOffset(Ty, OffsetDiff)) { newBaseRegister(Base, Offset, Target->getReservedTmpReg()); OffsetDiff = 0; } assert(!TempBaseReg->isRematerializable()); return OperandARM32Mem::create( Target->Func, Ty, TempBaseReg, llvm::cast
(Target->Ctx->getConstantInt32(OffsetDiff)), OperandARM32Mem::Offset); } void TargetARM32::PostLoweringLegalizer::resetTempBaseIfClobberedBy( const Inst *Instr) { bool ClobbersTempBase = false; if (TempBaseReg != nullptr) { Variable *Dest = Instr->getDest(); if (llvm::isa
(Instr)) { // The following assertion is an invariant, so we remove it from the if // test. If the invariant is ever broken/invalidated/changed, remember // to add it back to the if condition. assert(TempBaseReg->getRegNum() == Target->getReservedTmpReg()); // The linker may need to clobber IP if the call is too far from PC. Thus, // we assume IP will be overwritten. ClobbersTempBase = true; } else if (Dest != nullptr && Dest->getRegNum() == TempBaseReg->getRegNum()) { // Register redefinition. ClobbersTempBase = true; } } if (ClobbersTempBase) { TempBaseReg = nullptr; TempBaseOffset = 0; } } void TargetARM32::PostLoweringLegalizer::legalizeMov(InstARM32Mov *MovInstr) { Variable *Dest = MovInstr->getDest(); assert(Dest != nullptr); Type DestTy = Dest->getType(); assert(DestTy != IceType_i64); Operand *Src = MovInstr->getSrc(0); Type SrcTy = Src->getType(); (void)SrcTy; assert(SrcTy != IceType_i64); if (MovInstr->isMultiDest() || MovInstr->isMultiSource()) return; bool Legalized = false; if (!Dest->hasReg()) { auto *SrcR = llvm::cast
(Src); assert(SrcR->hasReg()); assert(!SrcR->isRematerializable()); const int32_t Offset = Dest->getStackOffset(); // This is a _mov(Mem(), Variable), i.e., a store. TargetARM32::Sandboxer(Target) .str(SrcR, createMemOperand(DestTy, StackOrFrameReg, Offset), MovInstr->getPredicate()); // _str() does not have a Dest, so we add a fake-def(Dest). Target->Context.insert
(Dest); Legalized = true; } else if (auto *Var = llvm::dyn_cast
(Src)) { if (Var->isRematerializable()) { // This is equivalent to an x86 _lea(RematOffset(%esp/%ebp), Variable). // ExtraOffset is only needed for frame-pointer based frames as we have // to account for spill storage. const int32_t ExtraOffset = (Var->getRegNum() == Target->getFrameReg()) ? Target->getFrameFixedAllocaOffset() : 0; const int32_t Offset = Var->getStackOffset() + ExtraOffset; Variable *Base = Target->getPhysicalRegister(Var->getRegNum()); Variable *T = newBaseRegister(Base, Offset, Dest->getRegNum()); Target->_mov(Dest, T); Legalized = true; } else { if (!Var->hasReg()) { // This is a _mov(Variable, Mem()), i.e., a load. const int32_t Offset = Var->getStackOffset(); TargetARM32::Sandboxer(Target) .ldr(Dest, createMemOperand(DestTy, StackOrFrameReg, Offset), MovInstr->getPredicate()); Legalized = true; } } } if (Legalized) { if (MovInstr->isDestRedefined()) { Target->_set_dest_redefined(); } MovInstr->setDeleted(); } } // ARM32 address modes: // ld/st i[8|16|32]: [reg], [reg +/- imm12], [pc +/- imm12], // [reg +/- reg << shamt5] // ld/st f[32|64] : [reg], [reg +/- imm8] , [pc +/- imm8] // ld/st vectors : [reg] // // For now, we don't handle address modes with Relocatables. namespace { // MemTraits contains per-type valid address mode information. 
#define X(tag, elementty, int_width, fp_width, uvec_width, svec_width, sbits, \ ubits, rraddr, shaddr) \ static_assert(!(shaddr) || rraddr, "Check ICETYPEARM32_TABLE::" #tag); ICETYPEARM32_TABLE #undef X static const struct { int32_t ValidImmMask; bool CanHaveImm; bool CanHaveIndex; bool CanHaveShiftedIndex; } MemTraits[] = { #define X(tag, elementty, int_width, fp_width, uvec_width, svec_width, sbits, \ ubits, rraddr, shaddr) \ { (1 << ubits) - 1, (ubits) > 0, rraddr, shaddr, } \ , ICETYPEARM32_TABLE #undef X }; static constexpr SizeT MemTraitsSize = llvm::array_lengthof(MemTraits); } // end of anonymous namespace OperandARM32Mem * TargetARM32::PostLoweringLegalizer::legalizeMemOperand(OperandARM32Mem *Mem, bool AllowOffsets) { assert(!Mem->isRegReg() || !Mem->getIndex()->isRematerializable()); assert( Mem->isRegReg() || Target->isLegalMemOffset(Mem->getType(), Mem->getOffset()->getValue())); bool Legalized = false; Variable *Base = Mem->getBase(); int32_t Offset = Mem->isRegReg() ? 0 : Mem->getOffset()->getValue(); if (Base->isRematerializable()) { const int32_t ExtraOffset = (Base->getRegNum() == Target->getFrameReg()) ? 
Target->getFrameFixedAllocaOffset() : 0; Offset += Base->getStackOffset() + ExtraOffset; Base = Target->getPhysicalRegister(Base->getRegNum()); assert(!Base->isRematerializable()); Legalized = true; } if (!Legalized && !Target->NeedSandboxing) { return nullptr; } if (!Mem->isRegReg()) { return createMemOperand(Mem->getType(), Base, Offset, AllowOffsets); } if (Target->NeedSandboxing) { llvm::report_fatal_error("Reg-Reg address mode is not allowed."); } assert(MemTraits[Mem->getType()].CanHaveIndex); if (Offset != 0) { if (TempBaseReg == nullptr) { Base = newBaseRegister(Base, Offset, Target->getReservedTmpReg()); } else { uint32_t Imm8, Rotate; const int32_t OffsetDiff = Offset - TempBaseOffset; if (OffsetDiff == 0) { Base = TempBaseReg; } else if (OperandARM32FlexImm::canHoldImm(OffsetDiff, &Rotate, &Imm8)) { auto *OffsetDiffF = OperandARM32FlexImm::create( Target->Func, IceType_i32, Imm8, Rotate); Target->_add(TempBaseReg, TempBaseReg, OffsetDiffF); TempBaseOffset += OffsetDiff; Base = TempBaseReg; } else if (OperandARM32FlexImm::canHoldImm(-OffsetDiff, &Rotate, &Imm8)) { auto *OffsetDiffF = OperandARM32FlexImm::create( Target->Func, IceType_i32, Imm8, Rotate); Target->_sub(TempBaseReg, TempBaseReg, OffsetDiffF); TempBaseOffset += OffsetDiff; Base = TempBaseReg; } else { Base = newBaseRegister(Base, Offset, Target->getReservedTmpReg()); } } } return OperandARM32Mem::create(Target->Func, Mem->getType(), Base, Mem->getIndex(), Mem->getShiftOp(), Mem->getShiftAmt(), Mem->getAddrMode()); } void TargetARM32::postLowerLegalization() { // If a stack variable's frame offset doesn't fit, convert from: // ldr X, OFF[SP] // to: // movw/movt TMP, OFF_PART // add TMP, TMP, SP // ldr X, OFF_MORE[TMP] // // This is safe because we have reserved TMP, and add for ARM does not // clobber the flags register. Func->dump("Before postLowerLegalization"); assert(hasComputedFrame()); // Do a fairly naive greedy clustering for now. 
Pick the first stack slot // that's out of bounds and make a new base reg using the architecture's temp // register. If that works for the next slot, then great. Otherwise, create a // new base register, clobbering the previous base register. Never share a // base reg across different basic blocks. This isn't ideal if local and // multi-block variables are far apart and their references are interspersed. // It may help to be more coordinated about assign stack slot numbers and may // help to assign smaller offsets to higher-weight variables so that they // don't depend on this legalization. for (CfgNode *Node : Func->getNodes()) { Context.init(Node); // One legalizer per basic block, otherwise we would share the Temporary // Base Register between basic blocks. PostLoweringLegalizer Legalizer(this); while (!Context.atEnd()) { PostIncrLoweringContext PostIncrement(Context); Inst *CurInstr = iteratorToInst(Context.getCur()); // Check if the previous TempBaseReg is clobbered, and reset if needed. Legalizer.resetTempBaseIfClobberedBy(CurInstr); if (auto *MovInstr = llvm::dyn_cast
(CurInstr)) { Legalizer.legalizeMov(MovInstr); } else if (auto *LdrInstr = llvm::dyn_cast
(CurInstr)) { if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand( llvm::cast
(LdrInstr->getSrc(0)))) { Sandboxer(this) .ldr(CurInstr->getDest(), LegalMem, LdrInstr->getPredicate()); CurInstr->setDeleted(); } } else if (auto *LdrexInstr = llvm::dyn_cast
(CurInstr)) { constexpr bool DisallowOffsetsBecauseLdrex = false; if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand( llvm::cast
(LdrexInstr->getSrc(0)), DisallowOffsetsBecauseLdrex)) { Sandboxer(this) .ldrex(CurInstr->getDest(), LegalMem, LdrexInstr->getPredicate()); CurInstr->setDeleted(); } } else if (auto *StrInstr = llvm::dyn_cast
(CurInstr)) { if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand( llvm::cast
(StrInstr->getSrc(1)))) { Sandboxer(this).str(llvm::cast
(CurInstr->getSrc(0)), LegalMem, StrInstr->getPredicate()); CurInstr->setDeleted(); } } else if (auto *StrexInstr = llvm::dyn_cast
(CurInstr)) { constexpr bool DisallowOffsetsBecauseStrex = false; if (OperandARM32Mem *LegalMem = Legalizer.legalizeMemOperand( llvm::cast
(StrexInstr->getSrc(1)), DisallowOffsetsBecauseStrex)) { Sandboxer(this).strex(CurInstr->getDest(), llvm::cast
(CurInstr->getSrc(0)), LegalMem, StrexInstr->getPredicate()); CurInstr->setDeleted(); } } // Sanity-check: the Legalizer will either have no Temp, or it will be // bound to IP. Legalizer.assertNoTempOrAssignedToIP(); } } } Operand *TargetARM32::loOperand(Operand *Operand) { assert(Operand->getType() == IceType_i64); if (Operand->getType() != IceType_i64) return Operand; if (auto *Var64On32 = llvm::dyn_cast
(Operand)) return Var64On32->getLo(); if (auto *Const = llvm::dyn_cast
(Operand)) return Ctx->getConstantInt32(static_cast
(Const->getValue())); if (auto *Mem = llvm::dyn_cast
(Operand)) { // Conservatively disallow memory operands with side-effects (pre/post // increment) in case of duplication. assert(Mem->getAddrMode() == OperandARM32Mem::Offset || Mem->getAddrMode() == OperandARM32Mem::NegOffset); if (Mem->isRegReg()) { Variable *IndexR = legalizeToReg(Mem->getIndex()); return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(), IndexR, Mem->getShiftOp(), Mem->getShiftAmt(), Mem->getAddrMode()); } else { return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(), Mem->getOffset(), Mem->getAddrMode()); } } llvm::report_fatal_error("Unsupported operand type"); return nullptr; } Operand *TargetARM32::hiOperand(Operand *Operand) { assert(Operand->getType() == IceType_i64); if (Operand->getType() != IceType_i64) return Operand; if (auto *Var64On32 = llvm::dyn_cast
(Operand)) return Var64On32->getHi(); if (auto *Const = llvm::dyn_cast
(Operand)) { return Ctx->getConstantInt32( static_cast
(Const->getValue() >> 32)); } if (auto *Mem = llvm::dyn_cast
(Operand)) { // Conservatively disallow memory operands with side-effects in case of // duplication. assert(Mem->getAddrMode() == OperandARM32Mem::Offset || Mem->getAddrMode() == OperandARM32Mem::NegOffset); const Type SplitType = IceType_i32; if (Mem->isRegReg()) { // We have to make a temp variable T, and add 4 to either Base or Index. // The Index may be shifted, so adding 4 can mean something else. Thus, // prefer T := Base + 4, and use T as the new Base. Variable *Base = Mem->getBase(); Constant *Four = Ctx->getConstantInt32(4); Variable *NewBase = Func->makeVariable(Base->getType()); lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase, Base, Four)); Variable *BaseR = legalizeToReg(NewBase); Variable *IndexR = legalizeToReg(Mem->getIndex()); return OperandARM32Mem::create(Func, SplitType, BaseR, IndexR, Mem->getShiftOp(), Mem->getShiftAmt(), Mem->getAddrMode()); } else { Variable *Base = Mem->getBase(); ConstantInteger32 *Offset = Mem->getOffset(); assert(!Utils::WouldOverflowAdd(Offset->getValue(), 4)); int32_t NextOffsetVal = Offset->getValue() + 4; constexpr bool ZeroExt = false; if (!OperandARM32Mem::canHoldOffset(SplitType, ZeroExt, NextOffsetVal)) { // We have to make a temp variable and add 4 to either Base or Offset. // If we add 4 to Offset, this will convert a non-RegReg addressing // mode into a RegReg addressing mode. Since NaCl sandboxing disallows // RegReg addressing modes, prefer adding to base and replacing // instead. Thus we leave the old offset alone. Constant *_4 = Ctx->getConstantInt32(4); Variable *NewBase = Func->makeVariable(Base->getType()); lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase, Base, _4)); Base = NewBase; } else { Offset = llvm::cast
(Ctx->getConstantInt32(NextOffsetVal)); } Variable *BaseR = legalizeToReg(Base); return OperandARM32Mem::create(Func, SplitType, BaseR, Offset, Mem->getAddrMode()); } } llvm::report_fatal_error("Unsupported operand type"); return nullptr; } SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include, RegSetMask Exclude) const { SmallBitVector Registers(RegARM32::Reg_NUM); for (uint32_t i = 0; i < RegARM32::Reg_NUM; ++i) { const auto &Entry = RegARM32::RegTable[i]; if (Entry.Scratch && (Include & RegSet_CallerSave)) Registers[i] = true; if (Entry.Preserved && (Include & RegSet_CalleeSave)) Registers[i] = true; if (Entry.StackPtr && (Include & RegSet_StackPointer)) Registers[i] = true; if (Entry.FramePtr && (Include & RegSet_FramePointer)) Registers[i] = true; if (Entry.Scratch && (Exclude & RegSet_CallerSave)) Registers[i] = false; if (Entry.Preserved && (Exclude & RegSet_CalleeSave)) Registers[i] = false; if (Entry.StackPtr && (Exclude & RegSet_StackPointer)) Registers[i] = false; if (Entry.FramePtr && (Exclude & RegSet_FramePointer)) Registers[i] = false; } return Registers; } void TargetARM32::lowerAlloca(const InstAlloca *Instr) { // Conservatively require the stack to be aligned. Some stack adjustment // operations implemented below assume that the stack is aligned before the // alloca. All the alloca code ensures that the stack alignment is preserved // after the alloca. The stack alignment restriction can be relaxed in some // cases. NeedsStackAlignment = true; // For default align=0, set it to the real value 1, to avoid any // bit-manipulation problems below. const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes()); // LLVM enforces power of 2 alignment. 
assert(llvm::isPowerOf2_32(AlignmentParam)); assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES)); const uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES); const bool OverAligned = Alignment > ARM32_STACK_ALIGNMENT_BYTES; const bool OptM1 = Func->getOptLevel() == Opt_m1; const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset(); const bool UseFramePointer = hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1; if (UseFramePointer) setHasFramePointer(); Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); if (OverAligned) { Sandboxer(this).align_sp(Alignment); } Variable *Dest = Instr->getDest(); Operand *TotalSize = Instr->getSizeInBytes(); if (const auto *ConstantTotalSize = llvm::dyn_cast
(TotalSize)) { const uint32_t Value = Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment); // Constant size alloca. if (!UseFramePointer) { // If we don't need a Frame Pointer, this alloca has a known offset to the // stack pointer. We don't need adjust the stack pointer, nor assign any // value to Dest, as Dest is rematerializable. assert(Dest->isRematerializable()); FixedAllocaSizeBytes += Value; Context.insert
(Dest); return; } // If a frame pointer is required, then we need to store the alloca'd result // in Dest. Operand *SubAmountRF = legalize(Ctx->getConstantInt32(Value), Legal_Reg | Legal_Flex); Sandboxer(this).sub_sp(SubAmountRF); } else { // Non-constant sizes need to be adjusted to the next highest multiple of // the required alignment at runtime. TotalSize = legalize(TotalSize, Legal_Reg | Legal_Flex); Variable *T = makeReg(IceType_i32); _mov(T, TotalSize); Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1)); _add(T, T, AddAmount); alignRegisterPow2(T, Alignment); Sandboxer(this).sub_sp(T); } // Adds back a few bytes to SP to account for the out args area. Variable *T = SP; if (MaxOutArgsSizeBytes != 0) { T = makeReg(getPointerType()); Operand *OutArgsSizeRF = legalize( Ctx->getConstantInt32(MaxOutArgsSizeBytes), Legal_Reg | Legal_Flex); _add(T, SP, OutArgsSizeRF); } _mov(Dest, T); } void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) { if (isGuaranteedNonzeroInt(SrcLo) || isGuaranteedNonzeroInt(SrcHi)) return; Variable *SrcLoReg = legalizeToReg(SrcLo); switch (Ty) { default: llvm_unreachable( ("Unexpected type in div0Check: " + typeStdString(Ty)).c_str()); case IceType_i8: case IceType_i16: { Operand *ShAmtImm = shAmtImm(32 - getScalarIntBitWidth(Ty)); Variable *T = makeReg(IceType_i32); _lsls(T, SrcLoReg, ShAmtImm); Context.insert
(T); } break; case IceType_i32: { _tst(SrcLoReg, SrcLoReg); break; } case IceType_i64: { Variable *T = makeReg(IceType_i32); _orrs(T, SrcLoReg, legalize(SrcHi, Legal_Reg | Legal_Flex)); // T isn't going to be used, but we need the side-effect of setting flags // from this operation. Context.insert
(T); } } auto *Label = InstARM32Label::create(Func, this); _br(Label, CondARM32::NE); _trap(); Context.insert(Label); } void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R, Operand *Src1, ExtInstr ExtFunc, DivInstr DivFunc, bool IsRemainder) { div0Check(Dest->getType(), Src1, nullptr); Variable *Src1R = legalizeToReg(Src1); Variable *T0R = Src0R; Variable *T1R = Src1R; if (Dest->getType() != IceType_i32) { T0R = makeReg(IceType_i32); (this->*ExtFunc)(T0R, Src0R, CondARM32::AL); T1R = makeReg(IceType_i32); (this->*ExtFunc)(T1R, Src1R, CondARM32::AL); } if (hasCPUFeature(TargetARM32Features::HWDivArm)) { (this->*DivFunc)(T, T0R, T1R, CondARM32::AL); if (IsRemainder) { Variable *T2 = makeReg(IceType_i32); _mls(T2, T, T1R, T0R); T = T2; } _mov(Dest, T); } else { llvm::report_fatal_error("div should have already been turned into a call"); } } TargetARM32::SafeBoolChain TargetARM32::lowerInt1Arithmetic(const InstArithmetic *Instr) { Variable *Dest = Instr->getDest(); assert(Dest->getType() == IceType_i1); // So folding didn't work for Instr. Not a problem: We just need to // materialize the Sources, and perform the operation. We create regular // Variables (and not infinite-weight ones) because this call might recurse a // lot, and we might end up with tons of infinite weight temporaries. assert(Instr->getSrcSize() == 2); Variable *Src0 = Func->makeVariable(IceType_i1); SafeBoolChain Src0Safe = lowerInt1(Src0, Instr->getSrc(0)); Operand *Src1 = Instr->getSrc(1); SafeBoolChain Src1Safe = SBC_Yes; if (!llvm::isa
(Src1)) { Variable *Src1V = Func->makeVariable(IceType_i1); Src1Safe = lowerInt1(Src1V, Src1); Src1 = Src1V; } Variable *T = makeReg(IceType_i1); Src0 = legalizeToReg(Src0); Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex); switch (Instr->getOp()) { default: // If this Unreachable is ever executed, add the offending operation to // the list of valid consumers. llvm::report_fatal_error("Unhandled i1 Op"); case InstArithmetic::And: _and(T, Src0, Src1RF); break; case InstArithmetic::Or: _orr(T, Src0, Src1RF); break; case InstArithmetic::Xor: _eor(T, Src0, Src1RF); break; } _mov(Dest, T); return Src0Safe == SBC_Yes && Src1Safe == SBC_Yes ? SBC_Yes : SBC_No; }

namespace {
// NumericOperands is used during arithmetic/icmp lowering for constant folding.
// It holds the two source operands, and maintains some state as to whether one
// of them is a constant. If one of the operands is a constant, then it will be
// stored as the operation's second source, with a bit indicating whether the
// operands were swapped.
//
// The class is split into a base class with operand type-independent methods,
// and a derived, templated class, for each type of operand we want to fold
// constants for:
//
//   NumericOperandsBase --> NumericOperands<ConstantFloat>
//                       --> NumericOperands<ConstantDouble>
//                       --> NumericOperands<ConstantInteger32>
//
// NumericOperands<ConstantInteger32> also exposes helper methods for emitting
// inverted/negated immediates.
//
// NOTE(review): every template argument in this namespace was lost when the
// file was extracted and has been restored from the surrounding usage --
// confirm against the upstream source.
class NumericOperandsBase {
  NumericOperandsBase() = delete;
  NumericOperandsBase(const NumericOperandsBase &) = delete;
  NumericOperandsBase &operator=(const NumericOperandsBase &) = delete;

public:
  // Stores S0/S1 so that a constant operand, if any, ends up in Src1. Swapped
  // records whether that required exchanging the two operands.
  NumericOperandsBase(Operand *S0, Operand *S1)
      : Src0(NonConstOperand(S0, S1)), Src1(ConstOperand(S0, S1)),
        Swapped(Src0 == S1 && S0 != S1) {
    assert(Src0 != nullptr);
    assert(Src1 != nullptr);
    assert(Src0 != Src1 || S0 == S1);
  }

  // True when Src1 is a constant other than a relocatable.
  bool hasConstOperand() const {
    return llvm::isa<Constant>(Src1) && !llvm::isa<ConstantRelocatable>(Src1);
  }

  bool swappedOperands() const { return Swapped; }

  // Src0 (post-swap order), legalized to a register.
  Variable *src0R(TargetARM32 *Target) const {
    return legalizeToReg(Target, Src0);
  }

  // The caller's original first operand, legalized to a register.
  Variable *unswappedSrc0R(TargetARM32 *Target) const {
    return legalizeToReg(Target, Swapped ? Src1 : Src0);
  }

  // Src1 (post-swap order), legalized to a register or flex operand.
  Operand *src1RF(TargetARM32 *Target) const {
    return legalizeToRegOrFlex(Target, Src1);
  }

  // The caller's original second operand, legalized to a register.
  Variable *unswappedSrc1R(TargetARM32 *Target) const {
    return legalizeToReg(Target, Swapped ? Src0 : Src1);
  }

  Operand *src1() const { return Src1; }

protected:
  Operand *const Src0;
  Operand *const Src1;
  const bool Swapped;

  static Variable *legalizeToReg(TargetARM32 *Target, Operand *Src) {
    return Target->legalizeToReg(Src);
  }

  static Operand *legalizeToRegOrFlex(TargetARM32 *Target, Operand *Src) {
    return Target->legalize(Src,
                            TargetARM32::Legal_Reg | TargetARM32::Legal_Flex);
  }

private:
  // Picks the operand to store as Src0: the first non-constant one; when both
  // are constants, S1 only if it is a relocatable and S0 is not.
  static Operand *NonConstOperand(Operand *S0, Operand *S1) {
    if (!llvm::isa<Constant>(S0))
      return S0;
    if (!llvm::isa<Constant>(S1))
      return S1;
    if (llvm::isa<ConstantRelocatable>(S1) &&
        !llvm::isa<ConstantRelocatable>(S0))
      return S1;
    return S0;
  }

  // Picks the operand to store as Src1; always the one NonConstOperand did not
  // pick (or the same operand when S0 == S1).
  static Operand *ConstOperand(Operand *S0, Operand *S1) {
    if (!llvm::isa<Constant>(S0))
      return S1;
    if (!llvm::isa<Constant>(S1))
      return S0;
    if (llvm::isa<ConstantRelocatable>(S1) &&
        !llvm::isa<ConstantRelocatable>(S0))
      return S0;
    return S1;
  }
};

// Adds typed access to the constant operand. C is the constant class that Src1
// must be an instance of whenever hasConstOperand() is true.
template <typename C> class NumericOperands : public NumericOperandsBase {
  NumericOperands() = delete;
  NumericOperands(const NumericOperands &) = delete;
  NumericOperands &operator=(const NumericOperands &) = delete;

public:
  NumericOperands(Operand *S0, Operand *S1) : NumericOperandsBase(S0, S1) {
    assert(!hasConstOperand() || llvm::isa<C>(this->Src1));
  }

  // Only meaningful when hasConstOperand() is true; llvm::cast asserts
  // otherwise.
  typename C::PrimType getConstantValue() const {
    return llvm::cast<C>(Src1)->getValue();
  }
};

using FloatOperands = NumericOperands<ConstantFloat>;
using DoubleOperands = NumericOperands<ConstantDouble>;

class Int32Operands : public NumericOperands<ConstantInteger32> {
  Int32Operands() = delete;
  Int32Operands(const Int32Operands &) = delete;
  Int32Operands &operator=(const Int32Operands &) = delete;

public:
  Int32Operands(Operand *S0, Operand *S1) : NumericOperands(S0, S1) {}

  // The caller's original second operand as a shift amount: an immediate
  // masked to 5 bits when it is an (unswapped) constant, a register otherwise.
  Operand *unswappedSrc1RShAmtImm(TargetARM32 *Target) const {
    if (!swappedOperands() && hasConstOperand()) {
      return Target->shAmtImm(getConstantValue() & 0x1F);
    }
    return legalizeToReg(Target, Swapped ? Src0 : Src1);
  }

  // True only when the caller's original second operand is the constant 0.
  bool isSrc1ImmediateZero() const {
    if (!swappedOperands() && hasConstOperand()) {
      return getConstantValue() == 0;
    }
    return false;
  }

  bool immediateIsFlexEncodable() const {
    uint32_t Rotate, Imm8;
    return OperandARM32FlexImm::canHoldImm(getConstantValue(), &Rotate, &Imm8);
  }

  bool negatedImmediateIsFlexEncodable() const {
    uint32_t Rotate, Imm8;
    return OperandARM32FlexImm::canHoldImm(
        -static_cast<int32_t>(getConstantValue()), &Rotate, &Imm8);
  }

  Operand *negatedSrc1F(TargetARM32 *Target) const {
    return legalizeToRegOrFlex(Target,
                               Target->getCtx()->getConstantInt32(
                                   -static_cast<int32_t>(getConstantValue())));
  }

  bool invertedImmediateIsFlexEncodable() const {
    uint32_t Rotate, Imm8;
    return OperandARM32FlexImm::canHoldImm(
        ~static_cast<uint32_t>(getConstantValue()), &Rotate, &Imm8);
  }

  Operand *invertedSrc1F(TargetARM32 *Target) const {
    return legalizeToRegOrFlex(Target,
                               Target->getCtx()->getConstantInt32(
                                   ~static_cast<uint32_t>(getConstantValue())));
  }
};
} // end of anonymous namespace

void TargetARM32::preambleDivRem(const InstCall *Instr) { Operand *Src1 = Instr->getArg(1); switch (Src1->getType()) { default: llvm::report_fatal_error("Invalid type for idiv."); case IceType_i64: { if (auto *C = llvm::dyn_cast
(Src1)) { if (C->getValue() == 0) { _trap(); return; } } div0Check(IceType_i64, loOperand(Src1), hiOperand(Src1)); return; } case IceType_i32: { // Src0 and Src1 have already been appropriately extended to an i32, so we // don't check for i8 and i16. if (auto *C = llvm::dyn_cast
(Src1)) { if (C->getValue() == 0) { _trap(); return; } } div0Check(IceType_i32, Src1, nullptr); return; } } } void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op, Variable *Dest, Operand *Src0, Operand *Src1) { Int32Operands SrcsLo(loOperand(Src0), loOperand(Src1)); Int32Operands SrcsHi(hiOperand(Src0), hiOperand(Src1)); assert(SrcsLo.swappedOperands() == SrcsHi.swappedOperands()); assert(SrcsLo.hasConstOperand() == SrcsHi.hasConstOperand()); auto *DestLo = llvm::cast
(loOperand(Dest)); auto *DestHi = llvm::cast
(hiOperand(Dest)); Variable *T_Lo = makeReg(DestLo->getType()); Variable *T_Hi = makeReg(DestHi->getType()); switch (Op) { case InstArithmetic::_num: llvm::report_fatal_error("Unknown arithmetic operator"); return; case InstArithmetic::Add: { Variable *Src0LoR = SrcsLo.src0R(this); Operand *Src1LoRF = SrcsLo.src1RF(this); Variable *Src0HiR = SrcsHi.src0R(this); Operand *Src1HiRF = SrcsHi.src1RF(this); _adds(T_Lo, Src0LoR, Src1LoRF); _mov(DestLo, T_Lo); _adc(T_Hi, Src0HiR, Src1HiRF); _mov(DestHi, T_Hi); return; } case InstArithmetic::And: { Variable *Src0LoR = SrcsLo.src0R(this); Operand *Src1LoRF = SrcsLo.src1RF(this); Variable *Src0HiR = SrcsHi.src0R(this); Operand *Src1HiRF = SrcsHi.src1RF(this); _and(T_Lo, Src0LoR, Src1LoRF); _mov(DestLo, T_Lo); _and(T_Hi, Src0HiR, Src1HiRF); _mov(DestHi, T_Hi); return; } case InstArithmetic::Or: { Variable *Src0LoR = SrcsLo.src0R(this); Operand *Src1LoRF = SrcsLo.src1RF(this); Variable *Src0HiR = SrcsHi.src0R(this); Operand *Src1HiRF = SrcsHi.src1RF(this); _orr(T_Lo, Src0LoR, Src1LoRF); _mov(DestLo, T_Lo); _orr(T_Hi, Src0HiR, Src1HiRF); _mov(DestHi, T_Hi); return; } case InstArithmetic::Xor: { Variable *Src0LoR = SrcsLo.src0R(this); Operand *Src1LoRF = SrcsLo.src1RF(this); Variable *Src0HiR = SrcsHi.src0R(this); Operand *Src1HiRF = SrcsHi.src1RF(this); _eor(T_Lo, Src0LoR, Src1LoRF); _mov(DestLo, T_Lo); _eor(T_Hi, Src0HiR, Src1HiRF); _mov(DestHi, T_Hi); return; } case InstArithmetic::Sub: { Variable *Src0LoR = SrcsLo.src0R(this); Operand *Src1LoRF = SrcsLo.src1RF(this); Variable *Src0HiR = SrcsHi.src0R(this); Operand *Src1HiRF = SrcsHi.src1RF(this); if (SrcsLo.swappedOperands()) { _rsbs(T_Lo, Src0LoR, Src1LoRF); _mov(DestLo, T_Lo); _rsc(T_Hi, Src0HiR, Src1HiRF); _mov(DestHi, T_Hi); } else { _subs(T_Lo, Src0LoR, Src1LoRF); _mov(DestLo, T_Lo); _sbc(T_Hi, Src0HiR, Src1HiRF); _mov(DestHi, T_Hi); } return; } case InstArithmetic::Mul: { // GCC 4.8 does: // a=b*c ==> // t_acc =(mul) (b.lo * c.hi) // t_acc =(mla) (c.lo * b.hi) + t_acc 
// t.hi,t.lo =(umull) b.lo * c.lo // t.hi += t_acc // a.lo = t.lo // a.hi = t.hi // // LLVM does: // t.hi,t.lo =(umull) b.lo * c.lo // t.hi =(mla) (b.lo * c.hi) + t.hi // t.hi =(mla) (b.hi * c.lo) + t.hi // a.lo = t.lo // a.hi = t.hi // // LLVM's lowering has fewer instructions, but more register pressure: // t.lo is live from beginning to end, while GCC delays the two-dest // instruction till the end, and kills c.hi immediately. Variable *T_Acc = makeReg(IceType_i32); Variable *T_Acc1 = makeReg(IceType_i32); Variable *T_Hi1 = makeReg(IceType_i32); Variable *Src0RLo = SrcsLo.unswappedSrc0R(this); Variable *Src0RHi = SrcsHi.unswappedSrc0R(this); Variable *Src1RLo = SrcsLo.unswappedSrc1R(this); Variable *Src1RHi = SrcsHi.unswappedSrc1R(this); _mul(T_Acc, Src0RLo, Src1RHi); _mla(T_Acc1, Src1RLo, Src0RHi, T_Acc); _umull(T_Lo, T_Hi1, Src0RLo, Src1RLo); _add(T_Hi, T_Hi1, T_Acc1); _mov(DestLo, T_Lo); _mov(DestHi, T_Hi); return; } case InstArithmetic::Shl: { if (!SrcsLo.swappedOperands() && SrcsLo.hasConstOperand()) { Variable *Src0RLo = SrcsLo.src0R(this); // Truncating the ShAmt to [0, 63] because that's what ARM does anyway. const int32_t ShAmtImm = SrcsLo.getConstantValue() & 0x3F; if (ShAmtImm == 0) { _mov(DestLo, Src0RLo); _mov(DestHi, SrcsHi.src0R(this)); return; } if (ShAmtImm >= 32) { if (ShAmtImm == 32) { _mov(DestHi, Src0RLo); } else { Operand *ShAmtOp = shAmtImm(ShAmtImm - 32); _lsl(T_Hi, Src0RLo, ShAmtOp); _mov(DestHi, T_Hi); } Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); _mov(T_Lo, _0); _mov(DestLo, T_Lo); return; } Variable *Src0RHi = SrcsHi.src0R(this); Operand *ShAmtOp = shAmtImm(ShAmtImm); Operand *ComplShAmtOp = shAmtImm(32 - ShAmtImm); _lsl(T_Hi, Src0RHi, ShAmtOp); _orr(T_Hi, T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, Src0RLo, OperandARM32::LSR, ComplShAmtOp)); _mov(DestHi, T_Hi); _lsl(T_Lo, Src0RLo, ShAmtOp); _mov(DestLo, T_Lo); return; } // a=b<
// pnacl-llc does: // mov t_b.lo, b.lo // mov t_b.hi, b.hi // mov t_c.lo, c.lo // rsb T0, t_c.lo, #32 // lsr T1, t_b.lo, T0 // orr t_a.hi, T1, t_b.hi, lsl t_c.lo // sub T2, t_c.lo, #32 // cmp T2, #0 // lslge t_a.hi, t_b.lo, T2 // lsl t_a.lo, t_b.lo, t_c.lo // mov a.lo, t_a.lo // mov a.hi, t_a.hi // // GCC 4.8 does: // sub t_c1, c.lo, #32 // lsl t_hi, b.hi, c.lo // orr t_hi, t_hi, b.lo, lsl t_c1 // rsb t_c2, c.lo, #32 // orr t_hi, t_hi, b.lo, lsr t_c2 // lsl t_lo, b.lo, c.lo // a.lo = t_lo // a.hi = t_hi // // These are incompatible, therefore we mimic pnacl-llc. // Can be strength-reduced for constant-shifts, but we don't do that for // now. // Given the sub/rsb T_C, C.lo, #32, one of the T_C will be negative. On // ARM, shifts only take the lower 8 bits of the shift register, and // saturate to the range 0-32, so the negative value will saturate to 32. Operand *_32 = legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex); Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); Variable *T0 = makeReg(IceType_i32); Variable *T1 = makeReg(IceType_i32); Variable *T2 = makeReg(IceType_i32); Variable *TA_Hi = makeReg(IceType_i32); Variable *TA_Lo = makeReg(IceType_i32); Variable *Src0RLo = SrcsLo.unswappedSrc0R(this); Variable *Src0RHi = SrcsHi.unswappedSrc0R(this); Variable *Src1RLo = SrcsLo.unswappedSrc1R(this); _rsb(T0, Src1RLo, _32); _lsr(T1, Src0RLo, T0); _orr(TA_Hi, T1, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi, OperandARM32::LSL, Src1RLo)); _sub(T2, Src1RLo, _32); _cmp(T2, _0); _lsl(TA_Hi, Src0RLo, T2, CondARM32::GE); _set_dest_redefined(); _lsl(TA_Lo, Src0RLo, Src1RLo); _mov(DestLo, TA_Lo); _mov(DestHi, TA_Hi); return; } case InstArithmetic::Lshr: case InstArithmetic::Ashr: { const bool ASR = Op == InstArithmetic::Ashr; if (!SrcsLo.swappedOperands() && SrcsLo.hasConstOperand()) { Variable *Src0RHi = SrcsHi.src0R(this); // Truncating the ShAmt to [0, 63] because that's what ARM does anyway. 
const int32_t ShAmt = SrcsLo.getConstantValue() & 0x3F; if (ShAmt == 0) { _mov(DestHi, Src0RHi); _mov(DestLo, SrcsLo.src0R(this)); return; } if (ShAmt >= 32) { if (ShAmt == 32) { _mov(DestLo, Src0RHi); } else { Operand *ShAmtImm = shAmtImm(ShAmt - 32); if (ASR) { _asr(T_Lo, Src0RHi, ShAmtImm); } else { _lsr(T_Lo, Src0RHi, ShAmtImm); } _mov(DestLo, T_Lo); } if (ASR) { Operand *_31 = shAmtImm(31); _asr(T_Hi, Src0RHi, _31); } else { Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); _mov(T_Hi, _0); } _mov(DestHi, T_Hi); return; } Variable *Src0RLo = SrcsLo.src0R(this); Operand *ShAmtImm = shAmtImm(ShAmt); Operand *ComplShAmtImm = shAmtImm(32 - ShAmt); _lsr(T_Lo, Src0RLo, ShAmtImm); _orr(T_Lo, T_Lo, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi, OperandARM32::LSL, ComplShAmtImm)); _mov(DestLo, T_Lo); if (ASR) { _asr(T_Hi, Src0RHi, ShAmtImm); } else { _lsr(T_Hi, Src0RHi, ShAmtImm); } _mov(DestHi, T_Hi); return; } // a=b>>c // pnacl-llc does: // mov t_b.lo, b.lo // mov t_b.hi, b.hi // mov t_c.lo, c.lo // lsr T0, t_b.lo, t_c.lo // rsb T1, t_c.lo, #32 // orr t_a.lo, T0, t_b.hi, lsl T1 // sub T2, t_c.lo, #32 // cmp T2, #0 // [al]srge t_a.lo, t_b.hi, T2 // [al]sr t_a.hi, t_b.hi, t_c.lo // mov a.lo, t_a.lo // mov a.hi, t_a.hi // // GCC 4.8 does (lsr): // rsb t_c1, c.lo, #32 // lsr t_lo, b.lo, c.lo // orr t_lo, t_lo, b.hi, lsl t_c1 // sub t_c2, c.lo, #32 // orr t_lo, t_lo, b.hi, lsr t_c2 // lsr t_hi, b.hi, c.lo // mov a.lo, t_lo // mov a.hi, t_hi // // These are incompatible, therefore we mimic pnacl-llc. 
Operand *_32 = legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex); Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); Variable *T0 = makeReg(IceType_i32); Variable *T1 = makeReg(IceType_i32); Variable *T2 = makeReg(IceType_i32); Variable *TA_Lo = makeReg(IceType_i32); Variable *TA_Hi = makeReg(IceType_i32); Variable *Src0RLo = SrcsLo.unswappedSrc0R(this); Variable *Src0RHi = SrcsHi.unswappedSrc0R(this); Variable *Src1RLo = SrcsLo.unswappedSrc1R(this); _lsr(T0, Src0RLo, Src1RLo); _rsb(T1, Src1RLo, _32); _orr(TA_Lo, T0, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi, OperandARM32::LSL, T1)); _sub(T2, Src1RLo, _32); _cmp(T2, _0); if (ASR) { _asr(TA_Lo, Src0RHi, T2, CondARM32::GE); _set_dest_redefined(); _asr(TA_Hi, Src0RHi, Src1RLo); } else { _lsr(TA_Lo, Src0RHi, T2, CondARM32::GE); _set_dest_redefined(); _lsr(TA_Hi, Src0RHi, Src1RLo); } _mov(DestLo, TA_Lo); _mov(DestHi, TA_Hi); return; } case InstArithmetic::Fadd: case InstArithmetic::Fsub: case InstArithmetic::Fmul: case InstArithmetic::Fdiv: case InstArithmetic::Frem: llvm::report_fatal_error("FP instruction with i64 type"); return; case InstArithmetic::Udiv: case InstArithmetic::Sdiv: case InstArithmetic::Urem: case InstArithmetic::Srem: llvm::report_fatal_error("Call-helper-involved instruction for i64 type " "should have already been handled before"); return; } }

namespace {
// StrengthReduction is a namespace with the strength reduction machinery. The
// entry point is the StrengthReduction::tryToOptimize method. It returns true
// if the optimization can be performed, and false otherwise.
//
// If the optimization can be performed, tryToOptimize sets its NumOperations
// parameter to the number of shifts that are needed to perform the
// multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub>
// tuples that describe how to materialize the multiplication.
//
// The algorithm finds contiguous 1s in the Multiplication source, and uses one
// or two shifts to materialize it. A sequence of 1s, e.g.,
//
//                  M           N
//   ...00000000000011111...111110000000...
//
// is materializable with (1 << (M + 1)) - (1 << N):
//
//   ...00000000000100000...000000000000...      [1 << (M + 1)]
//   ...00000000000000000...000010000000... (-)  [1 << N]
//   --------------------------------------
//   ...00000000000011111...111110000000...
//
// And a single bit set, which is just a left shift.
//
// NOTE(review): the template parameter lists, the std::array arguments and the
// llvm::cast argument in this namespace were lost when the file was extracted
// and have been restored from the surrounding usage -- confirm against the
// upstream source.
namespace StrengthReduction {
enum AggregationOperation {
  AO_Invalid,
  AO_Add,
  AO_Sub,
};

// AggregationElement is a glorified <ShAmt, AddOrSub> tuple.
class AggregationElement {
  AggregationElement(const AggregationElement &) = delete;

public:
  AggregationElement() = default;
  AggregationElement &operator=(const AggregationElement &) = default;
  AggregationElement(AggregationOperation Op, uint32_t ShAmt)
      : Op(Op), ShAmt(ShAmt) {}

  // Returns OpR itself for a zero shift; otherwise a flex-register operand
  // denoting OpR shifted left (LSL) by ShAmt.
  Operand *createShiftedOperand(Cfg *Func, Variable *OpR) const {
    assert(OpR->mustHaveReg());
    if (ShAmt == 0) {
      return OpR;
    }
    return OperandARM32FlexReg::create(
        Func, IceType_i32, OpR, OperandARM32::LSL,
        OperandARM32ShAmtImm::create(
            Func, llvm::cast<ConstantInteger32>(
                      Func->getContext()->getConstantInt32(ShAmt))));
  }

  // True for AO_Add, false for AO_Sub; a default-constructed (AO_Invalid)
  // element is a fatal error.
  bool aggregateWithAdd() const {
    switch (Op) {
    case AO_Invalid:
      llvm::report_fatal_error("Invalid Strength Reduction Operations.");
    case AO_Add:
      return true;
    case AO_Sub:
      return false;
    }
    llvm_unreachable("(silence g++ warning)");
  }

  uint32_t shAmt() const { return ShAmt; }

private:
  AggregationOperation Op = AO_Invalid;
  // Initialized so that a default-constructed element never exposes an
  // indeterminate value through shAmt().
  uint32_t ShAmt = 0;
};

// [RangeStart, RangeEnd] is a range of 1s in Src. RangeStart is the most
// significant bit of the range, RangeEnd the least significant one.
//
// Appends the one or two elements that materialize the range to Operations,
// advancing *NumOperations. Returns false, leaving both untouched, when a
// two-element range does not fit in the remaining capacity.
template <std::size_t N>
bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations,
                   std::array<AggregationElement, N> *Operations) {
  assert(*NumOperations < N);
  if (RangeStart == RangeEnd) {
    // Single bit set:
    // Src           : 0...00010...
    // RangeStart    :        ^
    // RangeEnd      :        ^
    // NegSrc        : 0...00001...
    (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart);
    ++(*NumOperations);
    return true;
  }

  // Sequence of 1s: (two operations required.)
  // Src           : 0...00011...110...
  // RangeStart    :        ^
  // RangeEnd      :              ^
  // NegSrc        : 0...00000...001...
  if (*NumOperations + 1 >= N) {
    return false;
  }
  (*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart + 1);
  ++(*NumOperations);
  (*Operations)[*NumOperations] = AggregationElement(AO_Sub, RangeEnd);
  ++(*NumOperations);
  return true;
}

// tryToOptimize scans Src looking for sequences of 1s (including the unitary
// bit 1 surrounded by zeroes).
//
// Returns true when all of Src was decomposed into at most N elements; on a
// false return the contents of Operations and *NumOperations are partial and
// should be ignored.
template <std::size_t N>
bool tryToOptimize(uint32_t Src, SizeT *NumOperations,
                   std::array<AggregationElement, N> *Operations) {
  constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT;
  uint32_t NegSrc = ~Src;

  *NumOperations = 0;
  while (Src != 0 && *NumOperations < N) {
    // Each step of the algorithm:
    //   * finds L, the last bit set in Src;
    //   * clears all the upper bits in NegSrc up to bit L;
    //   * finds nL, the last bit set in NegSrc;
    //   * clears all the upper bits in Src up to bit nL;
    //
    // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence
    // of 1s starting at L, and ending at nL + 1, was found.
    const uint32_t SrcLastBitSet = llvm::findLastSet(Src);
    // The zero case is handled separately because shifting a 32-bit value by
    // 32 is undefined.
    const uint32_t NegSrcClearMask =
        (SrcLastBitSet == 0) ? 0
                             : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet);
    NegSrc &= NegSrcClearMask;
    if (NegSrc == 0) {
      // Every bit below L is set in Src: the range runs down to bit 0 and is
      // the last one.
      return addOperations(SrcLastBitSet, 0, NumOperations, Operations);
    }
    const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc);
    assert(NegSrcLastBitSet < SrcLastBitSet);
    const uint32_t SrcClearMask =
        (NegSrcLastBitSet == 0)
            ? 0
            : (0xFFFFFFFFu) >> (SrcSizeBits - NegSrcLastBitSet);
    Src &= SrcClearMask;
    if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations,
                       Operations)) {
      return false;
    }
  }

  return Src == 0;
}
} // end of namespace StrengthReduction
} // end of anonymous namespace

void TargetARM32::lowerArithmetic(const InstArithmetic *Instr) { Variable *Dest = Instr->getDest(); if (Dest->isRematerializable()) { Context.insert
(Dest); return; } Type DestTy = Dest->getType(); if (DestTy == IceType_i1) { lowerInt1Arithmetic(Instr); return; } Operand *Src0 = legalizeUndef(Instr->getSrc(0)); Operand *Src1 = legalizeUndef(Instr->getSrc(1)); if (DestTy == IceType_i64) { lowerInt64Arithmetic(Instr->getOp(), Instr->getDest(), Src0, Src1); return; } if (isVectorType(DestTy)) { switch (Instr->getOp()) { default: UnimplementedLoweringError(this, Instr); return; // Explicitly whitelist vector instructions we have implemented/enabled. case InstArithmetic::Add: case InstArithmetic::And: case InstArithmetic::Ashr: case InstArithmetic::Fadd: case InstArithmetic::Fmul: case InstArithmetic::Fsub: case InstArithmetic::Lshr: case InstArithmetic::Mul: case InstArithmetic::Or: case InstArithmetic::Shl: case InstArithmetic::Sub: case InstArithmetic::Xor: break; } } Variable *T = makeReg(DestTy); // * Handle div/rem separately. They require a non-legalized Src1 to inspect // whether or not Src1 is a non-zero constant. Once legalized it is more // difficult to determine (constant may be moved to a register). // * Handle floating point arithmetic separately: they require Src1 to be // legalized to a register. 
switch (Instr->getOp()) { default: break; case InstArithmetic::Udiv: { constexpr bool NotRemainder = false; Variable *Src0R = legalizeToReg(Src0); lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt, &TargetARM32::_udiv, NotRemainder); return; } case InstArithmetic::Sdiv: { constexpr bool NotRemainder = false; Variable *Src0R = legalizeToReg(Src0); lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt, &TargetARM32::_sdiv, NotRemainder); return; } case InstArithmetic::Urem: { constexpr bool IsRemainder = true; Variable *Src0R = legalizeToReg(Src0); lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt, &TargetARM32::_udiv, IsRemainder); return; } case InstArithmetic::Srem: { constexpr bool IsRemainder = true; Variable *Src0R = legalizeToReg(Src0); lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt, &TargetARM32::_sdiv, IsRemainder); return; } case InstArithmetic::Frem: { if (!isScalarFloatingType(DestTy)) { llvm::report_fatal_error("Unexpected type when lowering frem."); } llvm::report_fatal_error("Frem should have already been lowered."); } case InstArithmetic::Fadd: { Variable *Src0R = legalizeToReg(Src0); if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) { Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0)); Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1)); _vmla(Src0R, Src1R, Src2R); _mov(Dest, Src0R); return; } Variable *Src1R = legalizeToReg(Src1); _vadd(T, Src0R, Src1R); _mov(Dest, T); return; } case InstArithmetic::Fsub: { Variable *Src0R = legalizeToReg(Src0); if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) { Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0)); Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1)); _vmls(Src0R, Src1R, Src2R); _mov(Dest, Src0R); return; } Variable *Src1R = legalizeToReg(Src1); _vsub(T, Src0R, Src1R); _mov(Dest, T); return; } case InstArithmetic::Fmul: { Variable *Src0R = legalizeToReg(Src0); Variable *Src1R = legalizeToReg(Src1); _vmul(T, Src0R, Src1R); _mov(Dest, 
T); return; } case InstArithmetic::Fdiv: { Variable *Src0R = legalizeToReg(Src0); Variable *Src1R = legalizeToReg(Src1); _vdiv(T, Src0R, Src1R); _mov(Dest, T); return; } } // Handle everything else here. Int32Operands Srcs(Src0, Src1); switch (Instr->getOp()) { case InstArithmetic::_num: llvm::report_fatal_error("Unknown arithmetic operator"); return; case InstArithmetic::Add: { if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) { assert(!isVectorType(DestTy)); Variable *Src0R = legalizeToReg(Src0); Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0)); Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1)); _mla(T, Src1R, Src2R, Src0R); _mov(Dest, T); return; } if (Srcs.hasConstOperand()) { if (!Srcs.immediateIsFlexEncodable() && Srcs.negatedImmediateIsFlexEncodable()) { assert(!isVectorType(DestTy)); Variable *Src0R = Srcs.src0R(this); Operand *Src1F = Srcs.negatedSrc1F(this); if (!Srcs.swappedOperands()) { _sub(T, Src0R, Src1F); } else { _rsb(T, Src0R, Src1F); } _mov(Dest, T); return; } } Variable *Src0R = Srcs.src0R(this); if (isVectorType(DestTy)) { Variable *Src1R = legalizeToReg(Src1); _vadd(T, Src0R, Src1R); } else { Operand *Src1RF = Srcs.src1RF(this); _add(T, Src0R, Src1RF); } _mov(Dest, T); return; } case InstArithmetic::And: { if (Srcs.hasConstOperand()) { if (!Srcs.immediateIsFlexEncodable() && Srcs.invertedImmediateIsFlexEncodable()) { Variable *Src0R = Srcs.src0R(this); Operand *Src1F = Srcs.invertedSrc1F(this); _bic(T, Src0R, Src1F); _mov(Dest, T); return; } } assert(isIntegerType(DestTy)); Variable *Src0R = Srcs.src0R(this); if (isVectorType(DestTy)) { Variable *Src1R = legalizeToReg(Src1); _vand(T, Src0R, Src1R); } else { Operand *Src1RF = Srcs.src1RF(this); _and(T, Src0R, Src1RF); } _mov(Dest, T); return; } case InstArithmetic::Or: { Variable *Src0R = Srcs.src0R(this); assert(isIntegerType(DestTy)); if (isVectorType(DestTy)) { Variable *Src1R = legalizeToReg(Src1); _vorr(T, Src0R, Src1R); } else { Operand *Src1RF = 
Srcs.src1RF(this); _orr(T, Src0R, Src1RF); } _mov(Dest, T); return; } case InstArithmetic::Xor: { Variable *Src0R = Srcs.src0R(this); assert(isIntegerType(DestTy)); if (isVectorType(DestTy)) { Variable *Src1R = legalizeToReg(Src1); _veor(T, Src0R, Src1R); } else { Operand *Src1RF = Srcs.src1RF(this); _eor(T, Src0R, Src1RF); } _mov(Dest, T); return; } case InstArithmetic::Sub: { if (const Inst *Src1Producer = Computations.getProducerOf(Src1)) { assert(!isVectorType(DestTy)); Variable *Src0R = legalizeToReg(Src0); Variable *Src1R = legalizeToReg(Src1Producer->getSrc(0)); Variable *Src2R = legalizeToReg(Src1Producer->getSrc(1)); _mls(T, Src1R, Src2R, Src0R); _mov(Dest, T); return; } if (Srcs.hasConstOperand()) { assert(!isVectorType(DestTy)); if (Srcs.immediateIsFlexEncodable()) { Variable *Src0R = Srcs.src0R(this); Operand *Src1RF = Srcs.src1RF(this); if (Srcs.swappedOperands()) { _rsb(T, Src0R, Src1RF); } else { _sub(T, Src0R, Src1RF); } _mov(Dest, T); return; } if (!Srcs.swappedOperands() && Srcs.negatedImmediateIsFlexEncodable()) { Variable *Src0R = Srcs.src0R(this); Operand *Src1F = Srcs.negatedSrc1F(this); _add(T, Src0R, Src1F); _mov(Dest, T); return; } } Variable *Src0R = Srcs.unswappedSrc0R(this); Variable *Src1R = Srcs.unswappedSrc1R(this); if (isVectorType(DestTy)) { _vsub(T, Src0R, Src1R); } else { _sub(T, Src0R, Src1R); } _mov(Dest, T); return; } case InstArithmetic::Mul: { const bool OptM1 = Func->getOptLevel() == Opt_m1; if (!OptM1 && Srcs.hasConstOperand()) { constexpr std::size_t MaxShifts = 4; std::array
Shifts; SizeT NumOperations; int32_t Const = Srcs.getConstantValue(); const bool Invert = Const < 0; const bool MultiplyByZero = Const == 0; Operand *_0 = legalize(Ctx->getConstantZero(DestTy), Legal_Reg | Legal_Flex); if (MultiplyByZero) { _mov(T, _0); _mov(Dest, T); return; } if (Invert) { Const = -Const; } if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) { assert(NumOperations >= 1); Variable *Src0R = Srcs.src0R(this); int32_t Start; int32_t End; if (NumOperations == 1 || Shifts[NumOperations - 1].shAmt() != 0) { // Multiplication by a power of 2 (NumOperations == 1); or // Multiplication by a even number not a power of 2. Start = 1; End = NumOperations; assert(Shifts[0].aggregateWithAdd()); _lsl(T, Src0R, shAmtImm(Shifts[0].shAmt())); } else { // Multiplication by an odd number. Put the free barrel shifter to a // good use. Start = 0; End = NumOperations - 2; const StrengthReduction::AggregationElement &Last = Shifts[NumOperations - 1]; const StrengthReduction::AggregationElement &SecondToLast = Shifts[NumOperations - 2]; if (!Last.aggregateWithAdd()) { assert(SecondToLast.aggregateWithAdd()); _rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R)); } else if (!SecondToLast.aggregateWithAdd()) { assert(Last.aggregateWithAdd()); _sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R)); } else { _add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R)); } } // Odd numbers : S E I I // +---+---+---+---+---+---+ ... +---+---+---+---+ // Shifts = | | | | | | | ... | | | | | // +---+---+---+---+---+---+ ... +---+---+---+---+ // Even numbers: I S E // // S: Start; E: End; I: Init for (int32_t I = Start; I < End; ++I) { const StrengthReduction::AggregationElement &Current = Shifts[I]; Operand *SrcF = Current.createShiftedOperand(Func, Src0R); if (Current.aggregateWithAdd()) { _add(T, T, SrcF); } else { _sub(T, T, SrcF); } } if (Invert) { // T = 0 - T. 
_rsb(T, T, _0); } _mov(Dest, T); return; } } Variable *Src0R = Srcs.unswappedSrc0R(this); Variable *Src1R = Srcs.unswappedSrc1R(this); if (isVectorType(DestTy)) { _vmul(T, Src0R, Src1R); } else { _mul(T, Src0R, Src1R); } _mov(Dest, T); return; } case InstArithmetic::Shl: { Variable *Src0R = Srcs.unswappedSrc0R(this); if (!isVectorType(T->getType())) { if (Srcs.isSrc1ImmediateZero()) { _mov(T, Src0R); } else { Operand *Src1R = Srcs.unswappedSrc1RShAmtImm(this); _lsl(T, Src0R, Src1R); } } else { if (Srcs.hasConstOperand()) { ConstantInteger32 *ShAmt = llvm::cast
(Srcs.src1()); _vshl(T, Src0R, ShAmt); } else { auto *Src1R = Srcs.unswappedSrc1R(this); _vshl(T, Src0R, Src1R)->setSignType(InstARM32::FS_Unsigned); } } _mov(Dest, T); return; } case InstArithmetic::Lshr: { Variable *Src0R = Srcs.unswappedSrc0R(this); if (!isVectorType(T->getType())) { if (DestTy != IceType_i32) { _uxt(Src0R, Src0R); } if (Srcs.isSrc1ImmediateZero()) { _mov(T, Src0R); } else { Operand *Src1R = Srcs.unswappedSrc1RShAmtImm(this); _lsr(T, Src0R, Src1R); } } else { if (Srcs.hasConstOperand()) { ConstantInteger32 *ShAmt = llvm::cast
(Srcs.src1()); _vshr(T, Src0R, ShAmt)->setSignType(InstARM32::FS_Unsigned); } else { auto *Src1R = Srcs.unswappedSrc1R(this); auto *Src1RNeg = makeReg(Src1R->getType()); _vneg(Src1RNeg, Src1R); _vshl(T, Src0R, Src1RNeg)->setSignType(InstARM32::FS_Unsigned); } } _mov(Dest, T); return; } case InstArithmetic::Ashr: { Variable *Src0R = Srcs.unswappedSrc0R(this); if (!isVectorType(T->getType())) { if (DestTy != IceType_i32) { _sxt(Src0R, Src0R); } if (Srcs.isSrc1ImmediateZero()) { _mov(T, Src0R); } else { _asr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this)); } } else { if (Srcs.hasConstOperand()) { ConstantInteger32 *ShAmt = llvm::cast
(Srcs.src1()); _vshr(T, Src0R, ShAmt)->setSignType(InstARM32::FS_Signed); } else { auto *Src1R = Srcs.unswappedSrc1R(this); auto *Src1RNeg = makeReg(Src1R->getType()); _vneg(Src1RNeg, Src1R); _vshl(T, Src0R, Src1RNeg)->setSignType(InstARM32::FS_Signed); } } _mov(Dest, T); return; } case InstArithmetic::Udiv: case InstArithmetic::Sdiv: case InstArithmetic::Urem: case InstArithmetic::Srem: llvm::report_fatal_error( "Integer div/rem should have been handled earlier."); return; case InstArithmetic::Fadd: case InstArithmetic::Fsub: case InstArithmetic::Fmul: case InstArithmetic::Fdiv: case InstArithmetic::Frem: llvm::report_fatal_error( "Floating point arith should have been handled earlier."); return; } } void TargetARM32::lowerAssign(const InstAssign *Instr) { Variable *Dest = Instr->getDest(); if (Dest->isRematerializable()) { Context.insert
(Dest); return; } Operand *Src0 = Instr->getSrc(0); assert(Dest->getType() == Src0->getType()); if (Dest->getType() == IceType_i64) { Src0 = legalizeUndef(Src0); Variable *T_Lo = makeReg(IceType_i32); auto *DestLo = llvm::cast
(loOperand(Dest)); Operand *Src0Lo = legalize(loOperand(Src0), Legal_Reg | Legal_Flex); _mov(T_Lo, Src0Lo); _mov(DestLo, T_Lo); Variable *T_Hi = makeReg(IceType_i32); auto *DestHi = llvm::cast
(hiOperand(Dest)); Operand *Src0Hi = legalize(hiOperand(Src0), Legal_Reg | Legal_Flex); _mov(T_Hi, Src0Hi); _mov(DestHi, T_Hi); return; } Operand *NewSrc; if (Dest->hasReg()) { // If Dest already has a physical register, then legalize the Src operand // into a Variable with the same register assignment. This especially // helps allow the use of Flex operands. NewSrc = legalize(Src0, Legal_Reg | Legal_Flex, Dest->getRegNum()); } else { // Dest could be a stack operand. Since we could potentially need to do a // Store (and store can only have Register operands), legalize this to a // register. NewSrc = legalize(Src0, Legal_Reg); } if (isVectorType(Dest->getType()) || isScalarFloatingType(Dest->getType())) { NewSrc = legalize(NewSrc, Legal_Reg | Legal_Mem); } _mov(Dest, NewSrc); } TargetARM32::ShortCircuitCondAndLabel TargetARM32::lowerInt1ForBranch( Operand *Boolean, const LowerInt1BranchTarget &TargetTrue, const LowerInt1BranchTarget &TargetFalse, uint32_t ShortCircuitable) { InstARM32Label *NewShortCircuitLabel = nullptr; Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex); const Inst *Producer = Computations.getProducerOf(Boolean); if (Producer == nullptr) { // No producer, no problem: just do emit code to perform (Boolean & 1) and // set the flags register. The branch should be taken if the resulting flags // indicate a non-zero result. _tst(legalizeToReg(Boolean), _1); return ShortCircuitCondAndLabel(CondWhenTrue(CondARM32::NE)); } switch (Producer->getKind()) { default: llvm::report_fatal_error("Unexpected producer."); case Inst::Icmp: { return ShortCircuitCondAndLabel( lowerIcmpCond(llvm::cast
(Producer))); } break; case Inst::Fcmp: { return ShortCircuitCondAndLabel( lowerFcmpCond(llvm::cast
(Producer))); } break; case Inst::Cast: { const auto *CastProducer = llvm::cast
(Producer); assert(CastProducer->getCastKind() == InstCast::Trunc); Operand *Src = CastProducer->getSrc(0); if (Src->getType() == IceType_i64) Src = loOperand(Src); _tst(legalizeToReg(Src), _1); return ShortCircuitCondAndLabel(CondWhenTrue(CondARM32::NE)); } break; case Inst::Arithmetic: { const auto *ArithProducer = llvm::cast
(Producer); switch (ArithProducer->getOp()) { default: llvm::report_fatal_error("Unhandled Arithmetic Producer."); case InstArithmetic::And: { if (!(ShortCircuitable & SC_And)) { NewShortCircuitLabel = InstARM32Label::create(Func, this); } LowerInt1BranchTarget NewTarget = TargetFalse.createForLabelOrDuplicate(NewShortCircuitLabel); ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch( Producer->getSrc(0), TargetTrue, NewTarget, SC_And); const CondWhenTrue &Cond = CondAndLabel.Cond; _br_short_circuit(NewTarget, Cond.invert()); InstARM32Label *const ShortCircuitLabel = CondAndLabel.ShortCircuitTarget; if (ShortCircuitLabel != nullptr) Context.insert(ShortCircuitLabel); return ShortCircuitCondAndLabel( lowerInt1ForBranch(Producer->getSrc(1), TargetTrue, NewTarget, SC_All) .assertNoLabelAndReturnCond(), NewShortCircuitLabel); } break; case InstArithmetic::Or: { if (!(ShortCircuitable & SC_Or)) { NewShortCircuitLabel = InstARM32Label::create(Func, this); } LowerInt1BranchTarget NewTarget = TargetTrue.createForLabelOrDuplicate(NewShortCircuitLabel); ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch( Producer->getSrc(0), NewTarget, TargetFalse, SC_Or); const CondWhenTrue &Cond = CondAndLabel.Cond; _br_short_circuit(NewTarget, Cond); InstARM32Label *const ShortCircuitLabel = CondAndLabel.ShortCircuitTarget; if (ShortCircuitLabel != nullptr) Context.insert(ShortCircuitLabel); return ShortCircuitCondAndLabel(lowerInt1ForBranch(Producer->getSrc(1), NewTarget, TargetFalse, SC_All) .assertNoLabelAndReturnCond(), NewShortCircuitLabel); } break; } } } } void TargetARM32::lowerBr(const InstBr *Instr) { if (Instr->isUnconditional()) { _br(Instr->getTargetUnconditional()); return; } CfgNode *TargetTrue = Instr->getTargetTrue(); CfgNode *TargetFalse = Instr->getTargetFalse(); ShortCircuitCondAndLabel CondAndLabel = lowerInt1ForBranch( Instr->getCondition(), LowerInt1BranchTarget(TargetTrue), LowerInt1BranchTarget(TargetFalse), SC_All); 
assert(CondAndLabel.ShortCircuitTarget == nullptr); const CondWhenTrue &Cond = CondAndLabel.Cond; if (Cond.WhenTrue1 != CondARM32::kNone) { assert(Cond.WhenTrue0 != CondARM32::AL); _br(TargetTrue, Cond.WhenTrue1); } switch (Cond.WhenTrue0) { default: _br(TargetTrue, TargetFalse, Cond.WhenTrue0); break; case CondARM32::kNone: _br(TargetFalse); break; case CondARM32::AL: _br(TargetTrue); break; } } void TargetARM32::lowerCall(const InstCall *Instr) { Operand *CallTarget = Instr->getCallTarget(); if (Instr->isTargetHelperCall()) { auto TargetHelperPreamble = ARM32HelpersPreamble.find(CallTarget); if (TargetHelperPreamble != ARM32HelpersPreamble.end()) { (this->*TargetHelperPreamble->second)(Instr); } } MaybeLeafFunc = false; NeedsStackAlignment = true; // Assign arguments to registers and stack. Also reserve stack. TargetARM32::CallingConv CC; // Pair of Arg Operand -> GPR number assignments. llvm::SmallVector
, NumGPRArgs> GPRArgs; llvm::SmallVector
, NumFP32Args> FPArgs; // Pair of Arg Operand -> stack offset. llvm::SmallVector
, 8> StackArgs; size_t ParameterAreaSizeBytes = 0; // Classify each argument operand according to the location where the // argument is passed. for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) { Operand *Arg = legalizeUndef(Instr->getArg(i)); const Type Ty = Arg->getType(); bool InReg = false; RegNumT Reg; if (isScalarIntegerType(Ty)) { InReg = CC.argInGPR(Ty, &Reg); } else { InReg = CC.argInVFP(Ty, &Reg); } if (!InReg) { ParameterAreaSizeBytes = applyStackAlignmentTy(ParameterAreaSizeBytes, Ty); StackArgs.push_back(std::make_pair(Arg, ParameterAreaSizeBytes)); ParameterAreaSizeBytes += typeWidthInBytesOnStack(Ty); continue; } if (Ty == IceType_i64) { Operand *Lo = loOperand(Arg); Operand *Hi = hiOperand(Arg); GPRArgs.push_back(std::make_pair( Lo, RegNumT::fixme(RegARM32::getI64PairFirstGPRNum(Reg)))); GPRArgs.push_back(std::make_pair( Hi, RegNumT::fixme(RegARM32::getI64PairSecondGPRNum(Reg)))); } else if (isScalarIntegerType(Ty)) { GPRArgs.push_back(std::make_pair(Arg, Reg)); } else { FPArgs.push_back(std::make_pair(Arg, Reg)); } } // Adjust the parameter area so that the stack is aligned. It is assumed that // the stack is already aligned at the start of the calling sequence. ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes); if (ParameterAreaSizeBytes > MaxOutArgsSizeBytes) { llvm::report_fatal_error("MaxOutArgsSizeBytes is not really a max."); } // Copy arguments that are passed on the stack to the appropriate stack // locations. Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); for (auto &StackArg : StackArgs) { ConstantInteger32 *Loc = llvm::cast
(Ctx->getConstantInt32(StackArg.second)); Type Ty = StackArg.first->getType(); OperandARM32Mem *Addr; constexpr bool SignExt = false; if (OperandARM32Mem::canHoldOffset(Ty, SignExt, StackArg.second)) { Addr = OperandARM32Mem::create(Func, Ty, SP, Loc); } else { Variable *NewBase = Func->makeVariable(SP->getType()); lowerArithmetic( InstArithmetic::create(Func, InstArithmetic::Add, NewBase, SP, Loc)); Addr = formMemoryOperand(NewBase, Ty); } lowerStore(InstStore::create(Func, StackArg.first, Addr)); } // Generate the call instruction. Assign its result to a temporary with high // register allocation weight. Variable *Dest = Instr->getDest(); // ReturnReg doubles as ReturnRegLo as necessary. Variable *ReturnReg = nullptr; Variable *ReturnRegHi = nullptr; if (Dest) { switch (Dest->getType()) { case IceType_NUM: llvm::report_fatal_error("Invalid Call dest type"); break; case IceType_void: break; case IceType_i1: assert(Computations.getProducerOf(Dest) == nullptr); // Fall-through intended. case IceType_i8: case IceType_i16: case IceType_i32: ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_r0); break; case IceType_i64: ReturnReg = makeReg(IceType_i32, RegARM32::Reg_r0); ReturnRegHi = makeReg(IceType_i32, RegARM32::Reg_r1); break; case IceType_f32: ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_s0); break; case IceType_f64: ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_d0); break; case IceType_v4i1: case IceType_v8i1: case IceType_v16i1: case IceType_v16i8: case IceType_v8i16: case IceType_v4i32: case IceType_v4f32: ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_q0); break; } } // Allow ConstantRelocatable to be left alone as a direct call, but force // other constants like ConstantInteger32 to be in a register and make it an // indirect call. if (!llvm::isa
(CallTarget)) { CallTarget = legalize(CallTarget, Legal_Reg); } // Copy arguments to be passed in registers to the appropriate registers. CfgVector
RegArgs; for (auto &FPArg : FPArgs) { RegArgs.emplace_back(legalizeToReg(FPArg.first, FPArg.second)); } for (auto &GPRArg : GPRArgs) { RegArgs.emplace_back(legalizeToReg(GPRArg.first, GPRArg.second)); } // Generate a FakeUse of register arguments so that they do not get dead code // eliminated as a result of the FakeKill of scratch registers after the call. // These fake-uses need to be placed here to avoid argument registers from // being used during the legalizeToReg() calls above. for (auto *RegArg : RegArgs) { Context.insert
(RegArg); } InstARM32Call *NewCall = Sandboxer(this, InstBundleLock::Opt_AlignToEnd).bl(ReturnReg, CallTarget); if (ReturnRegHi) Context.insert
(ReturnRegHi); // Insert a register-kill pseudo instruction. Context.insert
(NewCall); // Generate a FakeUse to keep the call live if necessary. if (Instr->hasSideEffects() && ReturnReg) { Context.insert
(ReturnReg); } if (Dest != nullptr) { // Assign the result of the call to Dest. if (ReturnReg != nullptr) { if (ReturnRegHi) { auto *Dest64On32 = llvm::cast
(Dest); Variable *DestLo = Dest64On32->getLo(); Variable *DestHi = Dest64On32->getHi(); _mov(DestLo, ReturnReg); _mov(DestHi, ReturnRegHi); } else { if (isFloatingType(Dest->getType()) || isVectorType(Dest->getType())) { _mov(Dest, ReturnReg); } else { assert(isIntegerType(Dest->getType()) && typeWidthInBytes(Dest->getType()) <= 4); _mov(Dest, ReturnReg); } } } } if (Instr->isTargetHelperCall()) { auto TargetHelpersPostamble = ARM32HelpersPostamble.find(CallTarget); if (TargetHelpersPostamble != ARM32HelpersPostamble.end()) { (this->*TargetHelpersPostamble->second)(Instr); } } } namespace { void configureBitcastTemporary(Variable64On32 *Var) { Var->setMustNotHaveReg(); Var->getHi()->setMustHaveReg(); Var->getLo()->setMustHaveReg(); } } // end of anonymous namespace void TargetARM32::lowerCast(const InstCast *Instr) { InstCast::OpKind CastKind = Instr->getCastKind(); Variable *Dest = Instr->getDest(); const Type DestTy = Dest->getType(); Operand *Src0 = legalizeUndef(Instr->getSrc(0)); switch (CastKind) { default: Func->setError("Cast type not supported"); return; case InstCast::Sext: { if (isVectorType(DestTy)) { Variable *T0 = makeReg(DestTy); Variable *T1 = makeReg(DestTy); ConstantInteger32 *ShAmt = nullptr; switch (DestTy) { default: llvm::report_fatal_error("Unexpected type in vector sext."); case IceType_v16i8: ShAmt = llvm::cast
(Ctx->getConstantInt32(7)); break; case IceType_v8i16: ShAmt = llvm::cast
(Ctx->getConstantInt32(15)); break; case IceType_v4i32: ShAmt = llvm::cast
(Ctx->getConstantInt32(31)); break; } auto *Src0R = legalizeToReg(Src0); _vshl(T0, Src0R, ShAmt); _vshr(T1, T0, ShAmt)->setSignType(InstARM32::FS_Signed); _mov(Dest, T1); } else if (DestTy == IceType_i64) { // t1=sxtb src; t2= mov t1 asr #31; dst.lo=t1; dst.hi=t2 Constant *ShiftAmt = Ctx->getConstantInt32(31); auto *DestLo = llvm::cast
(loOperand(Dest)); auto *DestHi = llvm::cast
(hiOperand(Dest)); Variable *T_Lo = makeReg(DestLo->getType()); if (Src0->getType() == IceType_i32) { Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex); _mov(T_Lo, Src0RF); } else if (Src0->getType() != IceType_i1) { Variable *Src0R = legalizeToReg(Src0); _sxt(T_Lo, Src0R); } else { Operand *_0 = Ctx->getConstantZero(IceType_i32); Operand *_m1 = Ctx->getConstantInt32(-1); lowerInt1ForSelect(T_Lo, Src0, _m1, _0); } _mov(DestLo, T_Lo); Variable *T_Hi = makeReg(DestHi->getType()); if (Src0->getType() != IceType_i1) { _mov(T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, T_Lo, OperandARM32::ASR, ShiftAmt)); } else { // For i1, the asr instruction is already done above. _mov(T_Hi, T_Lo); } _mov(DestHi, T_Hi); } else if (Src0->getType() != IceType_i1) { // t1 = sxt src; dst = t1 Variable *Src0R = legalizeToReg(Src0); Variable *T = makeReg(DestTy); _sxt(T, Src0R); _mov(Dest, T); } else { Constant *_0 = Ctx->getConstantZero(IceType_i32); Operand *_m1 = Ctx->getConstantInt(DestTy, -1); Variable *T = makeReg(DestTy); lowerInt1ForSelect(T, Src0, _m1, _0); _mov(Dest, T); } break; } case InstCast::Zext: { if (isVectorType(DestTy)) { auto *Mask = makeReg(DestTy); auto *_1 = Ctx->getConstantInt32(1); auto *T = makeReg(DestTy); auto *Src0R = legalizeToReg(Src0); _mov(Mask, _1); _vand(T, Src0R, Mask); _mov(Dest, T); } else if (DestTy == IceType_i64) { // t1=uxtb src; dst.lo=t1; dst.hi=0 Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); auto *DestLo = llvm::cast
(loOperand(Dest)); auto *DestHi = llvm::cast
(hiOperand(Dest)); Variable *T_Lo = makeReg(DestLo->getType()); switch (Src0->getType()) { default: { assert(Src0->getType() != IceType_i64); _uxt(T_Lo, legalizeToReg(Src0)); } break; case IceType_i32: { _mov(T_Lo, legalize(Src0, Legal_Reg | Legal_Flex)); } break; case IceType_i1: { SafeBoolChain Safe = lowerInt1(T_Lo, Src0); if (Safe == SBC_No) { Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex); _and(T_Lo, T_Lo, _1); } } break; } _mov(DestLo, T_Lo); Variable *T_Hi = makeReg(DestLo->getType()); _mov(T_Hi, _0); _mov(DestHi, T_Hi); } else if (Src0->getType() == IceType_i1) { Variable *T = makeReg(DestTy); SafeBoolChain Safe = lowerInt1(T, Src0); if (Safe == SBC_No) { Operand *_1 = legalize(Ctx->getConstantInt1(1), Legal_Reg | Legal_Flex); _and(T, T, _1); } _mov(Dest, T); } else { // t1 = uxt src; dst = t1 Variable *Src0R = legalizeToReg(Src0); Variable *T = makeReg(DestTy); _uxt(T, Src0R); _mov(Dest, T); } break; } case InstCast::Trunc: { if (isVectorType(DestTy)) { auto *T = makeReg(DestTy); auto *Src0R = legalizeToReg(Src0); _mov(T, Src0R); _mov(Dest, T); } else { if (Src0->getType() == IceType_i64) Src0 = loOperand(Src0); Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex); // t1 = trunc Src0RF; Dest = t1 Variable *T = makeReg(DestTy); _mov(T, Src0RF); if (DestTy == IceType_i1) _and(T, T, Ctx->getConstantInt1(1)); _mov(Dest, T); } break; } case InstCast::Fptrunc: case InstCast::Fpext: { // fptrunc: dest.f32 = fptrunc src0.fp64 // fpext: dest.f64 = fptrunc src0.fp32 const bool IsTrunc = CastKind == InstCast::Fptrunc; assert(!isVectorType(DestTy)); assert(DestTy == (IsTrunc ? IceType_f32 : IceType_f64)); assert(Src0->getType() == (IsTrunc ? IceType_f64 : IceType_f32)); Variable *Src0R = legalizeToReg(Src0); Variable *T = makeReg(DestTy); _vcvt(T, Src0R, IsTrunc ? 
InstARM32Vcvt::D2s : InstARM32Vcvt::S2d); _mov(Dest, T); break; } case InstCast::Fptosi: case InstCast::Fptoui: { const bool DestIsSigned = CastKind == InstCast::Fptosi; Variable *Src0R = legalizeToReg(Src0); if (isVectorType(DestTy)) { assert(typeElementType(Src0->getType()) == IceType_f32); auto *T = makeReg(DestTy); _vcvt(T, Src0R, DestIsSigned ? InstARM32Vcvt::Vs2si : InstARM32Vcvt::Vs2ui); _mov(Dest, T); break; } const bool Src0IsF32 = isFloat32Asserting32Or64(Src0->getType()); if (llvm::isa
(Dest)) { llvm::report_fatal_error("fp-to-i64 should have been pre-lowered."); } // fptosi: // t1.fp = vcvt src0.fp // t2.i32 = vmov t1.fp // dest.int = conv t2.i32 @ Truncates the result if needed. // fptoui: // t1.fp = vcvt src0.fp // t2.u32 = vmov t1.fp // dest.uint = conv t2.u32 @ Truncates the result if needed. Variable *T_fp = makeReg(IceType_f32); const InstARM32Vcvt::VcvtVariant Conversion = Src0IsF32 ? (DestIsSigned ? InstARM32Vcvt::S2si : InstARM32Vcvt::S2ui) : (DestIsSigned ? InstARM32Vcvt::D2si : InstARM32Vcvt::D2ui); _vcvt(T_fp, Src0R, Conversion); Variable *T = makeReg(IceType_i32); _mov(T, T_fp); if (DestTy != IceType_i32) { Variable *T_1 = makeReg(DestTy); lowerCast(InstCast::create(Func, InstCast::Trunc, T_1, T)); T = T_1; } _mov(Dest, T); break; } case InstCast::Sitofp: case InstCast::Uitofp: { const bool SourceIsSigned = CastKind == InstCast::Sitofp; if (isVectorType(DestTy)) { assert(typeElementType(DestTy) == IceType_f32); auto *T = makeReg(DestTy); Variable *Src0R = legalizeToReg(Src0); _vcvt(T, Src0R, SourceIsSigned ? InstARM32Vcvt::Vsi2s : InstARM32Vcvt::Vui2s); _mov(Dest, T); break; } const bool DestIsF32 = isFloat32Asserting32Or64(DestTy); if (Src0->getType() == IceType_i64) { llvm::report_fatal_error("i64-to-fp should have been pre-lowered."); } // sitofp: // t1.i32 = sext src.int @ sign-extends src0 if needed. // t2.fp32 = vmov t1.i32 // t3.fp = vcvt.{fp}.s32 @ fp is either f32 or f64 // uitofp: // t1.i32 = zext src.int @ zero-extends src0 if needed. // t2.fp32 = vmov t1.i32 // t3.fp = vcvt.{fp}.s32 @ fp is either f32 or f64 if (Src0->getType() != IceType_i32) { Variable *Src0R_32 = makeReg(IceType_i32); lowerCast(InstCast::create(Func, SourceIsSigned ? InstCast::Sext : InstCast::Zext, Src0R_32, Src0)); Src0 = Src0R_32; } Variable *Src0R = legalizeToReg(Src0); Variable *Src0R_f32 = makeReg(IceType_f32); _mov(Src0R_f32, Src0R); Src0R = Src0R_f32; Variable *T = makeReg(DestTy); const InstARM32Vcvt::VcvtVariant Conversion = DestIsF32 ? 
(SourceIsSigned ? InstARM32Vcvt::Si2s : InstARM32Vcvt::Ui2s) : (SourceIsSigned ? InstARM32Vcvt::Si2d : InstARM32Vcvt::Ui2d); _vcvt(T, Src0R, Conversion); _mov(Dest, T); break; } case InstCast::Bitcast: { Operand *Src0 = Instr->getSrc(0); if (DestTy == Src0->getType()) { auto *Assign = InstAssign::create(Func, Dest, Src0); lowerAssign(Assign); return; } switch (DestTy) { case IceType_NUM: case IceType_void: llvm::report_fatal_error("Unexpected bitcast."); case IceType_i1: UnimplementedLoweringError(this, Instr); break; case IceType_i8: assert(Src0->getType() == IceType_v8i1); llvm::report_fatal_error( "i8 to v8i1 conversion should have been prelowered."); break; case IceType_i16: assert(Src0->getType() == IceType_v16i1); llvm::report_fatal_error( "i16 to v16i1 conversion should have been prelowered."); break; case IceType_i32: case IceType_f32: { Variable *Src0R = legalizeToReg(Src0); Variable *T = makeReg(DestTy); _mov(T, Src0R); lowerAssign(InstAssign::create(Func, Dest, T)); break; } case IceType_i64: { // t0, t1 <- src0 // dest[31..0] = t0 // dest[63..32] = t1 assert(Src0->getType() == IceType_f64); auto *T = llvm::cast
(Func->makeVariable(IceType_i64)); T->initHiLo(Func); configureBitcastTemporary(T); Variable *Src0R = legalizeToReg(Src0); _mov(T, Src0R); Context.insert
(T->getHi()); Context.insert
(T->getLo()); lowerAssign(InstAssign::create(Func, Dest, T)); break; } case IceType_f64: { // T0 <- lo(src) // T1 <- hi(src) // vmov T2, T0, T1 // Dest <- T2 assert(Src0->getType() == IceType_i64); Variable *T = makeReg(DestTy); auto *Src64 = llvm::cast
(Func->makeVariable(IceType_i64)); Src64->initHiLo(Func); configureBitcastTemporary(Src64); lowerAssign(InstAssign::create(Func, Src64, Src0)); _mov(T, Src64); lowerAssign(InstAssign::create(Func, Dest, T)); break; } case IceType_v8i1: assert(Src0->getType() == IceType_i8); llvm::report_fatal_error( "v8i1 to i8 conversion should have been prelowered."); break; case IceType_v16i1: assert(Src0->getType() == IceType_i16); llvm::report_fatal_error( "v16i1 to i16 conversion should have been prelowered."); break; case IceType_v4i1: case IceType_v8i16: case IceType_v16i8: case IceType_v4f32: case IceType_v4i32: { assert(typeWidthInBytes(DestTy) == typeWidthInBytes(Src0->getType())); assert(isVectorType(DestTy) == isVectorType(Src0->getType())); Variable *T = makeReg(DestTy); _mov(T, Src0); _mov(Dest, T); break; } } break; } } } void TargetARM32::lowerExtractElement(const InstExtractElement *Instr) { Variable *Dest = Instr->getDest(); Type DestTy = Dest->getType(); Variable *Src0 = legalizeToReg(Instr->getSrc(0)); Operand *Src1 = Instr->getSrc(1); if (const auto *Imm = llvm::dyn_cast
(Src1)) { const uint32_t Index = Imm->getValue(); Variable *T = makeReg(DestTy); Variable *TSrc0 = makeReg(Src0->getType()); if (isFloatingType(DestTy)) { // We need to make sure the source is in a suitable register. TSrc0->setRegClass(RegARM32::RCARM32_QtoS); } _mov(TSrc0, Src0); _extractelement(T, TSrc0, Index); _mov(Dest, T); return; } assert(false && "extractelement requires a constant index"); } namespace { // Validates FCMPARM32_TABLE's declaration w.r.t. InstFcmp::FCondition ordering // (and naming). enum { #define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V) _fcmp_ll_##val, FCMPARM32_TABLE #undef X _fcmp_ll_NUM }; enum { #define X(tag, str) _fcmp_hl_##tag = InstFcmp::tag, ICEINSTFCMP_TABLE #undef X _fcmp_hl_NUM }; static_assert((uint32_t)_fcmp_hl_NUM == (uint32_t)_fcmp_ll_NUM, "Inconsistency between high-level and low-level fcmp tags."); #define X(tag, str) \ static_assert( \ (uint32_t)_fcmp_hl_##tag == (uint32_t)_fcmp_ll_##tag, \ "Inconsistency between high-level and low-level fcmp tag " #tag); ICEINSTFCMP_TABLE #undef X struct { CondARM32::Cond CC0; CondARM32::Cond CC1; } TableFcmp[] = { #define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V) \ { CondARM32::CC0, CondARM32::CC1 } \ , FCMPARM32_TABLE #undef X }; bool isFloatingPointZero(const Operand *Src) { if (const auto *F32 = llvm::dyn_cast
(Src)) { return Utils::isPositiveZero(F32->getValue()); } if (const auto *F64 = llvm::dyn_cast
(Src)) { return Utils::isPositiveZero(F64->getValue()); } return false; } } // end of anonymous namespace TargetARM32::CondWhenTrue TargetARM32::lowerFcmpCond(const InstFcmp *Instr) { InstFcmp::FCond Condition = Instr->getCondition(); switch (Condition) { case InstFcmp::False: return CondWhenTrue(CondARM32::kNone); case InstFcmp::True: return CondWhenTrue(CondARM32::AL); break; default: { Variable *Src0R = legalizeToReg(Instr->getSrc(0)); Operand *Src1 = Instr->getSrc(1); if (isFloatingPointZero(Src1)) { _vcmp(Src0R, OperandARM32FlexFpZero::create(Func, Src0R->getType())); } else { _vcmp(Src0R, legalizeToReg(Src1)); } _vmrs(); assert(Condition < llvm::array_lengthof(TableFcmp)); return CondWhenTrue(TableFcmp[Condition].CC0, TableFcmp[Condition].CC1); } } } void TargetARM32::lowerFcmp(const InstFcmp *Instr) { Variable *Dest = Instr->getDest(); const Type DestTy = Dest->getType(); if (isVectorType(DestTy)) { if (Instr->getCondition() == InstFcmp::False) { constexpr Type SafeTypeForMovingConstant = IceType_v4i32; auto *T = makeReg(SafeTypeForMovingConstant); _mov(T, llvm::cast
(Ctx->getConstantInt32(0))); _mov(Dest, T); return; } if (Instr->getCondition() == InstFcmp::True) { constexpr Type SafeTypeForMovingConstant = IceType_v4i32; auto *T = makeReg(SafeTypeForMovingConstant); _mov(T, llvm::cast
(Ctx->getConstantInt32(1))); _mov(Dest, T); return; } Variable *T0; Variable *T1; bool Negate = false; auto *Src0 = legalizeToReg(Instr->getSrc(0)); auto *Src1 = legalizeToReg(Instr->getSrc(1)); switch (Instr->getCondition()) { default: llvm::report_fatal_error("Unhandled fp comparison."); #define _Vcnone(Tptr, S0, S1) \ do { \ *(Tptr) = nullptr; \ } while (0) #define _Vceq(Tptr, S0, S1) \ do { \ *(Tptr) = makeReg(DestTy); \ _vceq(*(Tptr), S0, S1); \ } while (0) #define _Vcge(Tptr, S0, S1) \ do { \ *(Tptr) = makeReg(DestTy); \ _vcge(*(Tptr), S0, S1)->setSignType(InstARM32::FS_Signed); \ } while (0) #define _Vcgt(Tptr, S0, S1) \ do { \ *(Tptr) = makeReg(DestTy); \ _vcgt(*(Tptr), S0, S1)->setSignType(InstARM32::FS_Signed); \ } while (0) #define X(val, CC0, CC1, CC0_V, CC1_V, INV_V, NEG_V) \ case InstFcmp::val: { \ _Vc##CC0_V(&T0, (INV_V) ? Src1 : Src0, (INV_V) ? Src0 : Src1); \ _Vc##CC1_V(&T1, (INV_V) ? Src0 : Src1, (INV_V) ? Src1 : Src0); \ Negate = NEG_V; \ } break; FCMPARM32_TABLE #undef X #undef _Vcgt #undef _Vcge #undef _Vceq #undef _Vcnone } assert(T0 != nullptr); Variable *T = T0; if (T1 != nullptr) { T = makeReg(DestTy); _vorr(T, T0, T1); } if (Negate) { auto *TNeg = makeReg(DestTy); _vmvn(TNeg, T); T = TNeg; } _mov(Dest, T); return; } Variable *T = makeReg(IceType_i1); Operand *_1 = legalize(Ctx->getConstantInt32(1), Legal_Reg | Legal_Flex); Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); CondWhenTrue Cond = lowerFcmpCond(Instr); bool RedefineT = false; if (Cond.WhenTrue0 != CondARM32::AL) { _mov(T, _0); RedefineT = true; } if (Cond.WhenTrue0 == CondARM32::kNone) { _mov(Dest, T); return; } if (RedefineT) { _mov_redefined(T, _1, Cond.WhenTrue0); } else { _mov(T, _1, Cond.WhenTrue0); } if (Cond.WhenTrue1 != CondARM32::kNone) { _mov_redefined(T, _1, Cond.WhenTrue1); } _mov(Dest, T); } TargetARM32::CondWhenTrue TargetARM32::lowerInt64IcmpCond(InstIcmp::ICond Condition, Operand *Src0, Operand *Src1) { assert(Condition < 
llvm::array_lengthof(TableIcmp64)); Int32Operands SrcsLo(loOperand(Src0), loOperand(Src1)); Int32Operands SrcsHi(hiOperand(Src0), hiOperand(Src1)); assert(SrcsLo.hasConstOperand() == SrcsHi.hasConstOperand()); assert(SrcsLo.swappedOperands() == SrcsHi.swappedOperands()); if (SrcsLo.hasConstOperand()) { const uint32_t ValueLo = SrcsLo.getConstantValue(); const uint32_t ValueHi = SrcsHi.getConstantValue(); const uint64_t Value = (static_cast
(ValueHi) << 32) | ValueLo; if ((Condition == InstIcmp::Eq || Condition == InstIcmp::Ne) && Value == 0) { Variable *T = makeReg(IceType_i32); Variable *Src0LoR = SrcsLo.src0R(this); Variable *Src0HiR = SrcsHi.src0R(this); _orrs(T, Src0LoR, Src0HiR); Context.insert
(T); return CondWhenTrue(TableIcmp64[Condition].C1); } Variable *Src0RLo = SrcsLo.src0R(this); Variable *Src0RHi = SrcsHi.src0R(this); Operand *Src1RFLo = SrcsLo.src1RF(this); Operand *Src1RFHi = ValueLo == ValueHi ? Src1RFLo : SrcsHi.src1RF(this); const bool UseRsb = TableIcmp64[Condition].Swapped != SrcsLo.swappedOperands(); if (UseRsb) { if (TableIcmp64[Condition].IsSigned) { Variable *T = makeReg(IceType_i32); _rsbs(T, Src0RLo, Src1RFLo); Context.insert
(T); T = makeReg(IceType_i32); _rscs(T, Src0RHi, Src1RFHi); // We need to add a FakeUse here because liveness gets mad at us (Def // without Use.) Note that flag-setting instructions are considered to // have side effects and, therefore, are not DCE'ed. Context.insert
(T); } else { Variable *T = makeReg(IceType_i32); _rsbs(T, Src0RHi, Src1RFHi); Context.insert
(T); T = makeReg(IceType_i32); _rsbs(T, Src0RLo, Src1RFLo, CondARM32::EQ); Context.insert
(T); } } else { if (TableIcmp64[Condition].IsSigned) { _cmp(Src0RLo, Src1RFLo); Variable *T = makeReg(IceType_i32); _sbcs(T, Src0RHi, Src1RFHi); Context.insert
(T); } else { _cmp(Src0RHi, Src1RFHi); _cmp(Src0RLo, Src1RFLo, CondARM32::EQ); } } return CondWhenTrue(TableIcmp64[Condition].C1); } Variable *Src0RLo, *Src0RHi; Operand *Src1RFLo, *Src1RFHi; if (TableIcmp64[Condition].Swapped) { Src0RLo = legalizeToReg(loOperand(Src1)); Src0RHi = legalizeToReg(hiOperand(Src1)); Src1RFLo = legalizeToReg(loOperand(Src0)); Src1RFHi = legalizeToReg(hiOperand(Src0)); } else { Src0RLo = legalizeToReg(loOperand(Src0)); Src0RHi = legalizeToReg(hiOperand(Src0)); Src1RFLo = legalizeToReg(loOperand(Src1)); Src1RFHi = legalizeToReg(hiOperand(Src1)); } // a=icmp cond, b, c ==> // GCC does: // cmp b.hi, c.hi or cmp b.lo, c.lo // cmp.eq b.lo, c.lo sbcs t1, b.hi, c.hi // mov.
<C1> t, #1              mov.<C1> t, #1
  //   mov.<C2> t, #0              mov.<C2> t, #0
  //   mov      a, t               mov      a, t
  // where the "cmp.eq b.lo, c.lo" is used for unsigned and "sbcs t1, hi, hi"
  // is used for signed compares. In some cases, b and c need to be swapped as
  // well.
  //
  // LLVM does:
  // for EQ and NE:
  //   eor  t1, b.hi, c.hi
  //   eor  t2, b.lo, c.lo
  //   orrs t, t1, t2
  //   mov.<C>
t, #1 // mov a, t // // that's nice in that it's just as short but has fewer dependencies for // better ILP at the cost of more registers. // // Otherwise for signed/unsigned <, <=, etc. LLVM uses a sequence with two // unconditional mov #0, two cmps, two conditional mov #1, and one // conditional reg mov. That has few dependencies for good ILP, but is a // longer sequence. // // So, we are going with the GCC version since it's usually better (except // perhaps for eq/ne). We could revisit special-casing eq/ne later. if (TableIcmp64[Condition].IsSigned) { Variable *ScratchReg = makeReg(IceType_i32); _cmp(Src0RLo, Src1RFLo); _sbcs(ScratchReg, Src0RHi, Src1RFHi); // ScratchReg isn't going to be used, but we need the side-effect of // setting flags from this operation. Context.insert
(ScratchReg);
  } else {
    _cmp(Src0RHi, Src1RFHi);
    _cmp(Src0RLo, Src1RFLo, CondARM32::EQ);
  }

  return CondWhenTrue(TableIcmp64[Condition].C1);
}

/// Emits the flag-setting sequence for a 32-bit integer comparison and returns
/// the ARM condition under which \p Condition holds.
///
/// Strategy, in order:
///  - no constant operand: plain cmp;
///  - eq/ne against 0: tst Src0, Src0;
///  - constant not flex-encodable but its negation is (operands not swapped):
///    cmn with the negated immediate;
///  - otherwise cmp, or rsbs into a dead temporary when Int32Operands swapped
///    the operands.
TargetARM32::CondWhenTrue
TargetARM32::lowerInt32IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
                                Operand *Src1) {
  Int32Operands Srcs(Src0, Src1);
  const CondWhenTrue Mapped(getIcmp32Mapping(Condition));

  if (!Srcs.hasConstOperand()) {
    Variable *Src0R = Srcs.src0R(this);
    Operand *Src1RF = Srcs.src1RF(this);
    _cmp(Src0R, Src1RF);
    return Mapped;
  }

  Variable *Src0R = Srcs.src0R(this);
  const int32_t Value = Srcs.getConstantValue();
  const bool IsEquality =
      Condition == InstIcmp::Eq || Condition == InstIcmp::Ne;
  if (IsEquality && Value == 0) {
    _tst(Src0R, Src0R);
    return Mapped;
  }

  if (!Srcs.swappedOperands() && !Srcs.immediateIsFlexEncodable() &&
      Srcs.negatedImmediateIsFlexEncodable()) {
    Operand *Src1F = Srcs.negatedSrc1F(this);
    _cmn(Src0R, Src1F);
    return Mapped;
  }

  Operand *Src1RF = Srcs.src1RF(this);
  if (Srcs.swappedOperands()) {
    // Reverse-subtract restores the original operand order. The result is
    // only needed for its flags, so keep it alive with a fake use.
    Variable *T = makeReg(IceType_i32);
    _rsbs(T, Src0R, Src1RF);
    Context.insert<InstFakeUse>(T);
  } else {
    _cmp(Src0R, Src1RF);
  }

  return Mapped;
}

/// Emits the flag-setting sequence for an i1/i8/i16 comparison and returns the
/// ARM condition under which \p Condition holds.
///
/// Both operands are shifted left by (32 - bit width) so that only the
/// meaningful bits take part in the 32-bit comparison.
TargetARM32::CondWhenTrue
TargetARM32::lowerInt8AndInt16IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
                                       Operand *Src1) {
  Int32Operands Srcs(Src0, Src1);
  const int32_t ShAmt = 32 - getScalarIntBitWidth(Src0->getType());
  assert(ShAmt >= 0);

  if (!Srcs.hasConstOperand()) {
    Variable *Src0R = makeReg(IceType_i32);
    Operand *ShAmtImm = shAmtImm(ShAmt);
    _lsl(Src0R, legalizeToReg(Src0), ShAmtImm);

    Variable *Src1R = legalizeToReg(Src1);
    auto *Src1F = OperandARM32FlexReg::create(Func, IceType_i32, Src1R,
                                              OperandARM32::LSL, ShAmtImm);
    _cmp(Src0R, Src1F);
    return CondWhenTrue(getIcmp32Mapping(Condition));
  }

  const int32_t Value = Srcs.getConstantValue();
  if ((Condition == InstIcmp::Eq || Condition == InstIcmp::Ne) && Value == 0) {
    Operand *ShAmtImm = shAmtImm(ShAmt);
    Variable *T = makeReg(IceType_i32);
    _lsls(T, Srcs.src0R(this), ShAmtImm);
    Context.insert<InstFakeUse>(T);
    return CondWhenTrue(getIcmp32Mapping(Condition));
  }

  // Shift in the unsigned domain: left-shifting a negative int32_t, or
  // shifting a set bit into/through the sign bit, is undefined behavior.
  const auto ShiftedValue =
      static_cast<int32_t>(static_cast<uint32_t>(Value) << ShAmt);

  Variable *ConstR = makeReg(IceType_i32);
  _mov(ConstR,
       legalize(Ctx->getConstantInt32(ShiftedValue), Legal_Reg | Legal_Flex));
  Operand *NonConstF = OperandARM32FlexReg::create(
      Func, IceType_i32, Srcs.src0R(this), OperandARM32::LSL,
      Ctx->getConstantInt32(ShAmt));

  if (Srcs.swappedOperands()) {
    _cmp(ConstR, NonConstF);
  } else {
    // The constant is the second source here; rsbs computes
    // NonConstF - ConstR, i.e. the original operand order.
    Variable *T = makeReg(IceType_i32);
    _rsbs(T, ConstR, NonConstF);
    Context.insert<InstFakeUse>(T);
  }
  return CondWhenTrue(getIcmp32Mapping(Condition));
}

/// Convenience overload: unpacks the condition and both sources of \p Instr
/// and forwards to lowerIcmpCond(Condition, Src0, Src1).
TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(const InstIcmp *Instr) {
  return lowerIcmpCond(Instr->getCondition(), Instr->getSrc(0),
                       Instr->getSrc(1));
}

TargetARM32::CondWhenTrue TargetARM32::lowerIcmpCond(InstIcmp::ICond Condition,
                                                     Operand *Src0,
                                                     Operand *Src1) {
  Src0 = legalizeUndef(Src0);
  Src1 = legalizeUndef(Src1);

  // a=icmp cond b, c ==>
  // GCC does:
  //
  //   <u/s>xtb tb, b
  //   <u/s>xtb tc, c
  //   cmp      tb, tc
  //   mov.C1   t, #0
  //   mov.C2   t, #1
  //   mov      a, t
  // where the unsigned/sign extension is not needed for 32-bit. They also have
  // special cases for EQ and NE. E.g., for NE:
  //   <no cmp>
  //   subs     t, tb, tc
  //   movne    t, #1
  //   mov      a, t
  //
  // LLVM does:
  //   lsl      tb, b, #<N>
  //   mov      t, #0
  //   cmp      tb, c, lsl #<N>
  //   mov.<C>
t, #1 // mov a, t // // the left shift is by 0, 16, or 24, which allows the comparison to focus on // the digits that actually matter (for 16-bit or 8-bit signed/unsigned). For // the unsigned case, for some reason it does similar to GCC and does a uxtb // first. It's not clear to me why that special-casing is needed. // // We'll go with the LLVM way for now, since it's shorter and has just as few // dependencies. switch (Src0->getType()) { default: llvm::report_fatal_error("Unhandled type in lowerIcmpCond"); case IceType_i1: case IceType_i8: case IceType_i16: return lowerInt8AndInt16IcmpCond(Condition, Src0, Src1); case IceType_i32: return lowerInt32IcmpCond(Condition, Src0, Src1); case IceType_i64: return lowerInt64IcmpCond(Condition, Src0, Src1); } } void TargetARM32::lowerIcmp(const InstIcmp *Instr) { Variable *Dest = Instr->getDest(); const Type DestTy = Dest->getType(); if (isVectorType(DestTy)) { auto *T = makeReg(DestTy); auto *Src0 = legalizeToReg(Instr->getSrc(0)); auto *Src1 = legalizeToReg(Instr->getSrc(1)); const Type SrcTy = Src0->getType(); bool NeedsShl = false; Type NewTypeAfterShl; SizeT ShAmt; switch (SrcTy) { default: break; case IceType_v16i1: NeedsShl = true; NewTypeAfterShl = IceType_v16i8; ShAmt = 7; break; case IceType_v8i1: NeedsShl = true; NewTypeAfterShl = IceType_v8i16; ShAmt = 15; break; case IceType_v4i1: NeedsShl = true; NewTypeAfterShl = IceType_v4i32; ShAmt = 31; break; } if (NeedsShl) { auto *Imm = llvm::cast
(Ctx->getConstantInt32(ShAmt)); auto *Src0T = makeReg(NewTypeAfterShl); auto *Src0Shl = makeReg(NewTypeAfterShl); _mov(Src0T, Src0); _vshl(Src0Shl, Src0T, Imm); Src0 = Src0Shl; auto *Src1T = makeReg(NewTypeAfterShl); auto *Src1Shl = makeReg(NewTypeAfterShl); _mov(Src1T, Src1); _vshl(Src1Shl, Src1T, Imm); Src1 = Src1Shl; } switch (Instr->getCondition()) { default: llvm::report_fatal_error("Unhandled integer comparison."); #define _Vceq(T, S0, S1, Signed) _vceq(T, S0, S1) #define _Vcge(T, S0, S1, Signed) \ _vcge(T, S0, S1) \ ->setSignType(Signed ? InstARM32::FS_Signed : InstARM32::FS_Unsigned) #define _Vcgt(T, S0, S1, Signed) \ _vcgt(T, S0, S1) \ ->setSignType(Signed ? InstARM32::FS_Signed : InstARM32::FS_Unsigned) #define X(val, is_signed, swapped64, C_32, C1_64, C2_64, C_V, INV_V, NEG_V) \ case InstIcmp::val: { \ _Vc##C_V(T, (INV_V) ? Src1 : Src0, (INV_V) ? Src0 : Src1, is_signed); \ if (NEG_V) { \ auto *TInv = makeReg(DestTy); \ _vmvn(TInv, T); \ T = TInv; \ } \ } break; ICMPARM32_TABLE #undef X #undef _Vcgt #undef _Vcge #undef _Vceq } _mov(Dest, T); return; } Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); Operand *_1 = legalize(Ctx->getConstantInt32(1), Legal_Reg | Legal_Flex); Variable *T = makeReg(IceType_i1); _mov(T, _0); CondWhenTrue Cond = lowerIcmpCond(Instr); _mov_redefined(T, _1, Cond.WhenTrue0); _mov(Dest, T); assert(Cond.WhenTrue1 == CondARM32::kNone); return; } void TargetARM32::lowerInsertElement(const InstInsertElement *Instr) { Variable *Dest = Instr->getDest(); Type DestTy = Dest->getType(); Variable *Src0 = legalizeToReg(Instr->getSrc(0)); Variable *Src1 = legalizeToReg(Instr->getSrc(1)); Operand *Src2 = Instr->getSrc(2); if (const auto *Imm = llvm::dyn_cast
(Src2)) { const uint32_t Index = Imm->getValue(); Variable *T = makeReg(DestTy); if (isFloatingType(DestTy)) { T->setRegClass(RegARM32::RCARM32_QtoS); } _mov(T, Src0); _insertelement(T, Src1, Index); _set_dest_redefined(); _mov(Dest, T); return; } assert(false && "insertelement requires a constant index"); } namespace { inline uint64_t getConstantMemoryOrder(Operand *Opnd) { if (auto *Integer = llvm::dyn_cast
(Opnd)) return Integer->getValue(); return Intrinsics::MemoryOrderInvalid; } } // end of anonymous namespace void TargetARM32::lowerLoadLinkedStoreExclusive( Type Ty, Operand *Addr, std::function
Operation, CondARM32::Cond Cond) { auto *Retry = Context.insert
(this); { // scoping for loop highlighting. Variable *Success = makeReg(IceType_i32); Variable *Tmp = (Ty == IceType_i64) ? makeI64RegPair() : makeReg(Ty); auto *_0 = Ctx->getConstantZero(IceType_i32); Context.insert
(Tmp); Context.insert
(Tmp); Variable *AddrR = legalizeToReg(Addr); _ldrex(Tmp, formMemoryOperand(AddrR, Ty))->setDestRedefined(); auto *StoreValue = Operation(Tmp); assert(StoreValue->mustHaveReg()); // strex requires Dest to be a register other than Value or Addr. This // restriction is cleanly represented by adding an "early" definition of // Dest (or a latter use of all the sources.) Context.insert
(Success); if (Cond != CondARM32::AL) { _mov_redefined(Success, legalize(_0, Legal_Reg | Legal_Flex), InstARM32::getOppositeCondition(Cond)); } _strex(Success, StoreValue, formMemoryOperand(AddrR, Ty), Cond) ->setDestRedefined(); _cmp(Success, _0); } _br(Retry, CondARM32::NE); } namespace { InstArithmetic *createArithInst(Cfg *Func, uint32_t Operation, Variable *Dest, Variable *Src0, Operand *Src1) { InstArithmetic::OpKind Oper; switch (Operation) { default: llvm::report_fatal_error("Unknown AtomicRMW operation"); case Intrinsics::AtomicExchange: llvm::report_fatal_error("Can't handle Atomic xchg operation"); case Intrinsics::AtomicAdd: Oper = InstArithmetic::Add; break; case Intrinsics::AtomicAnd: Oper = InstArithmetic::And; break; case Intrinsics::AtomicSub: Oper = InstArithmetic::Sub; break; case Intrinsics::AtomicOr: Oper = InstArithmetic::Or; break; case Intrinsics::AtomicXor: Oper = InstArithmetic::Xor; break; } return InstArithmetic::create(Func, Oper, Dest, Src0, Src1); } } // end of anonymous namespace void TargetARM32::lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Addr, Operand *Val) { // retry: // ldrex tmp, [addr] // mov contents, tmp // op result, contents, Val // strex success, result, [addr] // cmp success, 0 // jne retry // fake-use(addr, operand) @ prevents undesirable clobbering. // mov dest, contents auto DestTy = Dest->getType(); if (DestTy == IceType_i64) { lowerInt64AtomicRMW(Dest, Operation, Addr, Val); return; } Operand *ValRF = nullptr; if (llvm::isa
(Val)) { ValRF = Val; } else { ValRF = legalizeToReg(Val); } auto *ContentsR = makeReg(DestTy); auto *ResultR = makeReg(DestTy); _dmb(); lowerLoadLinkedStoreExclusive( DestTy, Addr, [this, Operation, ResultR, ContentsR, ValRF](Variable *Tmp) { lowerAssign(InstAssign::create(Func, ContentsR, Tmp)); if (Operation == Intrinsics::AtomicExchange) { lowerAssign(InstAssign::create(Func, ResultR, ValRF)); } else { lowerArithmetic( createArithInst(Func, Operation, ResultR, ContentsR, ValRF)); } return ResultR; }); _dmb(); if (auto *ValR = llvm::dyn_cast
(ValRF)) { Context.insert
(ValR); } // Can't dce ContentsR. Context.insert
(ContentsR); lowerAssign(InstAssign::create(Func, Dest, ContentsR)); } void TargetARM32::lowerInt64AtomicRMW(Variable *Dest, uint32_t Operation, Operand *Addr, Operand *Val) { assert(Dest->getType() == IceType_i64); auto *ResultR = makeI64RegPair(); Context.insert
(ResultR); Operand *ValRF = nullptr; if (llvm::dyn_cast
(Val)) { ValRF = Val; } else { auto *ValR64 = llvm::cast
(Func->makeVariable(IceType_i64)); ValR64->initHiLo(Func); ValR64->setMustNotHaveReg(); ValR64->getLo()->setMustHaveReg(); ValR64->getHi()->setMustHaveReg(); lowerAssign(InstAssign::create(Func, ValR64, Val)); ValRF = ValR64; } auto *ContentsR = llvm::cast
(Func->makeVariable(IceType_i64)); ContentsR->initHiLo(Func); ContentsR->setMustNotHaveReg(); ContentsR->getLo()->setMustHaveReg(); ContentsR->getHi()->setMustHaveReg(); _dmb(); lowerLoadLinkedStoreExclusive( IceType_i64, Addr, [this, Operation, ResultR, ContentsR, ValRF](Variable *Tmp) { lowerAssign(InstAssign::create(Func, ContentsR, Tmp)); Context.insert
(Tmp); if (Operation == Intrinsics::AtomicExchange) { lowerAssign(InstAssign::create(Func, ResultR, ValRF)); } else { lowerArithmetic( createArithInst(Func, Operation, ResultR, ContentsR, ValRF)); } Context.insert
(ResultR->getHi()); Context.insert
(ResultR, ResultR->getLo()) ->setDestRedefined(); return ResultR; }); _dmb(); if (auto *ValR64 = llvm::dyn_cast
(ValRF)) { Context.insert
(ValR64->getLo()); Context.insert
(ValR64->getHi()); } lowerAssign(InstAssign::create(Func, Dest, ContentsR)); } void TargetARM32::postambleCtpop64(const InstCall *Instr) { Operand *Arg0 = Instr->getArg(0); if (isInt32Asserting32Or64(Arg0->getType())) { return; } // The popcount helpers always return 32-bit values, while the intrinsic's // signature matches some 64-bit platform's native instructions and expect to // fill a 64-bit reg. Thus, clear the upper bits of the dest just in case the // user doesn't do that in the IR or doesn't toss the bits via truncate. auto *DestHi = llvm::cast
(hiOperand(Instr->getDest())); Variable *T = makeReg(IceType_i32); Operand *_0 = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); _mov(T, _0); _mov(DestHi, T); } void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { Variable *Dest = Instr->getDest(); Type DestTy = (Dest != nullptr) ? Dest->getType() : IceType_void; Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID; switch (ID) { case Intrinsics::AtomicFence: case Intrinsics::AtomicFenceAll: assert(Dest == nullptr); _dmb(); return; case Intrinsics::AtomicIsLockFree: { Operand *ByteSize = Instr->getArg(0); auto *CI = llvm::dyn_cast
(ByteSize); if (CI == nullptr) { // The PNaCl ABI requires the byte size to be a compile-time constant. Func->setError("AtomicIsLockFree byte size should be compile-time const"); return; } static constexpr int32_t NotLockFree = 0; static constexpr int32_t LockFree = 1; int32_t Result = NotLockFree; switch (CI->getValue()) { case 1: case 2: case 4: case 8: Result = LockFree; break; } _mov(Dest, legalizeToReg(Ctx->getConstantInt32(Result))); return; } case Intrinsics::AtomicLoad: { assert(isScalarIntegerType(DestTy)); // We require the memory address to be naturally aligned. Given that is the // case, then normal loads are atomic. if (!Intrinsics::isMemoryOrderValid( ID, getConstantMemoryOrder(Instr->getArg(1)))) { Func->setError("Unexpected memory ordering for AtomicLoad"); return; } Variable *T; if (DestTy == IceType_i64) { // ldrex is the only arm instruction that is guaranteed to load a 64-bit // integer atomically. Everything else works with a regular ldr. T = makeI64RegPair(); _ldrex(T, formMemoryOperand(Instr->getArg(0), IceType_i64)); } else { T = makeReg(DestTy); _ldr(T, formMemoryOperand(Instr->getArg(0), DestTy)); } _dmb(); lowerAssign(InstAssign::create(Func, Dest, T)); // Adding a fake-use T to ensure the atomic load is not removed if Dest is // unused. Context.insert
(T); return; } case Intrinsics::AtomicStore: { // We require the memory address to be naturally aligned. Given that is the // case, then normal loads are atomic. if (!Intrinsics::isMemoryOrderValid( ID, getConstantMemoryOrder(Instr->getArg(2)))) { Func->setError("Unexpected memory ordering for AtomicStore"); return; } auto *Value = Instr->getArg(0); if (Value->getType() == IceType_i64) { auto *ValueR = makeI64RegPair(); Context.insert
(ValueR); lowerAssign(InstAssign::create(Func, ValueR, Value)); _dmb(); lowerLoadLinkedStoreExclusive( IceType_i64, Instr->getArg(1), [this, ValueR](Variable *Tmp) { // The following fake-use prevents the ldrex instruction from being // dead code eliminated. Context.insert
(llvm::cast
(loOperand(Tmp))); Context.insert
(llvm::cast
(hiOperand(Tmp))); Context.insert
(Tmp); return ValueR; }); Context.insert
(ValueR); _dmb(); return; } auto *ValueR = legalizeToReg(Instr->getArg(0)); const auto ValueTy = ValueR->getType(); assert(isScalarIntegerType(ValueTy)); auto *Addr = legalizeToReg(Instr->getArg(1)); // non-64-bit stores are atomically as long as the address is aligned. This // is PNaCl, so addresses are aligned. _dmb(); _str(ValueR, formMemoryOperand(Addr, ValueTy)); _dmb(); return; } case Intrinsics::AtomicCmpxchg: { // retry: // ldrex tmp, [addr] // cmp tmp, expected // mov expected, tmp // strexeq success, new, [addr] // cmpeq success, #0 // bne retry // mov dest, expected assert(isScalarIntegerType(DestTy)); // We require the memory address to be naturally aligned. Given that is the // case, then normal loads are atomic. if (!Intrinsics::isMemoryOrderValid( ID, getConstantMemoryOrder(Instr->getArg(3)), getConstantMemoryOrder(Instr->getArg(4)))) { Func->setError("Unexpected memory ordering for AtomicCmpxchg"); return; } if (DestTy == IceType_i64) { Variable *LoadedValue = nullptr; auto *New = makeI64RegPair(); Context.insert
(New); lowerAssign(InstAssign::create(Func, New, Instr->getArg(2))); auto *Expected = makeI64RegPair(); Context.insert
(Expected); lowerAssign(InstAssign::create(Func, Expected, Instr->getArg(1))); _dmb(); lowerLoadLinkedStoreExclusive( DestTy, Instr->getArg(0), [this, Expected, New, Instr, DestTy, &LoadedValue](Variable *Tmp) { auto *ExpectedLoR = llvm::cast
(loOperand(Expected)); auto *ExpectedHiR = llvm::cast
(hiOperand(Expected)); auto *TmpLoR = llvm::cast
(loOperand(Tmp)); auto *TmpHiR = llvm::cast
(hiOperand(Tmp)); _cmp(TmpLoR, ExpectedLoR); _cmp(TmpHiR, ExpectedHiR, CondARM32::EQ); LoadedValue = Tmp; return New; }, CondARM32::EQ); _dmb(); Context.insert
(LoadedValue); lowerAssign(InstAssign::create(Func, Dest, LoadedValue)); // The fake-use Expected prevents the assignments to Expected (above) // from being removed if Dest is not used. Context.insert
(Expected); // New needs to be alive here, or its live range will end in the // strex instruction. Context.insert
(New); return; } auto *New = legalizeToReg(Instr->getArg(2)); auto *Expected = legalizeToReg(Instr->getArg(1)); Variable *LoadedValue = nullptr; _dmb(); lowerLoadLinkedStoreExclusive( DestTy, Instr->getArg(0), [this, Expected, New, Instr, DestTy, &LoadedValue](Variable *Tmp) { lowerIcmpCond(InstIcmp::Eq, Tmp, Expected); LoadedValue = Tmp; return New; }, CondARM32::EQ); _dmb(); lowerAssign(InstAssign::create(Func, Dest, LoadedValue)); Context.insert
(Expected); Context.insert
(New); return; } case Intrinsics::AtomicRMW: { if (!Intrinsics::isMemoryOrderValid( ID, getConstantMemoryOrder(Instr->getArg(3)))) { Func->setError("Unexpected memory ordering for AtomicRMW"); return; } lowerAtomicRMW( Dest, static_cast
( llvm::cast
(Instr->getArg(0))->getValue()), Instr->getArg(1), Instr->getArg(2)); return; } case Intrinsics::Bswap: { Operand *Val = Instr->getArg(0); Type Ty = Val->getType(); if (Ty == IceType_i64) { Val = legalizeUndef(Val); Variable *Val_Lo = legalizeToReg(loOperand(Val)); Variable *Val_Hi = legalizeToReg(hiOperand(Val)); Variable *T_Lo = makeReg(IceType_i32); Variable *T_Hi = makeReg(IceType_i32); auto *DestLo = llvm::cast
(loOperand(Dest)); auto *DestHi = llvm::cast
(hiOperand(Dest)); _rev(T_Lo, Val_Lo); _rev(T_Hi, Val_Hi); _mov(DestLo, T_Hi); _mov(DestHi, T_Lo); } else { assert(Ty == IceType_i32 || Ty == IceType_i16); Variable *ValR = legalizeToReg(Val); Variable *T = makeReg(Ty); _rev(T, ValR); if (Val->getType() == IceType_i16) { Operand *_16 = shAmtImm(16); _lsr(T, T, _16); } _mov(Dest, T); } return; } case Intrinsics::Ctpop: { llvm::report_fatal_error("Ctpop should have been prelowered."); } case Intrinsics::Ctlz: { // The "is zero undef" parameter is ignored and we always return a // well-defined value. Operand *Val = Instr->getArg(0); Variable *ValLoR; Variable *ValHiR = nullptr; if (Val->getType() == IceType_i64) { Val = legalizeUndef(Val); ValLoR = legalizeToReg(loOperand(Val)); ValHiR = legalizeToReg(hiOperand(Val)); } else { ValLoR = legalizeToReg(Val); } lowerCLZ(Dest, ValLoR, ValHiR); return; } case Intrinsics::Cttz: { // Essentially like Clz, but reverse the bits first. Operand *Val = Instr->getArg(0); Variable *ValLoR; Variable *ValHiR = nullptr; if (Val->getType() == IceType_i64) { Val = legalizeUndef(Val); ValLoR = legalizeToReg(loOperand(Val)); ValHiR = legalizeToReg(hiOperand(Val)); Variable *TLo = makeReg(IceType_i32); Variable *THi = makeReg(IceType_i32); _rbit(TLo, ValLoR); _rbit(THi, ValHiR); ValLoR = THi; ValHiR = TLo; } else { ValLoR = legalizeToReg(Val); Variable *T = makeReg(IceType_i32); _rbit(T, ValLoR); ValLoR = T; } lowerCLZ(Dest, ValLoR, ValHiR); return; } case Intrinsics::Fabs: { Variable *T = makeReg(DestTy); _vabs(T, legalizeToReg(Instr->getArg(0))); _mov(Dest, T); return; } case Intrinsics::Longjmp: { llvm::report_fatal_error("longjmp should have been prelowered."); } case Intrinsics::Memcpy: { llvm::report_fatal_error("memcpy should have been prelowered."); } case Intrinsics::Memmove: { llvm::report_fatal_error("memmove should have been prelowered."); } case Intrinsics::Memset: { llvm::report_fatal_error("memmove should have been prelowered."); } case Intrinsics::NaClReadTP: { if 
(SandboxingType != ST_NaCl) { llvm::report_fatal_error("nacl-read-tp should have been prelowered."); } Variable *TP = legalizeToReg(OperandARM32Mem::create( Func, getPointerType(), getPhysicalRegister(RegARM32::Reg_r9), llvm::cast
(Ctx->getConstantZero(IceType_i32)))); _mov(Dest, TP); return; } case Intrinsics::Setjmp: { llvm::report_fatal_error("setjmp should have been prelowered."); } case Intrinsics::Sqrt: { assert(isScalarFloatingType(Dest->getType()) || getFlags().getApplicationBinaryInterface() != ::Ice::ABI_PNaCl); Variable *Src = legalizeToReg(Instr->getArg(0)); Variable *T = makeReg(DestTy); _vsqrt(T, Src); _mov(Dest, T); return; } case Intrinsics::Stacksave: { Variable *SP = getPhysicalRegister(RegARM32::Reg_sp); _mov(Dest, SP); return; } case Intrinsics::Stackrestore: { Variable *Val = legalizeToReg(Instr->getArg(0)); Sandboxer(this).reset_sp(Val); return; } case Intrinsics::Trap: _trap(); return; case Intrinsics::AddSaturateSigned: case Intrinsics::AddSaturateUnsigned: { bool Unsigned = (ID == Intrinsics::AddSaturateUnsigned); Variable *Src0 = legalizeToReg(Instr->getArg(0)); Variable *Src1 = legalizeToReg(Instr->getArg(1)); Variable *T = makeReg(DestTy); _vqadd(T, Src0, Src1, Unsigned); _mov(Dest, T); return; } case Intrinsics::LoadSubVector: { assert(llvm::isa
(Instr->getArg(1)) && "LoadSubVector second argument must be a constant"); Variable *Dest = Instr->getDest(); Type Ty = Dest->getType(); auto *SubVectorSize = llvm::cast
(Instr->getArg(1)); Operand *Addr = Instr->getArg(0); OperandARM32Mem *Src = formMemoryOperand(Addr, Ty); doMockBoundsCheck(Src); if (Dest->isRematerializable()) { Context.insert
(Dest); return; } auto *T = makeReg(Ty); switch (SubVectorSize->getValue()) { case 4: _vldr1d(T, Src); break; case 8: _vldr1q(T, Src); break; default: Func->setError("Unexpected size for LoadSubVector"); return; } _mov(Dest, T); return; } case Intrinsics::StoreSubVector: { assert(llvm::isa
(Instr->getArg(2)) && "StoreSubVector third argument must be a constant"); auto *SubVectorSize = llvm::cast
(Instr->getArg(2)); Variable *Value = legalizeToReg(Instr->getArg(0)); Operand *Addr = Instr->getArg(1); OperandARM32Mem *NewAddr = formMemoryOperand(Addr, Value->getType()); doMockBoundsCheck(NewAddr); Value = legalizeToReg(Value); switch (SubVectorSize->getValue()) { case 4: _vstr1d(Value, NewAddr); break; case 8: _vstr1q(Value, NewAddr); break; default: Func->setError("Unexpected size for StoreSubVector"); return; } return; } case Intrinsics::MultiplyAddPairs: { Variable *Src0 = legalizeToReg(Instr->getArg(0)); Variable *Src1 = legalizeToReg(Instr->getArg(1)); Variable *T = makeReg(DestTy); _vmlap(T, Src0, Src1); _mov(Dest, T); return; } case Intrinsics::MultiplyHighSigned: case Intrinsics::MultiplyHighUnsigned: { bool Unsigned = (ID == Intrinsics::MultiplyHighUnsigned); Variable *Src0 = legalizeToReg(Instr->getArg(0)); Variable *Src1 = legalizeToReg(Instr->getArg(1)); Variable *T = makeReg(DestTy); _vmulh(T, Src0, Src1, Unsigned); _mov(Dest, T); return; } case Intrinsics::Nearbyint: { UnimplementedLoweringError(this, Instr); return; } case Intrinsics::Round: { UnimplementedLoweringError(this, Instr); return; } case Intrinsics::SignMask: { UnimplementedLoweringError(this, Instr); return; } case Intrinsics::SubtractSaturateSigned: case Intrinsics::SubtractSaturateUnsigned: { bool Unsigned = (ID == Intrinsics::SubtractSaturateUnsigned); Variable *Src0 = legalizeToReg(Instr->getArg(0)); Variable *Src1 = legalizeToReg(Instr->getArg(1)); Variable *T = makeReg(DestTy); _vqsub(T, Src0, Src1, Unsigned); _mov(Dest, T); return; } case Intrinsics::VectorPackSigned: case Intrinsics::VectorPackUnsigned: { bool Unsigned = (ID == Intrinsics::VectorPackUnsigned); bool Saturating = true; Variable *Src0 = legalizeToReg(Instr->getArg(0)); Variable *Src1 = legalizeToReg(Instr->getArg(1)); Variable *T = makeReg(DestTy); _vqmovn2(T, Src0, Src1, Unsigned, Saturating); _mov(Dest, T); return; } default: // UnknownIntrinsic Func->setError("Unexpected intrinsic"); return; } return; } void 
TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) { Type Ty = Dest->getType(); assert(Ty == IceType_i32 || Ty == IceType_i64); Variable *T = makeReg(IceType_i32); _clz(T, ValLoR); if (Ty == IceType_i64) { auto *DestLo = llvm::cast
(loOperand(Dest)); auto *DestHi = llvm::cast
(hiOperand(Dest)); Operand *Zero = legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex); Operand *ThirtyTwo = legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex); _cmp(ValHiR, Zero); Variable *T2 = makeReg(IceType_i32); _add(T2, T, ThirtyTwo); _clz(T2, ValHiR, CondARM32::NE); // T2 is actually a source as well when the predicate is not AL (since it // may leave T2 alone). We use _set_dest_redefined to prolong the liveness // of T2 as if it was used as a source. _set_dest_redefined(); _mov(DestLo, T2); Variable *T3 = makeReg(Zero->getType()); _mov(T3, Zero); _mov(DestHi, T3); return; } _mov(Dest, T); return; } void TargetARM32::lowerLoad(const InstLoad *Load) { // A Load instruction can be treated the same as an Assign instruction, after // the source operand is transformed into an OperandARM32Mem operand. Type Ty = Load->getDest()->getType(); Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty); Variable *DestLoad = Load->getDest(); // TODO(jvoung): handled folding opportunities. Sign and zero extension can // be folded into a load. auto *Assign = InstAssign::create(Func, DestLoad, Src0); lowerAssign(Assign); } namespace { void dumpAddressOpt(const Cfg *Func, const Variable *Base, int32_t Offset, const Variable *OffsetReg, int16_t OffsetRegShAmt, const Inst *Reason) { if (!BuildDefs::dump()) return; if (!Func->isVerbose(IceV_AddrOpt)) return; OstreamLocker _(Func->getContext()); Ostream &Str = Func->getContext()->getStrDump(); Str << "Instruction: "; Reason->dumpDecorated(Func); Str << " results in Base="; if (Base) Base->dump(Func); else Str << "
"; Str << ", OffsetReg="; if (OffsetReg) OffsetReg->dump(Func); else Str << "
"; Str << ", Shift=" << OffsetRegShAmt << ", Offset=" << Offset << "\n"; } bool matchAssign(const VariablesMetadata *VMetadata, Variable **Var, int32_t *Offset, const Inst **Reason) { // Var originates from Var=SrcVar ==> set Var:=SrcVar if (*Var == nullptr) return false; const Inst *VarAssign = VMetadata->getSingleDefinition(*Var); if (!VarAssign) return false; assert(!VMetadata->isMultiDef(*Var)); if (!llvm::isa
(VarAssign)) return false; Operand *SrcOp = VarAssign->getSrc(0); bool Optimized = false; if (auto *SrcVar = llvm::dyn_cast
(SrcOp)) { if (!VMetadata->isMultiDef(SrcVar) || // TODO: ensure SrcVar stays single-BB false) { Optimized = true; *Var = SrcVar; } else if (auto *Const = llvm::dyn_cast
(SrcOp)) { int32_t MoreOffset = Const->getValue(); int32_t NewOffset = MoreOffset + *Offset; if (Utils::WouldOverflowAdd(*Offset, MoreOffset)) return false; *Var = nullptr; *Offset += NewOffset; Optimized = true; } } if (Optimized) { *Reason = VarAssign; } return Optimized; } bool isAddOrSub(const Inst *Instr, InstArithmetic::OpKind *Kind) { if (const auto *Arith = llvm::dyn_cast
(Instr)) { switch (Arith->getOp()) { default: return false; case InstArithmetic::Add: case InstArithmetic::Sub: *Kind = Arith->getOp(); return true; } } return false; } bool matchCombinedBaseIndex(const VariablesMetadata *VMetadata, Variable **Base, Variable **OffsetReg, int32_t OffsetRegShamt, const Inst **Reason) { // OffsetReg==nullptr && Base is Base=Var1+Var2 ==> // set Base=Var1, OffsetReg=Var2, Shift=0 if (*Base == nullptr) return false; if (*OffsetReg != nullptr) return false; (void)OffsetRegShamt; assert(OffsetRegShamt == 0); const Inst *BaseInst = VMetadata->getSingleDefinition(*Base); if (BaseInst == nullptr) return false; assert(!VMetadata->isMultiDef(*Base)); if (BaseInst->getSrcSize() < 2) return false; auto *Var1 = llvm::dyn_cast