HELLO·Android
系统源代码
IT资讯
技术文章
我的收藏
注册
登录
-
我收藏的文章
创建代码块
我的代码块
我的账号
Nougat 7.0
|
7.0.0_r31
下载
查看原文件
收藏
根目录
external
llvm
lib
Target
X86
X86ISelLowering.cpp
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include "X86IntrinsicsInfo.h" #include
#include
#include
using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt
ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize()); // Set up the TargetLowering object. // X86 is weird. It always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit, since we have so many registers, use the ILP scheduler. // For 32-bit, use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2. if (TM.getOptLevel() >= CodeGenOpt::Default) { if (Subtarget->hasSlowDivide32()) addBypassSlowDiv(32, 8); if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } if (Subtarget->isTargetKnownWindowsMSVC()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); } if (Subtarget->isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget->isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) // f32/f64 are legal, f80 is custom. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); else setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!Subtarget->useSoftFloat()) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD or VCVTUSI2SS/SD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!Subtarget->useSoftFloat()) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (!Subtarget->useSoftFloat()) { // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand); setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } else { setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); } } else if (!Subtarget->useSoftFloat()) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. // FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom. // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget->is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. setOperationAction(ISD::ADDC, VT, Custom); setOperationAction(ISD::ADDE, VT, Custom); setOperationAction(ISD::SUBC, VT, Custom); setOperationAction(ISD::SUBE, VT, Custom); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); setOperationAction(ISD::BR_CC , MVT::f128, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); setOperationAction(ISD::SELECT_CC , MVT::f128, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); if (Subtarget->is32Bit() && Subtarget->isTargetKnownWindowsMSVC()) { // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` // is. We should promote the value to 64-bits to solve this. // This is what the CRT headers do - `fmodf` is an inline header // function casting to f64 and calling `fmod`. setOperationAction(ISD::FREM , MVT::f32 , Promote); } else { setOperationAction(ISD::FREM , MVT::f32 , Expand); } setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. setOperationAction(ISD::CTTZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } if (Subtarget->hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationAction(ISD::CTLZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. if (Subtarget->useSoftFloat() || !Subtarget->hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } // There's never any support for operations beyond MVT::f32. setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget->hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. setOperationAction(ISD::SELECT , MVT::i8 , Custom); setOperationAction(ISD::SELECT , MVT::i16 , Custom); setOperationAction(ISD::SELECT , MVT::i32 , Custom); setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); setOperationAction(ISD::SELECT , MVT::f128 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); setOperationAction(ISD::SETCC , MVT::f128 , Custom); setOperationAction(ISD::SETCCE , MVT::i8 , Custom); setOperationAction(ISD::SETCCE , MVT::i16 , Custom); setOperationAction(ISD::SETCCE , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); setOperationAction(ISD::SETCCE , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. // LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // Darwin ABI issue. setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); setOperationAction(ISD::JumpTable , MVT::i32 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); setOperationAction(ISD::JumpTable , MVT::i64 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); } // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget->hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() && !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); if (Subtarget->is64Bit()) { setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); if (!Subtarget->useSoftFloat() && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f64, Custom); setOperationAction(ISD::FNEG , MVT::f32, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // Lower this to FGETSIGNx86 plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps } else if (!Subtarget->useSoftFloat() && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. addLegalFPImmediate(APFloat(+0.0f)); // xorps addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!Subtarget->useSoftFloat()) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87, except f128 in MMX. if (!Subtarget->useSoftFloat()) { if (Subtarget->is64Bit() && Subtarget->hasMMX()) { addRegisterClass(MVT::f128, &X86::FR128RegClass); ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat); setOperationAction(ISD::FABS , MVT::f128, Custom); setOperationAction(ISD::FNEG , MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); } addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); } setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); setOperationAction(ISD::FMINNUM, MVT::f80, Expand); setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::LOAD, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(InnerVT, VT, Expand); setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like // types, we have to deal with them whether we ask for Expansion or not. // Setting Expand causes its own optimisation problems though, so leave // them legal. if (VT.getVectorElementType() == MVT::i1) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are // split/scalarized right now. if (VT.getVectorElementType() == MVT::f16) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); } } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!Subtarget->useSoftFloat() && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) { setOperationAction(ISD::MULHS, MMXTy, Expand); setOperationAction(ISD::AND, MMXTy, Expand); setOperationAction(ISD::OR, MMXTy, Expand); setOperationAction(ISD::XOR, MMXTy, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MMXTy, Expand); setOperationAction(ISD::SELECT, MMXTy, Expand); setOperationAction(ISD::BITCAST, MMXTy, Expand); } setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); } if (!Subtarget->useSoftFloat() && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); addRegisterClass(MVT::v4i32, &X86::VR128RegClass); addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v16i8, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); setOperationAction(ISD::SUB, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FADD, MVT::v2f64, Legal); setOperationAction(ISD::FSUB, MVT::v2f64, Legal); setOperationAction(ISD::FMUL, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v16i8, Legal); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); setOperationAction(ISD::SETCC, MVT::v4i32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::CTPOP, MVT::v16i8, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); // ISD::CTTZ v2i64 - scalarization is faster. setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); // ISD::CTTZ_ZERO_UNDEF v2i64 - scalarization is faster. // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } // We support custom legalizing of sext and anyext loads for specific // memory vector types which we can load as a scalar (or sequence of // scalars) and extend in-register to a legal 128-bit vector type. For sext // loads these must work with a single scalar load. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom); setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom); } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v2i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v2i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom // sequence to convert from v2i32 to v2f32. if (!Subtarget->is64Bit()) setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!Subtarget->useSoftFloat() && Subtarget->hasSSE41()) { for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, RoundedTy, Legal); setOperationAction(ISD::FCEIL, RoundedTy, Legal); setOperationAction(ISD::FTRUNC, RoundedTy, Legal); setOperationAction(ISD::FRINT, RoundedTy, Legal); setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); } setOperationAction(ISD::SMAX, MVT::v16i8, Legal); setOperationAction(ISD::SMAX, MVT::v4i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i16, Legal); setOperationAction(ISD::UMAX, MVT::v4i32, Legal); setOperationAction(ISD::SMIN, MVT::v16i8, Legal); setOperationAction(ISD::SMIN, MVT::v4i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i16, Legal); setOperationAction(ISD::UMIN, MVT::v4i32, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); // We directly match byte blends in the backend as they match the VSELECT // condition form. setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // SSE41 brings specific instructions for doing vector sign extend even in // cases where we don't have SRA. for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom); setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom); } // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal); // i8 and i16 vectors are custom because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); // FIXME: these should be Legal, but that's only for the case where // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } if (Subtarget->hasSSE2()) { setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); setOperationAction(ISD::SHL, MVT::v8i16, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v2i64, Custom); setOperationAction(ISD::SRL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } if (Subtarget->hasXOP()) { setOperationAction(ISD::ROTL, MVT::v16i8, Custom); setOperationAction(ISD::ROTL, MVT::v8i16, Custom); setOperationAction(ISD::ROTL, MVT::v4i32, Custom); setOperationAction(ISD::ROTL, MVT::v2i64, Custom); setOperationAction(ISD::ROTL, MVT::v32i8, Custom); setOperationAction(ISD::ROTL, MVT::v16i16, Custom); setOperationAction(ISD::ROTL, MVT::v8i32, Custom); setOperationAction(ISD::ROTL, MVT::v4i64, Custom); } if (!Subtarget->useSoftFloat() && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); addRegisterClass(MVT::v8f32, &X86::VR256RegClass); addRegisterClass(MVT::v4i64, &X86::VR256RegClass); addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); setOperationAction(ISD::LOAD, MVT::v4i64, Legal); setOperationAction(ISD::FADD, MVT::v8f32, Legal); setOperationAction(ISD::FSUB, MVT::v8f32, Legal); setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); setOperationAction(ISD::FRINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FRINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); setOperationAction(ISD::SHL, MVT::v16i16, Custom); setOperationAction(ISD::SHL, MVT::v32i8, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); setOperationAction(ISD::SETCC, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v32i8, Custom); setOperationAction(ISD::CTPOP, MVT::v16i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ, MVT::v32i8, Custom); setOperationAction(ISD::CTTZ, MVT::v16i16, Custom); setOperationAction(ISD::CTTZ, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v32i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); if (Subtarget->hasAnyFMA()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); } if (Subtarget->hasInt256()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); setOperationAction(ISD::ADD, MVT::v16i16, Legal); setOperationAction(ISD::ADD, MVT::v32i8, Legal); setOperationAction(ISD::SUB, MVT::v4i64, Legal); setOperationAction(ISD::SUB, MVT::v8i32, Legal); setOperationAction(ISD::SUB, MVT::v16i16, Legal); setOperationAction(ISD::SUB, MVT::v32i8, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Legal); setOperationAction(ISD::MUL, MVT::v16i16, Legal); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); setOperationAction(ISD::SMAX, MVT::v32i8, Legal); setOperationAction(ISD::SMAX, MVT::v16i16, Legal); setOperationAction(ISD::SMAX, MVT::v8i32, Legal); setOperationAction(ISD::UMAX, MVT::v32i8, Legal); setOperationAction(ISD::UMAX, MVT::v16i16, Legal); setOperationAction(ISD::UMAX, MVT::v8i32, Legal); setOperationAction(ISD::SMIN, MVT::v32i8, Legal); setOperationAction(ISD::SMIN, MVT::v16i16, Legal); setOperationAction(ISD::SMIN, MVT::v8i32, Legal); setOperationAction(ISD::UMIN, MVT::v32i8, Legal); setOperationAction(ISD::UMIN, MVT::v16i16, Legal); setOperationAction(ISD::UMIN, MVT::v8i32, Legal); // The custom lowering for UINT_TO_FP for v8i32 becomes interesting // when we have a 256bit-wide blend with immediate. setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); setOperationAction(ISD::ADD, MVT::v16i16, Custom); setOperationAction(ISD::ADD, MVT::v32i8, Custom); setOperationAction(ISD::SUB, MVT::v4i64, Custom); setOperationAction(ISD::SUB, MVT::v8i32, Custom); setOperationAction(ISD::SUB, MVT::v16i16, Custom); setOperationAction(ISD::SUB, MVT::v32i8, Custom); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Custom); setOperationAction(ISD::MUL, MVT::v16i16, Custom); setOperationAction(ISD::MUL, MVT::v32i8, Custom); setOperationAction(ISD::SMAX, MVT::v32i8, Custom); setOperationAction(ISD::SMAX, MVT::v16i16, Custom); setOperationAction(ISD::SMAX, MVT::v8i32, Custom); setOperationAction(ISD::UMAX, MVT::v32i8, Custom); setOperationAction(ISD::UMAX, MVT::v16i16, Custom); setOperationAction(ISD::UMAX, MVT::v8i32, Custom); setOperationAction(ISD::SMIN, MVT::v32i8, Custom); setOperationAction(ISD::SMIN, MVT::v16i16, Custom); setOperationAction(ISD::SMIN, MVT::v8i32, Custom); setOperationAction(ISD::UMIN, MVT::v32i8, Custom); setOperationAction(ISD::UMIN, MVT::v16i16, Custom); setOperationAction(ISD::UMIN, MVT::v8i32, Custom); } // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v4i64, Custom); setOperationAction(ISD::SRL, MVT::v8i32, Custom); setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. for (MVT VT : MVT::vector_valuetypes()) { if (VT.getScalarSizeInBits() >= 32) { setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); } // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. if (VT.is128BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } if (Subtarget->hasInt256()) setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v4i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v4i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); } } if (!Subtarget->useSoftFloat() && Subtarget->hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v16i32, MVT::v16i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v32i16, MVT::v32i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); setLoadExtAction(ISD::SEXTLOAD, MVT::v8i64, MVT::v8i32, Legal); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::SELECT_CC, MVT::i1, Expand); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); setOperationAction(ISD::SUB, MVT::i1, Custom); setOperationAction(ISD::ADD, MVT::i1, Custom); setOperationAction(ISD::MUL, MVT::i1, Custom); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); setOperationAction(ISD::LOAD, MVT::v16i32, Legal); setOperationAction(ISD::LOAD, MVT::v16i1, Legal); setOperationAction(ISD::FADD, MVT::v16f32, Legal); setOperationAction(ISD::FSUB, MVT::v16f32, Legal); setOperationAction(ISD::FMUL, MVT::v16f32, Legal); setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); setOperationAction(ISD::FABS, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); setOperationAction(ISD::FMUL, MVT::v8f64, Legal); setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FABS, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); if (Subtarget->hasVLX()){ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); } else { setOperationAction(ISD::MLOAD, MVT::v8i32, Custom); setOperationAction(ISD::MLOAD, MVT::v8f32, Custom); setOperationAction(ISD::MSTORE, MVT::v8i32, Custom); setOperationAction(ISD::MSTORE, MVT::v8f32, Custom); } setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom); if (Subtarget->hasDQI()) { setOperationAction(ISD::TRUNCATE, MVT::v2i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i1, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); if (Subtarget->hasVLX()) { setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal); } } if (Subtarget->hasVLX()) { setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); } setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); if (Subtarget->hasDQI()) { setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); } setOperationAction(ISD::FFLOOR, MVT::v16f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f64, Legal); setOperationAction(ISD::FCEIL, MVT::v16f32, Legal); setOperationAction(ISD::FCEIL, MVT::v8f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v16f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v8f64, Legal); setOperationAction(ISD::FRINT, MVT::v16f32, Legal); setOperationAction(ISD::FRINT, MVT::v8f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v16f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v8f64, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); setOperationAction(ISD::SELECT, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8i1, Custom); setOperationAction(ISD::SMAX, MVT::v16i32, Legal); setOperationAction(ISD::SMAX, MVT::v8i64, Legal); setOperationAction(ISD::UMAX, MVT::v16i32, Legal); setOperationAction(ISD::UMAX, MVT::v8i64, Legal); setOperationAction(ISD::SMIN, MVT::v16i32, Legal); setOperationAction(ISD::SMIN, MVT::v8i64, Legal); setOperationAction(ISD::UMIN, MVT::v16i32, Legal); setOperationAction(ISD::UMIN, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v16i32, Legal); setOperationAction(ISD::SUB, MVT::v8i64, Legal); setOperationAction(ISD::SUB, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::SRL, MVT::v8i64, Custom); setOperationAction(ISD::SRL, MVT::v16i32, Custom); setOperationAction(ISD::SHL, MVT::v8i64, Custom); setOperationAction(ISD::SHL, MVT::v16i32, Custom); setOperationAction(ISD::SRA, MVT::v8i64, Custom); setOperationAction(ISD::SRA, MVT::v16i32, Custom); setOperationAction(ISD::AND, MVT::v8i64, Legal); setOperationAction(ISD::OR, MVT::v8i64, Legal); setOperationAction(ISD::XOR, MVT::v8i64, Legal); setOperationAction(ISD::AND, MVT::v16i32, Legal); setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i32, Legal); setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ, MVT::v16i16, Custom); setOperationAction(ISD::CTLZ, MVT::v32i8, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i16, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i8, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v16i16, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom); if (Subtarget->hasVLX()) { setOperationAction(ISD::CTLZ, MVT::v4i64, Legal); setOperationAction(ISD::CTLZ, MVT::v8i32, Legal); setOperationAction(ISD::CTLZ, MVT::v2i64, Legal); setOperationAction(ISD::CTLZ, MVT::v4i32, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Legal); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); } else { setOperationAction(ISD::CTLZ, MVT::v4i64, Custom); setOperationAction(ISD::CTLZ, MVT::v8i32, Custom); setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v8i32, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v2i64, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v4i32, Custom); } } // Subtarget->hasCDI() if (Subtarget->hasDQI()) { setOperationAction(ISD::MUL, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Legal); setOperationAction(ISD::MUL, MVT::v8i64, Legal); } // Custom lower several nodes. for (MVT VT : MVT::vector_valuetypes()) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (EltSize == 1) { setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); } if ((VT.is128BitVector() || VT.is256BitVector()) && EltSize >= 32) { setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. if (VT.is128BitVector() || VT.is256BitVector()) { setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); } if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); // Do not attempt to custom lower other non-512-bit vectors if (!VT.is512BitVector()) continue; if (EltSize >= 32) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::MGATHER, VT, Legal); setOperationAction(ISD::MSCATTER, VT, Custom); } } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 if (!Subtarget->useSoftFloat() && Subtarget->hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); setOperationAction(ISD::LOAD, MVT::v32i16, Legal); setOperationAction(ISD::LOAD, MVT::v64i8, Legal); setOperationAction(ISD::SETCC, MVT::v32i1, Custom); setOperationAction(ISD::SETCC, MVT::v64i1, Custom); setOperationAction(ISD::ADD, MVT::v32i16, Legal); setOperationAction(ISD::ADD, MVT::v64i8, Legal); setOperationAction(ISD::SUB, MVT::v32i16, Legal); setOperationAction(ISD::SUB, MVT::v64i8, Legal); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::SELECT, MVT::v32i1, Custom); setOperationAction(ISD::SELECT, MVT::v64i1, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); setOperationAction(ISD::SMAX, MVT::v64i8, Legal); setOperationAction(ISD::SMAX, MVT::v32i16, Legal); setOperationAction(ISD::UMAX, MVT::v64i8, Legal); setOperationAction(ISD::UMAX, MVT::v32i16, Legal); setOperationAction(ISD::SMIN, MVT::v64i8, Legal); setOperationAction(ISD::SMIN, MVT::v32i16, Legal); setOperationAction(ISD::UMIN, MVT::v64i8, Legal); setOperationAction(ISD::UMIN, MVT::v32i16, Legal); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); if (Subtarget->hasVLX()) setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); if (Subtarget->hasCDI()) { setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v32i16, Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::v64i8, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v8i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v8i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v8i64); } } if (!Subtarget->useSoftFloat() && Subtarget->hasVLX()) { addRegisterClass(MVT::v4i1, &X86::VK4RegClass); addRegisterClass(MVT::v2i1, &X86::VK2RegClass); setOperationAction(ISD::SETCC, MVT::v4i1, Custom); setOperationAction(ISD::SETCC, MVT::v2i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); setOperationAction(ISD::SELECT, MVT::v4i1, Custom); setOperationAction(ISD::SELECT, MVT::v2i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v4i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i1, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i1, Custom); setOperationAction(ISD::AND, MVT::v8i32, Legal); setOperationAction(ISD::OR, MVT::v8i32, Legal); setOperationAction(ISD::XOR, MVT::v8i32, Legal); setOperationAction(ISD::AND, MVT::v4i32, Legal); setOperationAction(ISD::OR, MVT::v4i32, Legal); setOperationAction(ISD::XOR, MVT::v4i32, Legal); setOperationAction(ISD::SRA, MVT::v2i64, Custom); setOperationAction(ISD::SRA, MVT::v4i64, Custom); setOperationAction(ISD::SMAX, MVT::v2i64, Legal); setOperationAction(ISD::SMAX, MVT::v4i64, Legal); setOperationAction(ISD::UMAX, MVT::v2i64, Legal); setOperationAction(ISD::UMAX, MVT::v4i64, Legal); setOperationAction(ISD::SMIN, MVT::v2i64, Legal); setOperationAction(ISD::SMIN, MVT::v4i64, Legal); setOperationAction(ISD::UMIN, MVT::v2i64, Legal); setOperationAction(ISD::UMIN, MVT::v4i64, Legal); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget->is64Bit()) { setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); } // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { if (VT == MVT::i64 && !Subtarget->is64Bit()) continue; // Add/Sub/Mul with overflow operations are custom lowered. setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); } if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { // For MacOSX, we don't want the normal expansion of a libcall to sincos. // We want to issue a libcall to __sincos_stret to avoid memory traffic. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } } if (Subtarget->isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::FMAXNUM); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MLOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::MSTORE); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::MSCATTER); setTargetDAGCombine(ISD::MGATHER); computeRegisterProperties(Subtarget->getRegisterInfo()); MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; setPrefLoopAlignment(4); // 2^4 bytes. // A predictable cmov does not hurt on an in-order CPU. // FIXME: Use a CPU attribute to trigger this, not a CPU model. PredictableSelectIsExpensive = !Subtarget->isAtom(); EnableExtLdPromotion = true; setPrefFunctionAlignment(4); // 2^4 bytes. verifyIntrinsicTables(); } // This has so far only been implemented for 64-bit MachO. bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO() && Subtarget->is64Bit(); } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType().getSimpleVT() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; if (VT.isSimple()) { MVT VVT = VT.getSimpleVT(); const unsigned NumElts = VVT.getVectorNumElements(); const MVT EltVT = VVT.getVectorElementType(); if (VVT.is512BitVector()) { if (Subtarget->hasAVX512()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; } if (Subtarget->hasBWI()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 32: return MVT::v32i1; case 64: return MVT::v64i1; } } if (VVT.is256BitVector() || VVT.is128BitVector()) { if (Subtarget->hasVLX()) if (EltVT == MVT::i32 || EltVT == MVT::i64 || EltVT == MVT::f32 || EltVT == MVT::f64) switch(NumElts) { case 2: return MVT::v2i1; case 4: return MVT::v4i1; case 8: return MVT::v8i1; } if (Subtarget->hasBWI() && Subtarget->hasVLX()) if (EltVT == MVT::i8 || EltVT == MVT::i16) switch(NumElts) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; case 32: return MVT::v32i1; } } } return VT.changeVectorElementTypeToInteger(); } /// Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast
(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast
(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast
(Ty)) { for (auto *EltTy : STy->elements()) { unsigned EltAlign = 0; getMaxByValAlign(EltTy, EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty, const DataLayout &DL) const { if (Subtarget->is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = DL.getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); if ((!IsMemset || ZeroMemset) && !F->hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget->isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { // FIXME: Check if unaligned 32-byte accesses are slow. if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) return MVT::v8f32; } if (Subtarget->hasSSE2()) return MVT::v4i32; if (Subtarget->hasSSE1()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. return MVT::f64; } } // This is a compromise. If we reach here, unaligned accesses may be slow on // this target. However, creating smaller, aligned accesses could be even // slower and would certainly be a lot more code. if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { if (Fast) { switch (VT.getSizeInBits()) { default: // 8-byte and under are always assumed to be fast. *Fast = true; break; case 128: *Fast = !Subtarget->isUnalignedMem16Slow(); break; case 256: *Fast = !Subtarget->isUnalignedMem32Slow(); break; // TODO: What about AVX-512 (512-bit) accesses? } } // Misaligned accesses of any size are always allowed. return true; } /// Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. return TargetLowering::getJumpTableEncoding(); } bool X86TargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// Returns relocation base for the given PIC jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())); return Table; } /// This returns the relocation base for the given PIC jumptable, /// the same as getPICJumpTableRelocBase, but as an MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget->isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); } std::pair
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); } bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) return false; if (Subtarget->is64Bit()) { // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: Offset = 0x28; if (getTargetMachine().getCodeModel() == CodeModel::Kernel) AddressSpace = 256; else AddressSpace = 257; } else { // %gs:0x14 on i386 Offset = 0x14; AddressSpace = 256; } return true; } Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { if (!Subtarget->isTargetAndroid()) return TargetLowering::getSafeStackPointerLocation(IRB); // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h unsigned AddressSpace, Offset; if (Subtarget->is64Bit()) { // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs: Offset = 0x48; if (getTargetMachine().getCodeModel() == CodeModel::Kernel) AddressSpace = 256; else AddressSpace = 257; } else { // %gs:0x24 on i386 Offset = 0x24; AddressSpace = 256; } return ConstantExpr::getIntToPtr( ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" bool X86TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl
&Outs, LLVMContext &Context) const { SmallVector
RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl
&Outs, const SmallVectorImpl
&OutVals, SDLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo
(); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); SmallVector
RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector
RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, MVT::i16)); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types. if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) { if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); } else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. if (ValVT == MVT::f64 && (Subtarget->is64Bit() && !Subtarget->hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget->hasSSE2()) ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); } } } Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } // All x86 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. // // Checking Function.hasStructRetAttr() here is insufficient because the IR // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is // false, then an sret argument may be implicitly inserted in the SelDAG. In // either case FuncInfo->setSRetReturnReg() will have been called. if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) { SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy(MF.getDataLayout())); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back( DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); X86ISD::NodeType opcode = X86ISD::RET_FLAG; if (CallConv == CallingConv::X86_INTR) opcode = X86ISD::IRET; return DAG.getNode(opcode, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; // If we are returning more than one value, we can definitely // not make a tail call see PR19530 if (UI->getNumOperands() > 4) return false; if (UI->getNumOperands() == 4 && UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } EVT X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) ReturnMVT = MVT::i8; else ReturnMVT = MVT::i32; EVT MinVT = getRegisterType(Context, ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. /// SDValue X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl
&Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl
&InVals) const { // Assign locations to each value returned by this call. SmallVector
RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getLocVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. bool RoundAfterCopy = false; if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; RoundAfterCopy = (CopyVT != VA.getLocVT()); } Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); SDValue Val = Chain.getValue(0); if (RoundAfterCopy) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. DAG.getIntPtrConstant(1, dl)); if (VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1) Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); InFlag = Chain.getValue(2); InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(const SmallVectorImpl
&Outs) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg()) return RegStructReturn; return StackStructReturn; } /// Determines whether a function uses struct return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl
&Ins) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg()) return RegStructReturn; return StackStructReturn; } /// Make a copy of an aggregate at address specified by "Src" to address /// "Dst" with size and alignment information specified by the specific /// parameter attribute. The copy will be passed as a byval function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, /*isTailCall*/false, MachinePointerInfo(), MachinePointerInfo()); } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE || CC == CallingConv::HHVM); } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: case CallingConv::X86_64_Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: case CallingConv::X86_StdCall: case CallingConv::X86_VectorCall: case CallingConv::X86_FastCall: return true; default: return canGuaranteeTCO(CC); } } /// Return true if the function is being made into a tailcall target by /// changing its ABI. static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { return GuaranteedTailCallOpt && canGuaranteeTCO(CC); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!mayTailCallThisCC(CalleeCC)) return false; return true; } SDValue X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, const SmallVectorImpl
&Ins, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, unsigned i) const { // Create the nodes corresponding to a load from this parameter slot. ISD::ArgFlagsTy Flags = Ins[i].Flags; bool AlwaysUseMutable = shouldGuaranteeTCO( CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); EVT ValVT; // If value is passed by pointer we have address passed instead of the value // itself. bool ExtendedInMem = VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1; if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) ValVT = VA.getLocVT(); else ValVT = VA.getValVT(); // Calculate SP offset of interrupt parameter, re-arrange the slot normally // taken by a return address. int Offset = 0; if (CallConv == CallingConv::X86_INTR) { const X86Subtarget& Subtarget = static_cast
(DAG.getSubtarget()); // X86 interrupts may take one or two arguments. // On the stack there will be no return address as in regular call. // Offset of last argument need to be set to -4/-8 bytes. // Where offset of the first argument out of two, should be set to 0 bytes. Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1); } // FIXME: For now, all byval parameter objects are marked mutable. This can be // changed with more analysis. // In case of tail call optimization mark all arguments mutable. Since they // could be overwritten by lowering of arguments in case of a tail call. if (Flags.isByVal()) { unsigned Bytes = Flags.getByValSize(); if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI->setObjectOffset(FI, Offset); } return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); } else { int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8, VA.getLocMemOffset(), isImmutable); // Adjust SP offset of interrupt parameter. if (CallConv == CallingConv::X86_INTR) { MFI->setObjectOffset(FI, Offset); } SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue Val = DAG.getLoad( ValVT, dl, Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, false, 0); return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val; } } // FIXME: Get this from tablegen. static ArrayRef
get64BitArgumentGPRs(CallingConv::ID CallConv, const X86Subtarget *Subtarget) { assert(Subtarget->is64Bit()); if (Subtarget->isCallingConvWin64(CallConv)) { static const MCPhysReg GPR64ArgRegsWin64[] = { X86::RCX, X86::RDX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); } static const MCPhysReg GPR64ArgRegs64Bit[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 }; return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); } // FIXME: Get this from tablegen. static ArrayRef
get64BitArgumentXMMs(MachineFunction &MF, CallingConv::ID CallConv, const X86Subtarget *Subtarget) { assert(Subtarget->is64Bit()); if (Subtarget->isCallingConvWin64(CallConv)) { // The XMM registers which might contain var arg parameters are shadowed // in their paired GPR. So we only need to save the GPR to their home // slots. // TODO: __vectorcall will change this. return None; } const Function *Fn = MF.getFunction(); bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget->useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); if (isSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1()) // Kernel mode asks for SSE to be disabled, so there are no XMM argument // registers. return None; static const MCPhysReg XMMArgRegs64Bit[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); } SDValue X86TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl
&Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl
&InVals) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo
(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); const Function* Fn = MF.getFunction(); if (Fn->hasExternalLinkage() && Subtarget->isTargetCygMing() && Fn->getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo *MFI = MF.getFrameInfo(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); if (CallConv == CallingConv::X86_INTR) { bool isLegal = Ins.size() == 1 || (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) || (!Is64Bit && Ins[1].VT == MVT::i32))); if (!isLegal) report_fatal_error("X86 interrupts may take one or two arguments"); } // Assign locations to all of the incoming arguments. SmallVector
ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeFormalArguments(Ins, CC_X86); unsigned LastVal = ~0U; SDValue ArgValue; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; // TODO: If an arg is passed in two places (e.g. reg and stack), skip later // places. assert(VA.getValNo() != LastVal && "Don't support value assigned to multiple locs yet"); (void)LastVal; LastVal = VA.getValNo(); if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &X86::GR32RegClass; else if (Is64Bit && RegVT == MVT::i64) RC = &X86::GR64RegClass; else if (RegVT == MVT::f32) RC = &X86::FR32RegClass; else if (RegVT == MVT::f64) RC = &X86::FR64RegClass; else if (RegVT == MVT::f128) RC = &X86::FR128RegClass; else if (RegVT.is512BitVector()) RC = &X86::VR512RegClass; else if (RegVT.is256BitVector()) RC = &X86::VR256RegClass; else if (RegVT.is128BitVector()) RC = &X86::VR128RegClass; else if (RegVT == MVT::x86mmx) RC = &X86::VR64RegClass; else if (RegVT == MVT::i1) RC = &X86::VK1RegClass; else if (RegVT == MVT::v8i1) RC = &X86::VK8RegClass; else if (RegVT == MVT::v16i1) RC = &X86::VK16RegClass; else if (RegVT == MVT::v32i1) RC = &X86::VK32RegClass; else if (RegVT == MVT::v64i1) RC = &X86::VK64RegClass; else llvm_unreachable("Unknown argument type!"); unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); // If this is an 8 or 16-bit value, it is really passed promoted to 32 // bits. Insert an assert[sz]ext to capture this, then truncate to the // right size. if (VA.getLocInfo() == CCValAssign::SExt) ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::ZExt) ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); else if (VA.getLocInfo() == CCValAssign::BCvt) ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); if (VA.isExtInLoc()) { // Handle MMX values passed in XMM regs. if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); else ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); } } else { assert(VA.isMemLoc()); ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i); } // If value is passed via pointer - do a load. if (VA.getLocInfo() == CCValAssign::Indirect) ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo(), false, false, false, 0); InVals.push_back(ArgValue); } for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // All x86 ABIs require that for returning structs by value we copy the // sret argument into %rax/%eax (depending on ABI) for the return. Save // the argument into a virtual register so that we can access it from the // return points. if (Ins[i].Flags.isSRet()) { unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { MVT PtrTy = getPointerTy(DAG.getDataLayout()); Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]); Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); break; } } unsigned StackSize = CCInfo.getNextStackOffset(); // Align stack specially for tail calls. if (shouldGuaranteeTCO(CallConv, MF.getTarget().Options.GuaranteedTailCallOpt)) StackSize = GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. We // can skip this if there are no va_start calls. if (MFI->hasVAStart() && (Is64Bit || (CallConv != CallingConv::X86_FastCall && CallConv != CallingConv::X86_ThisCall))) { FuncInfo->setVarArgsFrameIndex( MFI->CreateFixedObject(1, StackSize, true)); } // Figure out if XMM registers are in use. assert(!(Subtarget->useSoftFloat() && Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we // have to do extra work to spill them in the prologue. if (Is64Bit && isVarArg && MFI->hasVAStart()) { // Find the first unallocated argument registers. ArrayRef
ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); ArrayRef
ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget); unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); assert(!(NumXMMRegs && !Subtarget->hasSSE1()) && "SSE register cannot be used when SSE is disabled!"); // Gather all the live in physical registers. SmallVector
LiveGPRs; SmallVector
LiveXMMRegs; SDValue ALVal; for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass); LiveGPRs.push_back( DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64)); } if (!ArgXMMs.empty()) { unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass); ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8); for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) { unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass); LiveXMMRegs.push_back( DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32)); } } if (IsWin64) { // Get to the caller-allocated home save location. Add 8 to account // for the return address. int HomeOffset = TFI.getOffsetOfLocalArea() + 8; FuncInfo->setRegSaveFrameIndex( MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); // Fixup to set vararg frame on shadow area (4 x i64). if (NumIntRegs < 4) FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); } else { // For X86-64, if there are vararg parameters that are passed via // registers, then we must store them to their spots on the stack so // they may be loaded by deferencing the result of va_next. FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject( ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false)); } // Store the integer parameter registers. SmallVector
MemOps; SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy(DAG.getDataLayout())); unsigned Offset = FuncInfo->getVarArgsGPOffset(); for (SDValue Val : LiveGPRs) { SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), RSFIN, DAG.getIntPtrConstant(Offset, dl)); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), Offset), false, false, 0); MemOps.push_back(Store); Offset += 8; } if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) { // Now store the XMM (fp + vector) parameter registers. SmallVector
SaveXMMOps; SaveXMMOps.push_back(Chain); SaveXMMOps.push_back(ALVal); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getRegSaveFrameIndex(), dl)); SaveXMMOps.push_back(DAG.getIntPtrConstant( FuncInfo->getVarArgsFPOffset(), dl)); SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(), LiveXMMRegs.end()); MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other, SaveXMMOps)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); } if (isVarArg && MFI->hasMustTailInVarArgFunc()) { // Find the largest legal vector type. MVT VecVT = MVT::Other; // FIXME: Only some x86_32 calling conventions support AVX512. if (Subtarget->hasAVX512() && (Is64Bit || (CallConv == CallingConv::X86_VectorCall || CallConv == CallingConv::Intel_OCL_BI))) VecVT = MVT::v16f32; else if (Subtarget->hasAVX()) VecVT = MVT::v8f32; else if (Subtarget->hasSSE2()) VecVT = MVT::v4f32; // We forward some GPRs and some vector types. SmallVector
RegParmTypes; MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32; RegParmTypes.push_back(IntVT); if (VecVT != MVT::Other) RegParmTypes.push_back(VecVT); // Compute the set of forwarded registers. The rest are scratch. SmallVectorImpl
&Forwards = FuncInfo->getForwardedMustTailRegParms(); CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); // Conservatively forward AL on x86_64, since it might be used for varargs. if (Is64Bit && !CCInfo.isAllocated(X86::AL)) { unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass); Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); } // Copy all forwards from physical to virtual registers. for (ForwardedRegister &F : Forwards) { // FIXME: Can we use a less constrained schedule? SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT)); Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal); } } // Some CCs need callee pop. if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt)) { FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { // X86 interrupts must pop the error code if present FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4); } else { FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. // If this is an sret function, the return should pop the hidden pointer. if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && argsAreStructReturn(Ins) == StackStructReturn) FuncInfo->setBytesToPopOnReturn(4); } if (!Is64Bit) { // RegSaveFrameIndex is X86-64 only. FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); if (CallConv == CallingConv::X86_FastCall || CallConv == CallingConv::X86_ThisCall) // fastcc functions can't have varargs. FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); } FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate // that we'd prefer this slot be allocated towards the bottom of the frame // (i.e. near the stack pointer after allocating the frame). Every // funclet needs a copy of this slot in its (mostly empty) frame, and the // offset from the bottom of this and each funclet's frame must be the // same, so the size of funclets' (mostly empty) frames is dictated by // how far this slot is from the bottom (since they allocate just enough // space to accomodate holding this slot at the correct offset). int PSPSymFI = MFI->CreateStackObject(8, 8, /*isSS=*/false); EHInfo->PSPSymFrameIdx = PSPSymFI; } } return Chain; } SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, SDLoc dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); if (Flags.isByVal()) return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), false, false, 0); } /// Emit a load of return address if tail call /// optimization is performed and it is required. SDValue X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, bool Is64Bit, int FPDiff, SDLoc dl) const { // Adjust the Return address stack slot. EVT VT = getPointerTy(DAG.getDataLayout()); OutRetAddr = getReturnAddressFrameIndex(DAG); // Load the "old" Return address. OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(), false, false, false, 0); return SDValue(OutRetAddr.getNode(), 1); } /// Emit a store of the return address if tail call /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, unsigned SlotSize, int FPDiff, SDLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, false); SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), NewReturnAddrFI), false, false, 0); return Chain; } /// Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector
Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } SDValue X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl
&InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl
&Outs = CLI.Outs; SmallVectorImpl
&OutVals = CLI.OutVals; SmallVectorImpl
&Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; CallingConv::ID CallConv = CLI.CallConv; bool &isTailCall = CLI.IsTailCall; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CallConv); StructReturnType SR = callIsStructReturn(Outs); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo
(); auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); if (CallConv == CallingConv::X86_INTR) report_fatal_error("X86 interrupts may not be called directly"); if (Attr.getValueAsString() == "true") isTailCall = false; if (Subtarget->isPICStyleGOT() && !MF.getTarget().Options.GuaranteedTailCallOpt) { // If we are using a GOT, disable tail calls to external symbols with // default visibility. Tail calling such a symbol requires using a GOT // relocation, which forces early binding of the symbol. This breaks code // that require lazy function symbol resolution. Using musttail or // GuaranteedTailCallOpt will override this. GlobalAddressSDNode *G = dyn_cast
(Callee); if (!G || (!G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility())) isTailCall = false; } bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address // around. isTailCall = true; } else if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, MF.getFunction()->hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require // ABI changes. if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall) IsSibcall = true; if (isTailCall) ++NumTailCalls; } assert(!(isVarArg && canGuaranteeTCO(CallConv)) && "Var args not supported with calling convention fastcc, ghc or hipe"); // Analyze operands of the call, assigning locations to each operand. SmallVector
ArgLocs; CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); if (IsSibcall) // This is a sibcall. The memory operands are available in caller's // own caller's stack. NumBytes = 0; else if (MF.getTarget().Options.GuaranteedTailCallOpt && canGuaranteeTCO(CallConv)) NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); int FPDiff = 0; if (isTailCall && !IsSibcall && !IsMustTail) { // Lower arguments at fp - stackoffset + fpdiff. unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); FPDiff = NumBytesCallerPushed - NumBytes; // Set the delta of movement of the returnaddr stackslot. // But only set if delta is greater than previous delta. if (FPDiff < X86Info->getTCReturnAddrDelta()) X86Info->setTCReturnAddrDelta(FPDiff); } unsigned NumBytesToPush = NumBytes; unsigned NumBytesToPop = NumBytes; // If we have an inalloca argument, all stack space has already been allocated // for us and be right at the top of the stack. We don't support multiple // arguments passed in memory when using inalloca. if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { NumBytesToPush = 0; if (!ArgLocs.back().isMemLoc()) report_fatal_error("cannot use inalloca attribute on a register " "parameter"); if (ArgLocs.back().getLocMemOffset() != 0) report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); } if (!IsSibcall) Chain = DAG.getCALLSEQ_START( Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); SDValue RetAddrFrIdx; // Load return address for tail calls. if (isTailCall && FPDiff) Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, Is64Bit, FPDiff, dl); SmallVector
, 8> RegsToPass; SmallVector
MemOpChains; SDValue StackPtr; // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization arguments are handle later. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { // Skip inalloca arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[i].Flags; if (Flags.isInAlloca()) continue; CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[i]; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); break; case CCValAssign::AExt: if (Arg.getValueType().isVector() && Arg.getValueType().getVectorElementType() == MVT::i1) Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); else if (RegVT.is128BitVector()) { // Special case: passing MMX values in XMM registers. Arg = DAG.getBitcast(MVT::i64, Arg); Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); } else Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); break; case CCValAssign::BCvt: Arg = DAG.getBitcast(RegVT, Arg); break; case CCValAssign::Indirect: { // Store the argument. SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); int FI = cast
(SpillSlot)->getIndex(); Chain = DAG.getStore( Chain, dl, Arg, SpillSlot, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, 0); Arg = SpillSlot; break; } } if (VA.isRegLoc()) { RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); if (isVarArg && IsWin64) { // Win64 ABI requires argument XMM reg to be copied to the corresponding // shadow reg if callee is a varargs function. unsigned ShadowReg = 0; switch (VA.getLocReg()) { case X86::XMM0: ShadowReg = X86::RCX; break; case X86::XMM1: ShadowReg = X86::RDX; break; case X86::XMM2: ShadowReg = X86::R8; break; case X86::XMM3: ShadowReg = X86::R9; break; } if (ShadowReg) RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); } } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); if (Subtarget->isPICStyleGOT()) { // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. if (!isTailCall) { RegsToPass.push_back(std::make_pair( unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy(DAG.getDataLayout())))); } else { // If we are tail calling and generating PIC/GOT style code load the // address of the callee into ECX. The value in ecx is used as target of // the tail jump. This is done to circumvent the ebx/callee-saved problem // for tail calls on PIC/GOT architectures. Normally we would just put the // address of GOT into ebx and then call target@PLT. But for tail calls // ebx would be restored (since ebx is callee saved) before jumping to the // target@PLT. // Note: The actual moving to ECX is done further down. GlobalAddressSDNode *G = dyn_cast
(Callee); if (G && !G->getGlobal()->hasLocalLinkage() && G->getGlobal()->hasDefaultVisibility()) Callee = LowerGlobalAddress(Callee, DAG); else if (isa
(Callee)) Callee = LowerExternalSymbol(Callee, DAG); } } if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); RegsToPass.push_back(std::make_pair(unsigned(X86::AL), DAG.getConstant(NumXMMRegs, dl, MVT::i8))); } if (isVarArg && IsMustTail) { const auto &Forwards = X86Info->getForwardedMustTailRegParms(); for (const auto &F : Forwards) { SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val)); } } // For tail calls lower the arguments to the 'real' stack slots. Sibcalls // don't need this because the eligibility check rejects calls that require // shuffling arguments passed in memory. if (!IsSibcall && isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); SmallVector
MemOpChains2; SDValue FIN; int FI = 0; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (VA.isRegLoc()) continue; assert(VA.isMemLoc()); SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; // Skip inalloca arguments. They don't require any work. if (Flags.isInAlloca()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); if (Flags.isByVal()) { // Copy relative to framepointer. SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), getPointerTy(DAG.getDataLayout())); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, Source); MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, ArgChain, Flags, DAG, dl)); } else { // Store relative to framepointer. MemOpChains2.push_back(DAG.getStore( ArgChain, dl, Arg, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), false, false, 0)); } } if (!MemOpChains2.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); // Store the return address to the appropriate stack slot. Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, getPointerTy(DAG.getDataLayout()), RegInfo->getSlotSize(), FPDiff, dl); } // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into registers. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } if (DAG.getTarget().getCodeModel() == CodeModel::Large) { assert(Is64Bit && "Large code model is only legal in 64-bit mode."); // In the 64-bit large code model, we have to make all calls // through a register, since the call instruction's 32-bit // pc-relative offset may not be large enough to hold the whole // address. } else if (Callee->getOpcode() == ISD::GlobalAddress) { // If the callee is a GlobalAddress node (quite common, every direct call // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack // it. GlobalAddressSDNode* G = cast
(Callee); // We should use extra load for direct calls to dllimported functions in // non-JIT mode. const GlobalValue *GV = G->getGlobal(); if (!GV->hasDLLImportStorageClass()) { unsigned char OpFlags = 0; bool ExtraLoad = false; unsigned WrapperKind = ISD::DELETED_NODE; // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols most go through the PLT in PIC mode. If the symbol // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && DAG.getTarget().getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && !GV->isStrongDefinitionForLinker() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } else if (Subtarget->isPICStyleRIPRel() && isa
(GV) && cast
(GV)->hasFnAttribute(Attribute::NonLazyBind)) { // If the function is marked as non-lazy, generate an indirect call // which loads from the GOT directly. This avoids runtime overhead // at the cost of eager binding (and one extra byte of encoding). OpFlags = X86II::MO_GOTPCREL; WrapperKind = X86ISD::WrapperRIP; ExtraLoad = true; } Callee = DAG.getTargetGlobalAddress( GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags); // Add a wrapper if needed. if (WrapperKind != ISD::DELETED_NODE) Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(DAG.getDataLayout()), Callee); // Add extra indirection if needed. if (ExtraLoad) Callee = DAG.getLoad( getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction()), false, false, false, 0); } } else if (ExternalSymbolSDNode *S = dyn_cast
(Callee)) { unsigned char OpFlags = 0; // On ELF targets, in either X86-64 or X86-32 mode, direct calls to // external symbols should go through the PLT. if (Subtarget->isTargetELF() && DAG.getTarget().getRelocationModel() == Reloc::PIC_) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } Callee = DAG.getTargetExternalSymbol( S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags); } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) { // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); } // Returns a chain & a flag for retval copy to use. SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); SmallVector
Ops; if (!IsSibcall && isTailCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } Ops.push_back(Chain); Ops.push_back(Callee); if (isTailCall) Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32)); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); // If this is an invoke in a 32-bit function using a funclet-based // personality, assume the function clobbers all registers. If an exception // is thrown, the runtime will not restore CSRs. // FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { const Function *CallerFn = MF.getFunction(); EHPersonality Pers = CallerFn->hasPersonalityFn() ? classifyEHPersonality(CallerFn->getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); } Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); if (isTailCall) { // We used to do: //// If this is the first return lowered for this function, add the regs //// to the liveout set for the function. // This isn't right, although it's probably harmless on x86; liveouts // should be computed from returns not tail calls. Consider a void // function making a tail call to a function returning int. MF.getFrameInfo()->setHasTailCall(); return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); } Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPop; if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, DAG.getTarget().Options.GuaranteedTailCallOpt)) NumBytesForCalleeToPop = NumBytes; // Callee pops everything else if (!Is64Bit && !canGuaranteeTCO(CallConv) && !Subtarget->getTargetTriple().isOSMSVCRT() && SR == StackStructReturn) // If this is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. // This is common for Darwin/X86, Linux & Mingw32 targets. // For MSVC Win32 targets, the caller pops the hidden struct pointer. NumBytesForCalleeToPop = 4; else NumBytesForCalleeToPop = 0; // Callee pops nothing. // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytesToPop, dl, true), DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl, true), InFlag, dl); InFlag = Chain.getValue(1); } // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals); } //===----------------------------------------------------------------------===// // Fast Calling Convention (tail call) implementation //===----------------------------------------------------------------------===// // Like std call, callee cleans arguments, convention except that ECX is // reserved for storing the tail called function address. Only 2 registers are // free for argument passing (inreg). Tail call optimization is performed // provided: // * tailcallopt is enabled // * caller/callee are fastcc // On X86_64 architecture with GOT-style position independent code only local // (within module) calls are supported at the moment. // To keep the stack aligned according to platform abi the function // GetAlignedArgumentStackSize ensures that argument delta is always multiples // of stack alignment. (Dynamic linkers need this - darwin's dyld for example) // If a tail called function callee has more arguments than the caller the // caller needs to make sure that there is room to move the RETADDR to. This is // achieved by reserving an area the size of the argument delta right after the // original RETADDR, but before the saved framepointer or the spilled registers // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) // stack layout: // arg1 // arg2 // RETADDR // [ new RETADDR // move area ] // (possible EBP) // ESI // EDI // local1 .. /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align /// requirement. unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); const TargetFrameLowering &TFI = *Subtarget->getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// Return true if the given stack call argument is already available in the /// same position (relatively) of the caller's incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast
(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || Opcode == X86::LEA64_32r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast
(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast
(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast
(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } /// Check whether the call is eligible for tail call optimization. Targets /// that want to do tail call optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl
&Outs, const SmallVectorImpl
&OutVals, const SmallVectorImpl
&Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); // Win64 functions have extra shadow space for argument homing. Don't do the // sibcall if the caller and callee have mismatched expectations for this // space. if (IsCalleeWin64 != IsCallerWin64) return false; if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (canGuaranteeTCO(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector
ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector
RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; } } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector
RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector
RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } unsigned StackArgsSize = 0; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector
ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); StackArgsSize = CCInfo.getNextStackOffset(); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget->is64Bit() && ((!isa
(Callee) && !isa
(Callee)) || DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } } bool CalleeWillPop = X86::isCalleePop(CalleeCC, Subtarget->is64Bit(), isVarArg, MF.getTarget().Options.GuaranteedTailCallOpt); if (unsigned BytesToPop = MF.getInfo
()->getBytesToPopOnReturn()) { // If we have bytes to pop, the callee must pop them. bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; if (!CalleePopMatches) return false; } else if (CalleeWillPop && StackArgsSize > 0) { // If we don't have bytes to pop, make sure the callee doesn't pop any. return false; } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::BLENDI: case X86ISD::PSHUFB: case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILPI: case X86ISD::VPERM2X128: case X86ISD::VPERMI: case X86ISD::VPERMV: case X86ISD::VPERMV3: return true; } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILPI: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, dl, MVT::i8)); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, MVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo(); X86MachineFunctionInfo *FuncInfo = MF.getInfo
(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset >= 0) return true; return false; } /// Determines whether the callee is required to pop its own arguments. /// Callee pop is necessary to support tail calls. bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { // If GuaranteeTCO is true, we force some calls to be callee pop so that we // can guarantee TCO. if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) return true; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: case CallingConv::X86_VectorCall: return !is64Bit; } } /// \brief Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: return true; case X86::COND_G: return false; case X86::COND_GE: return false; case X86::COND_L: return false; case X86::COND_LE: return false; case X86::COND_NE: return true; case X86::COND_B: return true; case X86::COND_A: return true; case X86::COND_BE: return true; case X86::COND_AE: return true; } } static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } /// Do a one-to-one translation of a ISD::CondCode to the X86-specific /// condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, SDLoc DL, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast
(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, DL, RHS.getValueType()); return X86::COND_LE; } } return TranslateIntegerX86CC(SetCCOpcode); } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// Is there a floating point cmov for the specific X86 condition code? /// Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } /// Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF // relocation target a movq or addq instruction: don't let the load shrink. SDValue BasePtr = cast
(Load)->getBasePtr(); if (BasePtr.getOpcode() == X86ISD::WrapperRIP) if (const auto *GA = dyn_cast
(BasePtr.getOperand(0))) return GA->getTargetFlags() != X86II::MO_GOTTPOFF; return true; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorNumElements()); } bool X86TargetLowering::isCheapToSpeculateCttz() const { // Speculate cttz only if we can directly use TZCNT. return Subtarget->hasBMI(); } bool X86TargetLowering::isCheapToSpeculateCtlz() const { // Speculate ctlz only if we can directly use LZCNT. return Subtarget->hasLZCNT(); } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size is undef. static bool isUndefInRange(ArrayRef
Mask, unsigned Pos, unsigned Size) { for (unsigned i = Pos, e = Pos + Size; i != e; ++i) if (0 <= Mask[i]) return false; return true; } /// Return true if Val is undef or if its value falls within the /// specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } /// Val is either less than zero (undef) or equal to the specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef
Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector /// extract that is suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa
(N->getOperand(1).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast
(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } /// Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa
(N->getOperand(2).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast
(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } bool X86::isVINSERT128Index(SDNode *N) { return isVINSERTIndex(N, 128); } bool X86::isVINSERT256Index(SDNode *N) { return isVINSERTIndex(N, 256); } bool X86::isVEXTRACT128Index(SDNode *N) { return isVEXTRACTIndex(N, 128); } bool X86::isVEXTRACT256Index(SDNode *N) { return isVEXTRACTIndex(N, 256); } static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); assert(isa
(N->getOperand(1).getNode()) && "Illegal extract subvector for VEXTRACT"); uint64_t Index = cast
(N->getOperand(1).getNode())->getZExtValue(); MVT VecVT = N->getOperand(0).getSimpleValueType(); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); assert(isa
(N->getOperand(2).getNode()) && "Illegal insert subvector for VINSERT"); uint64_t Index = cast
(N->getOperand(2).getNode())->getZExtValue(); MVT VecVT = N->getSimpleValueType(0); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } /// Return the appropriate immediate to extract the specified /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } /// Return the appropriate immediate to extract the specified /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } /// Return the appropriate immediate to insert at the specified /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } /// Return the appropriate immediate to insert at the specified /// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); } // Build a vector of constants // Use an UNDEF node if MaskElt == -1. // Spilt 64-bit constants in the 32-bit mode. static SDValue getConstVector(ArrayRef
Values, MVT VT, SelectionDAG &DAG, SDLoc dl, bool IsMask = false) { SmallVector
Ops; bool Split = false; MVT ConstVecVT = VT; unsigned NumElts = VT.getVectorNumElements(); bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); Split = true; } MVT EltVT = ConstVecVT.getVectorElementType(); for (unsigned i = 0; i < NumElts; ++i) { bool IsUndef = Values[i] < 0 && IsMask; SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(Values[i], dl, EltVT); Ops.push_back(OpNode); if (Split) Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : DAG.getConstant(0, dl, EltVT)); } SDValue ConstsNode = DAG.getNode(ISD::BUILD_VECTOR, dl, ConstVecVT, Ops); if (Split) ConstsNode = DAG.getBitcast(VT, ConstsNode); return ConstsNode; } /// Returns a vector of specified type with all zero elements. static SDValue getZeroVector(MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. SDValue Cst = DAG.getConstantFP(+0.0, dl, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getConstant(0, dl, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getVectorElementType() == MVT::i1) { assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16) && "Unexpected vector type"); assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8) && "Unexpected vector type"); SDValue Cst = DAG.getConstant(0, dl, MVT::i1); SmallVector
Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); return DAG.getBitcast(VT, Vec); } static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert((Vec.getValueType().is256BitVector() || Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); return ExtractSubVector(Vec, IdxVal, DAG, dl, 128); } /// Generate a DAG to grab 256-bits from a 512-bit vector. static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); return ExtractSubVector(Vec, IdxVal, DAG, dl, 256); } static SDValue InsertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); // Inserting UNDEF is Result if (Vec.getOpcode() == ISD::UNDEF) return Result; EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); EVT ResultVT = Result.getValueType(); // Insert the relevant vectorWidth bits. unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); // This is the index of the first element of the vectorWidth-bit chunk // we want. Since ElemsPerChunk is a power of 2 just need to clear bits. IdxVal &= ~(ElemsPerChunk - 1); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); } /// Generate a DAG to put 128-bits into a vector > 128 bits. This /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a /// simple superregister reference. Idx is an index in the 128 bits /// we want. It need not be aligned to a 128-bit boundary. That makes /// lowering INSERT_VECTOR_ELT operations easier. static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. // We are still creating an INSERT_SUBVECTOR below with an undef node to // extend the subvector to the size of the result vector. Make sure that // we are not recursing on that node by checking for undef here. if (IdxVal == 0 && Result.getValueType().is256BitVector() && Result.getOpcode() != ISD::UNDEF) { EVT ResultVT = Result.getValueType(); SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(ResultVT); SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef, Vec, ZeroIndex); // The blend instruction, and therefore its mask, depend on the data type. MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT(); if (ScalarType.isFloatingPoint()) { // Choose either vblendps (float) or vblendpd (double). unsigned ScalarSize = ScalarType.getSizeInBits(); assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type"); unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f; SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8); return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask); } const X86Subtarget &Subtarget = static_cast
(DAG.getSubtarget()); // AVX2 is needed for 256-bit integer blend support. // Integers must be cast to 32-bit because there is only vpblendd; // vpblendw can't be used for this because it has a handicapped mask. // If we don't have AVX2, then cast to float. Using a wrong domain blend // is still more efficient than using the wrong domain vinsertf128 that // will be created by InsertSubVector(). MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32; SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8); Vec256 = DAG.getBitcast(CastVT, Vec256); Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask); return DAG.getBitcast(ResultVT, Vec256); } return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } /// Insert i1-subvector to i1-vector. static SDValue Insert1BitVector(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue SubVec = Op.getOperand(1); SDValue Idx = Op.getOperand(2); if (!isa
(Idx)) return SDValue(); unsigned IdxVal = cast
(Idx)->getZExtValue(); if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); MVT SubVecVT = SubVec.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); // There are 3 possible cases: // 1. Subvector should be inserted in the lower part (IdxVal == 0) // 2. Subvector should be inserted in the upper part // (IdxVal + SubVecNumElems == NumElems) // 3. Subvector should be inserted in the middle (for example v2i1 // to v16i1, index 2) SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(OpVT); SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, SubVec, ZeroIdx); if (Vec.isUndef()) return DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); if (ISD::isBuildVectorAllZeros(Vec.getNode())) { unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, WideSubVec, DAG.getConstant(ShiftLeft, dl, MVT::i8)); return ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, OpVT, WideSubVec, DAG.getConstant(ShiftRight, dl, MVT::i8)) : WideSubVec; } if (IdxVal == 0) { // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); // Merge them together return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(X86ISD::VSHLI, dl, OpVT, Vec, ShiftBits); Vec = DAG.getNode(X86ISD::VSRLI, dl, OpVT, Vec, ShiftBits); return DAG.getNode(ISD::OR, dl, OpVT, Vec, WideSubVec); } // Subvector should be inserted in the middle - use shuffle SmallVector
Mask; for (unsigned i = 0; i < NumElems; ++i) Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? i : i + NumElems); return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower /// large BUILD_VECTORS. static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } /// Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getConstant(~0U, dl, MVT::i32); SDValue Vec; if (VT.is512BitVector()) { SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.is256BitVector()) { if (Subtarget->hasInt256()) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } } else if (VT.is128BitVector()) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else llvm_unreachable("Unexpected vector type"); return DAG.getBitcast(VT, Vec); } /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector
Mask; for (unsigned i = 0, e = NumElems/2; i != e; ++i) { Mask.push_back(i); Mask.push_back(i + NumElems); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector
Mask; for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// Return a vector_shuffle of the specified vector of zero or undef vector. /// This produces a shuffle where the low element of V2 is swizzled into the /// zero/undef vector, landing at element Idx. /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector
MaskVec; for (unsigned i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec.push_back(i == Idx ? NumElems : i); return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } /// Calculates the shuffle mask corresponding to the target-specific opcode. /// Returns true if the Mask could be calculated. Sets IsUnary to true if only /// uses one source. Note that this will set IsUnary for shuffles which use a /// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. /// FIXME: Add support for Decode*Mask functions that return SM_SentinelZero. static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl
&Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast
(ImmN)->getZExtValue(), Mask); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast
(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFB: { IsUnary = true; SDValue MaskNode = N->getOperand(1); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(0); if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. MVT VT = MaskNode.getSimpleValueType(); assert(VT.isVector() && "Can't produce a non-vector with a build_vector!"); if (!VT.isInteger()) return false; int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8; SmallVector
RawMask; for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) { RawMask.push_back((uint64_t)SM_SentinelUndef); continue; } auto *CN = dyn_cast
(Op.getNode()); if (!CN) return false; APInt MaskElement = CN->getAPIntValue(); // We now have to decode the element which could be any integer size and // extract each byte of it. for (int j = 0; j < NumBytesPerElement; ++j) { // Note that this is x86 and so always little endian: the low byte is // the first byte of the mask. RawMask.push_back(MaskElement.getLoBits(8).getZExtValue()); MaskElement = MaskElement.lshr(8); } } DecodePSHUFBMask(RawMask, Mask); break; } auto *MaskLoad = dyn_cast
(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast
(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; if (auto *C = dyn_cast
(MaskCP->getConstVal())) { DecodePSHUFBMask(C, Mask); if (Mask.empty()) return false; break; } return false; } case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast
(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; // Mask only contains negative index if an element is zero. if (std::any_of(Mask.begin(), Mask.end(), [](int M){ return M == SM_SentinelZero; })) return false; break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: DecodeMOVSHDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVDDUP: DecodeMOVDDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: // Not yet implemented return false; case X86ISD::VPERMV: { IsUnary = true; SDValue MaskNode = N->getOperand(0); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(0); unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); SmallVector
RawMask; if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. assert(MaskNode.getSimpleValueType().isInteger() && MaskNode.getSimpleValueType().getVectorNumElements() == VT.getVectorNumElements()); for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) RawMask.push_back((uint64_t)SM_SentinelUndef); else if (isa
(Op)) { APInt MaskElement = cast
(Op)->getAPIntValue(); RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); } else return false; } DecodeVPERMVMask(RawMask, Mask); break; } if (MaskNode->getOpcode() == X86ISD::VBROADCAST) { unsigned NumEltsInMask = MaskNode->getNumOperands(); MaskNode = MaskNode->getOperand(0); auto *CN = dyn_cast
(MaskNode); if (CN) { APInt MaskEltValue = CN->getAPIntValue(); for (unsigned i = 0; i < NumEltsInMask; ++i) RawMask.push_back(MaskEltValue.getLoBits(MaskLoBits).getZExtValue()); DecodeVPERMVMask(RawMask, Mask); break; } // It may be a scalar load } auto *MaskLoad = dyn_cast
(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast
(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; auto *C = dyn_cast
(MaskCP->getConstVal()); if (C) { DecodeVPERMVMask(C, VT, Mask); if (Mask.empty()) return false; break; } return false; } case X86ISD::VPERMV3: { IsUnary = false; SDValue MaskNode = N->getOperand(1); while (MaskNode->getOpcode() == ISD::BITCAST) MaskNode = MaskNode->getOperand(1); if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) { // If we have a build-vector, then things are easy. assert(MaskNode.getSimpleValueType().isInteger() && MaskNode.getSimpleValueType().getVectorNumElements() == VT.getVectorNumElements()); SmallVector
RawMask; unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()*2); for (unsigned i = 0; i < MaskNode->getNumOperands(); ++i) { SDValue Op = MaskNode->getOperand(i); if (Op->getOpcode() == ISD::UNDEF) RawMask.push_back((uint64_t)SM_SentinelUndef); else { auto *CN = dyn_cast
(Op.getNode()); if (!CN) return false; APInt MaskElement = CN->getAPIntValue(); RawMask.push_back(MaskElement.getLoBits(MaskLoBits).getZExtValue()); } } DecodeVPERMV3Mask(RawMask, Mask); break; } auto *MaskLoad = dyn_cast
(MaskNode); if (!MaskLoad) return false; SDValue Ptr = MaskLoad->getBasePtr(); if (Ptr->getOpcode() == X86ISD::Wrapper || Ptr->getOpcode() == X86ISD::WrapperRIP) Ptr = Ptr->getOperand(0); auto *MaskCP = dyn_cast
(Ptr); if (!MaskCP || MaskCP->isMachineConstantPoolEntry()) return false; auto *C = dyn_cast
(MaskCP->getConstVal()); if (C) { DecodeVPERMV3Mask(C, VT, Mask); if (Mask.empty()) return false; break; } return false; } default: llvm_unreachable("unknown target shuffle node"); } // If we have a fake unary shuffle, the shuffle mask is spread across two // inputs that are actually the same node. Re-map the mask to always point // into the first input. if (IsFakeUnary) for (int &M : Mask) if (M >= (int)Mask.size()) M -= Mask.size(); return true; } /// Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast
(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector
ShuffleMask; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt < 0) return DAG.getUNDEF(ShufVT.getVectorElementType()); SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; // SSE4.1 - use PINSRB to insert each byte directly. if (Subtarget->hasSSE41()) { for (unsigned i = 0; i < 16; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { if (First) { if (NumZero) V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v16i8); First = false; } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } return V; } // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } if ((i & 1) != 0) { SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i-1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, dl, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt.getNode()) V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i/2, dl)); } } return DAG.getBitcast(MVT::v16i8, V); } /// Custom lower build_vector of v8i16. static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { if (First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } } return V; } /// Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { // Find all zeroable elements. std::bitset<4> Zeroable; for (int i=0; i < 4; ++i) { SDValue Elt = Op->getOperand(i); Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt)); } assert(Zeroable.size() - Zeroable.count() > 1 && "We expect at least two non-zero elements!"); // We only know how to deal with build_vector nodes where elements are either // zeroable or extract_vector_elt with constant index. SDValue FirstNonZero; unsigned FirstNonZeroIdx; for (unsigned i=0; i < 4; ++i) { if (Zeroable[i]) continue; SDValue Elt = Op->getOperand(i); if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa
(Elt.getOperand(1))) return SDValue(); // Make sure that this node is extracting from a 128-bit vector. MVT VT = Elt.getOperand(0).getSimpleValueType(); if (!VT.is128BitVector()) return SDValue(); if (!FirstNonZero.getNode()) { FirstNonZero = Elt; FirstNonZeroIdx = i; } } assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); SDValue V1 = FirstNonZero.getOperand(0); MVT VT = V1.getSimpleValueType(); // See if this build_vector can be lowered as a blend with zero. SDValue Elt; unsigned EltMaskIdx, EltIdx; int Mask[4]; for (EltIdx = 0; EltIdx < 4; ++EltIdx) { if (Zeroable[EltIdx]) { // The zero vector will be on the right hand side. Mask[EltIdx] = EltIdx+4; continue; } Elt = Op->getOperand(EltIdx); // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index. EltMaskIdx = cast
(Elt.getOperand(1))->getZExtValue(); if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) break; Mask[EltIdx] = EltIdx; } if (EltIdx == 4) { // Let the shuffle legalizer deal with blend operations. SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); if (V1.getSimpleValueType() != VT) V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1); return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]); } // See if we can lower this build_vector to a INSERTPS. if (!Subtarget->hasSSE41()) return SDValue(); SDValue V2 = Elt.getOperand(0); if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) V1 = SDValue(); bool CanFold = true; for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { if (Zeroable[i]) continue; SDValue Current = Op->getOperand(i); SDValue SrcVector = Current->getOperand(0); if (!V1.getNode()) V1 = SrcVector; CanFold = SrcVector == V1 && cast
(Current.getOperand(1))->getZExtValue() == i; } if (!CanFold) return SDValue(); assert(V1.getNode() && "Expected at least two non-zero elements!"); if (V1.getSimpleValueType() != MVT::v4f32) V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1); if (V2.getSimpleValueType() != MVT::v4f32) V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2); // Ok, we can emit an INSERTPS instruction. unsigned ZMask = Zeroable.to_ulong(); unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); SDLoc DL(Op); SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getIntPtrConstant(InsertPSMask, DL)); return DAG.getBitcast(VT, Result); } /// Return a vector logical shift node. static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); MVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getBitcast(ShVT, SrcOp); MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT); assert(NumBits % 8 == 0 && "Only support byte sized shifts"); SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy); return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast
(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast
(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa
(Ptr.getOperand(0))) { FI = cast
(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI->isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI->setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1); if (StartOffset) { SDLoc DL(Ptr); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StartOffset, DL, Ptr.getValueType())); } int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); SmallVector
Mask(NumElems, EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } return SDValue(); } /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the /// elements can be replaced by a single large load which has the same value as /// a build_vector or insert_subvector whose loaded operands are 'Elts'. /// /// Example:
-> zextload a /// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef
Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { unsigned NumElems = Elts.size(); LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. // If we don't find an initial load element, or later load elements are // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; // Look through a bitcast. if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST) Elt = Elt.getOperand(0); if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); if (!LDBase) { if (Elt.getNode()->getOpcode() == ISD::UNDEF) return SDValue(); LDBase = cast
(Elt.getNode()); LastLoadedElt = i; continue; } if (Elt.getOpcode() == ISD::UNDEF) continue; LoadSDNode *LD = cast
(Elt); EVT LdVT = Elt.getValueType(); // Each loaded element must be the correct fractional portion of the // requested vector load. if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems) return SDValue(); if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i)) return SDValue(); LastLoadedElt = i; } // If we have found an entire vector of loads and undefs, then return a large // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { assert(LDBase && "Did not find base load for merging consecutive loads"); EVT EltVT = LDBase->getValueType(0); // Ensure that the input vector size for the merged loads matches the // cumulative size of the input elements. if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems) return SDValue(); if (isAfterLegalize && !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) return SDValue(); SDValue NewLd = SDValue(); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); } return NewLd; } //TODO: The code below fires only for for loading the low v2i32 / v2f32 //of a v4i32 / v4f32. It's probably worth generalizing. EVT EltVT = VT.getVectorElementType(); if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); // Make sure the newly-created LOAD is in the same position as LDBase in // terms of dependency. We create a TokenFactor for LDBase and ResNode, and // update uses of LDBase's output chain to use the TokenFactor. if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); } return DAG.getBitcast(VT, ResNode); } return SDValue(); } /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction /// to generate a splat value for the following cases: /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load, or a constant. /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { // VBROADCAST requires AVX. // TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. if (!Subtarget->hasAVX()) return SDValue(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; bool ConstSplatVal; switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { auto *BVOp = cast
(Op.getNode()); BitVector UndefElements; SDValue Splat = BVOp->getSplatValue(&UndefElements); // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } case ISD::VECTOR_SHUFFLE: { ShuffleVectorSDNode *SVOp = cast
(Op); // Shuffles must have a splat mask where the first element is // broadcasted. if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) return SDValue(); SDValue Sc = Op.getOperand(0); if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && Sc.getOpcode() != ISD::BUILD_VECTOR) { if (!Subtarget->hasInt256()) return SDValue(); // Use the register form of the broadcast instruction available on AVX2. if (VT.getSizeInBits() >= 256) Sc = Extract128BitVector(Sc, 0, DAG, dl); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. // AVX-512 has register version of the broadcast bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && Ld.getValueType().getSizeInBits() >= 32; if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && !hasRegVer)) return SDValue(); break; } } unsigned ScalarSize = Ld.getValueType().getSizeInBits(); bool IsGE256 = (VT.getSizeInBits() >= 256); // When optimizing for size, generate up to 5 extra bytes for a broadcast // instruction to save 8 or more bytes of constant pool data. // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. // On Sandybridge (no AVX2), it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. // But override that restriction when optimizing for size. // TODO: Check if splatting is recommended for other AVX-capable CPUs. if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); // Splat f32, i32, v4f64, v4i64 in all cases with AVX2. // For size optimization, also splat v2f64 and v2i64, and for size opt // with AVX2, also splat i8 and i16. // With pattern matching, the VBROADCAST node may become a VMOVDDUP. if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast
(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast
(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout())); unsigned Alignment = cast
(CP)->getAlignment(); Ld = DAG.getLoad( CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) || (Subtarget->hasVLX() && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast
(ExtIdx)->getZExtValue(); if (!isa
(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %vreg1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %vreg0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast
(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector
InsertIndices; SmallVector
Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa
(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { unsigned Idx = InsertIndices[i]; NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx, DL)); } return NV; } static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) { assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && Op.getScalarValueSizeInBits() == 1 && "Can not convert non-constant vector"); uint64_t Immediate = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() != ISD::UNDEF) Immediate |= cast
(In)->getZExtValue() << idx; } SDLoc dl(Op); MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueType().getSizeInBits(), 8)); return DAG.getConstant(Immediate, dl, VT); } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, dl, MVT::i1); SmallVector
Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, dl, MVT::i1); SmallVector
Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { SDValue Imm = ConvertI1VectorToInteger(Op, DAG); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) return DAG.getBitcast(VT, Imm); SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } // Vector has one or more non-const elements uint64_t Immediate = 0; SmallVector
NonConstIdx; bool IsSplat = true; bool HasConstElts = false; int SplatIdx = -1; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; if (!isa
(In)) NonConstIdx.push_back(idx); else { Immediate |= cast
(In)->getZExtValue() << idx; HasConstElts = true; } if (SplatIdx == -1) SplatIdx = idx; else if (In != Op.getOperand(SplatIdx)) IsSplat = false; } // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), DAG.getConstant(1, dl, VT), DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; SDValue Imm; if (Immediate) { MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8)); Imm = DAG.getConstant(Immediate, dl, ImmVT); } else if (HasConstElts) Imm = DAG.getConstant(0, dl, VT); else Imm = DAG.getUNDEF(VT); if (Imm.getValueSizeInBits() == VT.getSizeInBits()) DstVec = DAG.getBitcast(VT, Imm); else { SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm); DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec, DAG.getIntPtrConstant(0, dl)); } for (unsigned i = 0; i < NonConstIdx.size(); ++i) { unsigned InsertIdx = NonConstIdx[i]; DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(InsertIdx), DAG.getIntPtrConstant(InsertIdx, dl)); } return DstVec; } /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. /// /// This is a helper function of LowerToHorizontalOp(). /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal /// operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->getOpcode() == ISD::UNDEF) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa
(Op0.getOperand(1)) && isa
(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast
(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast
(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.getOpcode() == ISD::UNDEF) { V0 = Op0.getOperand(0); if (V0.getValueType() != VT) return false; } } else { if (V1.getOpcode() == ISD::UNDEF) { V1 = Op0.getOperand(0); if (V1.getValueType() != VT) return false; } if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of LowerToHorizontalOp(). /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDLoc DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { EVT VT = V0.getValueType(); assert(VT.is256BitVector() && VT == V1.getValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); EVT NewVT = V0_LO.getValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || V1_LO->getOpcode() != ISD::UNDEF)) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || V1_HI->getOpcode() != ISD::UNDEF)) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB /// node. static SDValue LowerToAddSub(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64))) return SDValue(); SDLoc DL(BV); unsigned NumElts = VT.getVectorNumElements(); SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 || VT == MVT::v2f64) && "build_vector with an invalid type found!"); // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. // Even-numbered elements in the input build vector are obtained from // subtracting two integer/float elements. unsigned ExpectedOpcode = ISD::FSUB; unsigned NextExpectedOpcode = ISD::FADD; bool AddFound = false; bool SubFound = false; for (unsigned i = 0, e = NumElts; i != e; ++i) { SDValue Op = BV->getOperand(i); // Skip 'undef' values. unsigned Opcode = Op.getOpcode(); if (Opcode == ISD::UNDEF) { std::swap(ExpectedOpcode, NextExpectedOpcode); continue; } // Early exit if we found an unexpected opcode. if (Opcode != ExpectedOpcode) return SDValue(); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i)) // Early exit if we cannot match that sequence. if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa
(Op0.getOperand(1)) || !isa
(Op1.getOperand(1)) || Op0.getOperand(1) != Op1.getOperand(1)) return SDValue(); unsigned I0 = cast
(Op0.getOperand(1))->getZExtValue(); if (I0 != i) return SDValue(); // We found a valid add/sub node. Update the information accordingly. if (i & 1) AddFound = true; else SubFound = true; // Update InVec0 and InVec1. if (InVec0.getOpcode() == ISD::UNDEF) { InVec0 = Op0.getOperand(0); if (InVec0.getSimpleValueType() != VT) return SDValue(); } if (InVec1.getOpcode() == ISD::UNDEF) { InVec1 = Op1.getOperand(0); if (InVec1.getSimpleValueType() != VT) return SDValue(); } // Make sure that operands in input to each add/sub node always // come from a same pair of vectors. if (InVec0 != Op0.getOperand(0)) { if (ExpectedOpcode == ISD::FSUB) return SDValue(); // FADD is commutable. Try to commute the operands // and then test again. std::swap(Op0, Op1); if (InVec0 != Op0.getOperand(0)) return SDValue(); } if (InVec1 != Op1.getOperand(0)) return SDValue(); // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF && InVec1.getOpcode() != ISD::UNDEF) return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1); return SDValue(); } /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = BV->getSimpleValueType(0); unsigned NumElts = VT.getVectorNumElements(); unsigned NumUndefsLO = 0; unsigned NumUndefsHI = 0; unsigned Half = NumElts/2; // Count the number of UNDEF operands in the build_vector in input. for (unsigned i = 0, e = Half; i != e; ++i) if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) NumUndefsLO++; for (unsigned i = Half, e = NumElts; i != e; ++i) if (BV->getOperand(i)->getOpcode() == ISD::UNDEF) NumUndefsHI++; // Early exit if this is either a build_vector of all UNDEFs or all the // operands but one are UNDEF. if (NumUndefsLO + NumUndefsHI + 1 >= NumElts) return SDValue(); SDLoc DL(BV); SDValue InVec0, InVec1; if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) { // Try to match an SSE3 float HADD/HSUB. if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) { // Try to match an SSSE3 integer HADD/HSUB. if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1); } if (!Subtarget->hasAVX()) return SDValue(); if ((VT == MVT::v8f32 || VT == MVT::v4f64)) { // Try to match an AVX horizontal add/sub of packed single/double // precision floating point values from 256-bit vectors. SDValue InVec2, InVec3; if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1); if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1); } else if (VT == MVT::v8i32 || VT == MVT::v16i16) { // Try to match an AVX2 horizontal add/sub of signed integers. SDValue InVec2, InVec3; unsigned X86Opcode; bool CanFold = true; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) && isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) && ((InVec0.getOpcode() == ISD::UNDEF || InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) && ((InVec1.getOpcode() == ISD::UNDEF || InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3)) X86Opcode = X86ISD::HSUB; else CanFold = false; if (CanFold) { // Fold this build_vector into a single horizontal add/sub. // Do this only if the target has AVX2. if (Subtarget->hasAVX2()) return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1); // Do not try to expand this build_vector into a pair of horizontal // add/sub if we can emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into a pair of horizontal binop followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false, isUndefLO, isUndefHI); } } if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget->hasAVX()) { unsigned X86Opcode; if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HADD; else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::HSUB; else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHADD; else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1)) X86Opcode = X86ISD::FHSUB; else return SDValue(); // Don't try to expand this build_vector into a pair of horizontal add/sub // if we can simply emit a pair of scalar add/sub. if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) return SDValue(); // Convert this build_vector into two horizontal add/sub followed by // a concat vector. bool isUndefLO = NumUndefsLO == Half; bool isUndefHI = NumUndefsHI == Half; return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, isUndefLO, isUndefHI); } return SDValue(); } SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); MVT ExtVT = VT.getVectorElementType(); unsigned NumElems = Op.getNumOperands(); // Generate vectors for predicate vectors. if (VT.getVectorElementType() == MVT::i1 && Subtarget->hasAVX512()) return LowerBUILD_VECTORvXi1(Op, DAG); // Vectors containing all zeros can be matched by pxor and xorps later if (ISD::isBuildVectorAllZeros(Op.getNode())) { // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd // and 2) ensure that i64 scalars are eliminated on x86-32 hosts. if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) return Op; return getZeroVector(VT, Subtarget, DAG, dl); } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use // vpcmpeqd on 256-bit vectors. if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256())) return Op; if (!VT.is512BitVector()) return getOnesVector(VT, Subtarget, DAG, dl); } BuildVectorSDNode *BV = cast
(Op.getNode()); if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) return HorizontalOp; if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG)) return Broadcast; unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; unsigned NumNonZero = 0; uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet
Values; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.getOpcode() == ISD::UNDEF) continue; Values.insert(Elt); if (Elt.getOpcode() != ISD::Constant && Elt.getOpcode() != ISD::ConstantFP) IsAllConstants = false; if (X86::isZeroNode(Elt)) NumZero++; else { assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range. NonZeros |= ((uint64_t)1 << i); NumNonZero++; } } // All undef vector. Return an UNDEF. All zero vectors were handled above. if (NumNonZero == 0) return DAG.getUNDEF(VT); // Special case for single non-zero, non-undef, element. if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); // If this is an insertion of an i64 value on x86-32, and if the top bits of // the value are obviously zero, truncate the value to i32 and do the // insertion that way. Only do this if the value is non-constant or if the // value is a constant being inserted into element 0. It is cheaper to do // a constant pool load than it is to do a movd + shuffle. if (ExtVT == MVT::i64 && !Subtarget->is64Bit() && (!IsAllConstants || Idx == 0)) { if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) { // Handle SSE only. assert(VT == MVT::v2i64 && "Expected an SSE value type!"); MVT VecVT = MVT::v4i32; // Truncate the value (which may itself be a constant) to i32, and // convert it to a vector with movd (S2V+shuffle to zero extend). Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item); return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef( Item, Idx * 2, true, Subtarget, DAG)); } } // If we have a constant or non-constant insertion into the low element of // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into // the rest of the elements. This will be matched as movd/movq/movss/movsd // depending on what the source datatype is. if (Idx == 0) { if (NumZero == 0) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 || (ExtVT == MVT::i64 && Subtarget->is64Bit())) { if (VT.is512BitVector()) { SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec, Item, DAG.getIntPtrConstant(0, dl)); } assert((VT.is128BitVector() || VT.is256BitVector()) && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector. return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } // We can't directly insert an i8 or i16 into a vector, so zero extend // it to i32 first. if (ExtVT == MVT::i16 || ExtVT == MVT::i8) { Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); if (VT.is256BitVector()) { if (Subtarget->hasAVX()) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } else { // Without AVX, we need to extend to a 128-bit vector and then // insert into the 256-bit vector. Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl); Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl); } } else { assert(VT.is128BitVector() && "Expected an SSE value type!"); Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item); Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); } return DAG.getBitcast(VT, Item); } } // Is it a vector logical left shift? if (NumElems == 2 && Idx == 1 && X86::isZeroNode(Op.getOperand(0)) && !X86::isZeroNode(Op.getOperand(1))) { unsigned NumBits = VT.getSizeInBits(); return getVShift(true, VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(1)), NumBits/2, DAG, *this, dl); } if (IsAllConstants) // Otherwise, it's better to do a constpool load. return SDValue(); // Otherwise, if this is a vector with i32 or f32 elements, and the element // is a non-constant being inserted into an element other than the low one, // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka // movd/movss) to move this into the low element, then shuffle it into // place. if (EVTBits == 32) { Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); } } // Splat is obviously ok. Let legalizer expand it to a shuffle. if (Values.size() == 1) { if (EVTBits == 32) { // Instead of a shuffle like this: // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> // Check if it's possible to issue this instead. // shuffle (vload ptr)), undef, <1, 1, 1, 1> unsigned Idx = countTrailingZeros(NonZeros); SDValue Item = Op.getOperand(Idx); if (Op.getNode()->isOnlyUserOf(Item.getNode())) return LowerAsSplatVectorLoad(Item, VT, dl, DAG); } return SDValue(); } // A vector full of immediates; various special cases are already // handled, so this is best done with a single constant-pool load. if (IsAllConstants) return SDValue(); // For AVX-length vectors, see if we can use a vector load to get all of the // elements, otherwise build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { SmallVector
V(Op->op_begin(), Op->op_begin() + NumElems); // Check for a build vector of consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, makeArrayRef(&V[0], NumElems/2)); SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, makeArrayRef(&V[NumElems / 2], NumElems/2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl); return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl); } // Let legalizer expand 2-wide build_vectors. if (EVTBits == 64) { if (NumNonZero == 1) { // One half is zero or undef. unsigned Idx = countTrailingZeros(NonZeros); SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(Idx)); return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); } return SDValue(); } // If element VT is < 32 bits, convert it to inserts into a zero vector. if (EVTBits == 8 && NumElems == 16) if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget, *this)) return V; if (EVTBits == 16 && NumElems == 8) if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG, Subtarget, *this)) return V; // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS if (EVTBits == 32 && NumElems == 4) if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this)) return V; // If element VT is == 32 bits, turn it into a number of shuffles. SmallVector
V(NumElems); if (NumElems == 4 && NumZero > 0) { for (unsigned i = 0; i < 4; ++i) { bool isZero = !(NonZeros & (1ULL << i)); if (isZero) V[i] = getZeroVector(VT, Subtarget, DAG, dl); else V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); } for (unsigned i = 0; i < 2; ++i) { switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { default: break; case 0: V[i] = V[i*2]; // Must be a zero vector. break; case 1: V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]); break; case 2: V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]); break; case 3: V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]); break; } } bool Reverse1 = (NonZeros & 0x3) == 2; bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2; int MaskVec[] = { Reverse1 ? 1 : 0, Reverse1 ? 0 : 1, static_cast
(Reverse2 ? NumElems+1 : NumElems), static_cast
(Reverse2 ? NumElems : NumElems+1) }; return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]); } if (Values.size() > 1 && VT.is128BitVector()) { // Check for a build vector of consecutive loads. for (unsigned i = 0; i < NumElems; ++i) V[i] = Op.getOperand(i); // Check for elements which are consecutive loads. if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false)) return LD; // Check for a build vector from mostly shuffle plus few inserting. if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) return Sh; // For SSE 4.1, use insertps to put the high elements into the low element. if (Subtarget->hasSSE41()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); else Result = DAG.getUNDEF(VT); for (unsigned i = 1; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue; Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } return Result; } // Otherwise, expand into a number of unpckl*, start by extending each of // our (non-undef) elements to the full vector width with the element in the // bottom slot of the vector (which generates no code for SSE). for (unsigned i = 0; i < NumElems; ++i) { if (Op.getOperand(i).getOpcode() != ISD::UNDEF) V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); else V[i] = DAG.getUNDEF(VT); } // Next, we iteratively mix elements, e.g. for v4f32: // Step 1: unpcklps 0, 2 ==> X: , ?, 2, 0> // : unpcklps 1, 3 ==> Y: , ?, 3, 1> // Step 2: unpcklps X, Y ==> <3, 2, 1, 0> unsigned EltStride = NumElems >> 1; while (EltStride != 0) { for (unsigned i = 0; i < EltStride; ++i) { // If V[i+EltStride] is undef and this is the first round of mixing, // then it is safe to just drop this shuffle: V[i] is already in the // right place, the one element (since it's the first round) being // inserted as undef can be dropped. This isn't safe for successive // rounds because they will permute elements within both vectors. if (V[i+EltStride].getOpcode() == ISD::UNDEF && EltStride == NumElems/2) continue; V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]); } EltStride >>= 1; } return V[0]; } return SDValue(); } // 256-bit AVX can use the vinsertf128 instruction // to create 256-bit vectors from two other 128-bit ones. static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide"); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); if (ResVT.is256BitVector()) return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl); if (Op.getNumOperands() == 4) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SDValue V3 = Op.getOperand(2); SDValue V4 = Op.getOperand(3); return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl), Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl); } return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl); } static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); unsigned NumOfOperands = Op.getNumOperands(); assert(isPowerOf2_32(NumOfOperands) && "Unexpected number of operands in CONCAT_VECTORS"); SDValue Undef = DAG.getUNDEF(ResVT); if (NumOfOperands > 2) { // Specialize the cases when all, or all but one, of the operands are undef. unsigned NumOfDefinedOps = 0; unsigned OpIdx = 0; for (unsigned i = 0; i < NumOfOperands; i++) if (!Op.getOperand(i).isUndef()) { NumOfDefinedOps++; OpIdx = i; } if (NumOfDefinedOps == 0) return Undef; if (NumOfDefinedOps == 1) { unsigned SubVecNumElts = Op.getOperand(OpIdx).getValueType().getVectorNumElements(); SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, Op.getOperand(OpIdx), IdxVal); } MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); SmallVector
Ops; for (unsigned i = 0; i < NumOfOperands/2; i++) Ops.push_back(Op.getOperand(i)); SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); Ops.clear(); for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) Ops.push_back(Op.getOperand(i)); SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } // 2 operands SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); unsigned NumElems = ResVT.getVectorNumElements(); assert(V1.getValueType() == V2.getValueType() && V1.getValueType().getVectorNumElements() == NumElems/2 && "Unexpected operands in CONCAT_VECTORS"); if (ResVT.getSizeInBits() >= 16) return Op; // The operation is legal with KUNPCK bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); if (IsZeroV1 && IsZeroV2) return ZeroVec; SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); if (V2.isUndef()) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); if (IsZeroV2) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); if (V1.isUndef()) V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); if (IsZeroV1) return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); } static SDValue LowerCONCAT_VECTORS(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); if (VT.getVectorElementType() == MVT::i1) return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG); assert((VT.is256BitVector() && Op.getNumOperands() == 2) || (VT.is512BitVector() && (Op.getNumOperands() == 2 || Op.getNumOperands() == 4))); // AVX can use the vinsertf128 instruction to create 256-bit vectors // from two other 128-bit ones. // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors return LowerAVXCONCAT_VECTORS(Op, DAG); } //===----------------------------------------------------------------------===// // Vector shuffle lowering // // This is an experimental code path for lowering vector shuffles on x86. It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// \brief Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef
Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] != -1 && Mask[i] != i) return false; return true; } /// \brief Helper function to classify a mask as a single-input mask. /// /// This isn't a generic single-input test because in the vector shuffle /// lowering we canonicalize single inputs to be the first input operand. This /// means we can more quickly test for a single input by only checking whether /// an input from the second operand exists. We also assume that the size of /// mask corresponds to the size of the input vectors which isn't true in the /// fully general case. static bool isSingleInputShuffleMask(ArrayRef
Mask) { for (int M : Mask) if (M >= (int)Mask.size()) return false; return true; } /// \brief Test whether there are elements crossing 128-bit lanes in this /// shuffle mask. /// /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations /// and we routinely test for these. static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef
Mask) { int LaneSize = 128 / VT.getScalarSizeInBits(); int Size = Mask.size(); for (int i = 0; i < Size; ++i) if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) return true; return false; } /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane. /// /// This checks a shuffle mask to see if it is performing the same /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies /// that it is also not lane-crossing. It may however involve a blend from the /// same lane of a second vector. /// /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is /// non-trivial to compute in the face of undef lanes. The representation is /// *not* suitable for use with existing 128-bit shuffles as it will contain /// entries from both V1 and V2 inputs to the wider mask. static bool is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef
Mask, SmallVectorImpl
&RepeatedMask) { int LaneSize = 128 / VT.getScalarSizeInBits(); RepeatedMask.resize(LaneSize, -1); int Size = Mask.size(); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; if ((Mask[i] % Size) / LaneSize != i / LaneSize) // This entry crosses lanes, so there is no way to model this shuffle. return false; // Ok, handle the in-lane shuffles by detecting if and when they repeat. if (RepeatedMask[i % LaneSize] == -1) // This is the first non-undef entry in this slot of a 128-bit lane. RepeatedMask[i % LaneSize] = Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size; else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i]) // Found a mismatch with the repeated mask. return false; } return true; } /// \brief Checks whether a shuffle mask is equivalent to an explicit list of /// arguments. /// /// This is a fast way to test a shuffle mask against a fixed pattern: /// /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... } /// /// It returns true if the mask is exactly as wide as the argument list, and /// each element of the mask is either -1 (signifying undef) or the value given /// in the argument. static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef
Mask, ArrayRef
ExpectedMask) { if (Mask.size() != ExpectedMask.size()) return false; int Size = Mask.size(); // If the values are build vectors, we can look through them to find // equivalent inputs that make the shuffles equivalent. auto *BV1 = dyn_cast
(V1); auto *BV2 = dyn_cast
(V2); for (int i = 0; i < Size; ++i) if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) { auto *MaskBV = Mask[i] < Size ? BV1 : BV2; auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2; if (!MaskBV || !ExpectedBV || MaskBV->getOperand(Mask[i] % Size) != ExpectedBV->getOperand(ExpectedMask[i] % Size)) return false; } return true; } /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static SDValue getV4X86ShuffleImm8ForMask(ArrayRef
Mask, SDLoc DL, SelectionDAG &DAG) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; return DAG.getConstant(Imm, DL, MVT::i8); } /// \brief Compute whether each element of a shuffle is zeroable. /// /// A "zeroable" vector shuffle element is one which can be lowered to zero. /// Either it is an undef element in the shuffle mask, the element of the input /// referenced is undef, or the element of the input referenced is known to be /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle /// as many lanes with this technique as possible to simplify the remaining /// shuffle. static SmallBitVector computeZeroableShuffleElements(ArrayRef
Mask, SDValue V1, SDValue V2) { SmallBitVector Zeroable(Mask.size(), false); while (V1.getOpcode() == ISD::BITCAST) V1 = V1->getOperand(0); while (V2.getOpcode() == ISD::BITCAST) V2 = V2->getOperand(0); bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; // Handle the easy cases. if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) { Zeroable[i] = true; continue; } // If this is an index into a build_vector node (which has the same number // of elements), dig out the input value and use it. SDValue V = M < Size ? V1 : V2; if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) continue; SDValue Input = V.getOperand(M % Size); // The UNDEF opcode check really should be dead code here, but not quite // worth asserting on (it isn't invalid, just unexpected). if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input)) Zeroable[i] = true; } return Zeroable; } // X86 has dedicated unpack instructions that can handle specific blend // operations: UNPCKH and UNPCKL. static SDValue lowerVectorShuffleWithUNPCK(SDLoc DL, MVT VT, ArrayRef
Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { int NumElts = VT.getVectorNumElements(); int NumEltsInLane = 128 / VT.getScalarSizeInBits(); SmallVector
Unpckl; SmallVector
Unpckh; for (int i = 0; i < NumElts; ++i) { unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; int LoPos = (i % NumEltsInLane) / 2 + LaneStart + NumElts * (i % 2); int HiPos = LoPos + NumEltsInLane / 2; Unpckl.push_back(LoPos); Unpckh.push_back(HiPos); } if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); // Commute and try again. ShuffleVectorSDNode::commuteMask(Unpckl); if (isShuffleEquivalent(V1, V2, Mask, Unpckl)) return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1); ShuffleVectorSDNode::commuteMask(Unpckh); if (isShuffleEquivalent(V1, V2, Mask, Unpckh)) return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1); return SDValue(); } /// \brief Try to emit a bitmask instruction for a shuffle. /// /// This handles cases where we can model a blend exactly as a bitmask due to /// one of the inputs being zeroable. static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); MVT IntEltVT = MVT::getIntegerVT(NumEltBits); SDValue Zero = DAG.getConstant(0, DL, IntEltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, IntEltVT); if (EltVT.isFloatingPoint()) { Zero = DAG.getBitcast(EltVT, Zero); AllOnes = DAG.getBitcast(EltVT, AllOnes); } SmallVector
VMaskOps(Mask.size(), Zero); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Zeroable[i]) continue; if (Mask[i] % Size != i) return SDValue(); // Not a blend. if (!V) V = Mask[i] < Size ? V1 : V2; else if (V != (Mask[i] < Size ? V1 : V2)) return SDValue(); // Can only let one input through the mask. VMaskOps[i] = AllOnes; } if (!V) return SDValue(); // No non-zeroable elements! SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps); V = DAG.getNode(VT.isFloatingPoint() ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND, DL, VT, V, VMask); return V; } /// \brief Try to emit a blend instruction for a shuffle using bit math. /// /// This is used as a fallback approach when first class blend instructions are /// unavailable. Currently it is only suitable for integer vectors, but could /// be generalized for floating point vectors if desirable. static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { assert(VT.isInteger() && "Only supports integer vector types!"); MVT EltVT = VT.getVectorElementType(); int NumEltBits = EltVT.getSizeInBits(); SDValue Zero = DAG.getConstant(0, DL, EltVT); SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL, EltVT); SmallVector
MaskOps; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size) return SDValue(); // Shuffled input! MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); } SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps); V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); // We have to cast V2 around. MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, DAG.getBitcast(MaskVT, V1Mask), DAG.getBitcast(MaskVT, V2))); return DAG.getNode(ISD::OR, DL, VT, V1, V2); } /// \brief Try to emit a blend instruction for a shuffle. /// /// This doesn't do any checks for the availability of instructions for blending /// these values. It relies on the availability of the X86ISD::BLENDI pattern to /// be matched in the backend with the type given. What it does check for is /// that the shuffle mask is a blend, or convertible into a blend with zero. static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Original, const X86Subtarget *Subtarget, SelectionDAG &DAG) { bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode()); bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode()); SmallVector
Mask(Original.begin(), Original.end()); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); bool ForceV1Zero = false, ForceV2Zero = false; // Attempt to generate the binary blend mask. If an input is zero then // we can use any lane. // TODO: generalize the zero matching to any scalar like isShuffleEquivalent. unsigned BlendMask = 0; for (int i = 0, Size = Mask.size(); i < Size; ++i) { int M = Mask[i]; if (M < 0) continue; if (M == i) continue; if (M == i + Size) { BlendMask |= 1u << i; continue; } if (Zeroable[i]) { if (V1IsZero) { ForceV1Zero = true; Mask[i] = i; continue; } if (V2IsZero) { ForceV2Zero = true; BlendMask |= 1u << i; Mask[i] = i + Size; continue; } } return SDValue(); // Shuffled input! } // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. if (ForceV1Zero) V1 = getZeroVector(VT, Subtarget, DAG, DL); if (ForceV2Zero) V2 = getZeroVector(VT, Subtarget, DAG, DL); auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) { unsigned ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1u << i)) for (int j = 0; j != Scale; ++j) ScaledMask |= 1u << (i * Scale + j); return ScaledMask; }; switch (VT.SimpleTy) { case MVT::v2f64: case MVT::v4f32: case MVT::v4f64: case MVT::v8f32: return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); case MVT::v4i64: case MVT::v8i32: assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); // FALLTHROUGH case MVT::v2i64: case MVT::v4i32: // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into // that instruction. if (Subtarget->hasAVX2()) { // Scale the blend by the number of 32-bit dwords per element. int Scale = VT.getScalarSizeInBits() / 32; BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32; V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } // FALLTHROUGH case MVT::v8i16: { // For integer shuffles we need to expand the mask and cast the inputs to // v8i16s prior to blending. int Scale = 8 / VT.getVectorNumElements(); BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale); V1 = DAG.getBitcast(MVT::v8i16, V1); V2 = DAG.getBitcast(MVT::v8i16, V2); return DAG.getBitcast(VT, DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8))); } case MVT::v16i16: { assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!"); SmallVector
RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { // We can lower these with PBLENDW which is mirrored across 128-bit lanes. assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); BlendMask = 0; for (int i = 0; i < 8; ++i) if (RepeatedMask[i] >= 16) BlendMask |= 1u << i; return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, DAG.getConstant(BlendMask, DL, MVT::i8)); } } // FALLTHROUGH case MVT::v16i8: case MVT::v32i8: { assert((VT.is128BitVector() || Subtarget->hasAVX2()) && "256-bit byte-blends require AVX2 support!"); // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, DAG)) return Masked; // Scale the blend by the number of bytes per element. int Scale = VT.getScalarSizeInBits() / 8; // This form of blend is always done on bytes. Compute the byte vector // type. MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); // Compute the VSELECT mask. Note that VSELECT is really confusing in the // mix of LLVM's code generator and the x86 backend. We tell the code // generator that boolean values in the elements of an x86 vector register // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' // mapping a select to operand #1, and 'false' mapping to operand #2. The // reality in x86 is that vector masks (pre-AVX-512) use only the high bit // of the element (the remaining are ignored) and 0 in that high bit would // mean operand #1 while 1 in the high bit would mean operand #2. So while // the LLVM model for boolean values in vector elements gets the relevant // bit set, it is set backwards and over constrained relative to x86's // actual model. SmallVector
VSELECTMask; for (int i = 0, Size = Mask.size(); i < Size; ++i) for (int j = 0; j < Scale; ++j) VSELECTMask.push_back( Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, MVT::i8)); V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask), V1, V2)); } default: llvm_unreachable("Not a supported integer vector type!"); } } /// \brief Try to lower as a blend of elements from two inputs followed by /// a single-input permutation. /// /// This matches the pattern where we can blend elements from two inputs and /// then reduce the shuffle to a single-input permutation. static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { // We build up the blend mask while checking whether a blend is a viable way // to reduce the shuffle. SmallVector
BlendMask(Mask.size(), -1); SmallVector
PermuteMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); if (BlendMask[Mask[i] % Size] == -1) BlendMask[Mask[i] % Size] = Mask[i]; else if (BlendMask[Mask[i] % Size] != Mask[i]) return SDValue(); // Can't blend in the needed input! PermuteMask[i] = Mask[i] % Size; } SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } /// \brief Generic routine to decompose a shuffle and blend into indepndent /// blends and permutes. /// /// This matches the extremely common pattern for handling combined /// shuffle+blend operations on newer X86 ISAs where we have very fast blend /// operations. It will try to pick the best arrangement of shuffles and /// blends. static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { // Shuffle the input elements into the desired positions in V1 and V2 and // blend them together. SmallVector
V1Mask(Mask.size(), -1); SmallVector
V2Mask(Mask.size(), -1); SmallVector
BlendMask(Mask.size(), -1); for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] >= 0 && Mask[i] < Size) { V1Mask[i] = Mask[i]; BlendMask[i] = i; } else if (Mask[i] >= Size) { V2Mask[i] = Mask[i] - Size; BlendMask[i] = i + Size; } // Try to lower with the simpler initial blend strategy unless one of the // input shuffles would be a no-op. We prefer to shuffle inputs as the // shuffle may be able to fold with a load or other benefit. However, when // we'll have to do 2x as many shuffles in order to achieve this, blending // first is a better strategy. if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); } /// \brief Try to lower a vector shuffle as a byte rotation. /// /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will /// try to generically lower a vector shuffle through such an pattern. It /// does not check for the profitability of lowering either as PALIGNR or /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. /// This matches shuffle vectors that look like: /// /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] /// /// Essentially it concatenates V1 and V2, shifts right by some number of /// elements, and takes the low elements as the result. Note that while this is /// specified as a *right shift* because x86 is little-endian, it is a *left /// rotate* of the vector lanes. static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); int NumElts = Mask.size(); int NumLanes = VT.getSizeInBits() / 128; int NumLaneElts = NumElts / NumLanes; // We need to detect various ways of spelling a rotation: // [11, 12, 13, 14, 15, 0, 1, 2] // [-1, 12, 13, 14, -1, -1, 1, -1] // [-1, -1, -1, -1, -1, -1, 1, 2] // [ 3, 4, 5, 6, 7, 8, 9, 10] // [-1, 4, 5, 6, -1, -1, 9, -1] // [-1, 4, 5, 6, -1, -1, -1, -1] int Rotation = 0; SDValue Lo, Hi; for (int l = 0; l < NumElts; l += NumLaneElts) { for (int i = 0; i < NumLaneElts; ++i) { if (Mask[l + i] == -1) continue; assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!"); // Get the mod-Size index and lane correct it. int LaneIdx = (Mask[l + i] % NumElts) - l; // Make sure it was in this lane. if (LaneIdx < 0 || LaneIdx >= NumLaneElts) return SDValue(); // Determine where a rotated vector would have started. int StartIdx = i - LaneIdx; if (StartIdx == 0) // The identity rotation isn't interesting, stop. return SDValue(); // If we found the tail of a vector the rotation must be the missing // front. If we found the head of a vector, it must be how much of the // head. int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx; if (Rotation == 0) Rotation = CandidateRotation; else if (Rotation != CandidateRotation) // The rotations don't match, so we can't match this mask. return SDValue(); // Compute which value this mask is pointing at. SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2; // Compute which of the two target values this index should be assigned // to. This reflects whether the high elements are remaining or the low // elements are remaining. SDValue &TargetV = StartIdx < 0 ? Hi : Lo; // Either set up this value if we've not encountered it before, or check // that it remains consistent. if (!TargetV) TargetV = MaskV; else if (TargetV != MaskV) // This may be a rotation, but it pulls from the inputs in some // unsupported interleaving. return SDValue(); } } // Check that we successfully analyzed the mask, and normalize the results. assert(Rotation != 0 && "Failed to locate a viable rotation!"); assert((Lo || Hi) && "Failed to find a rotated input vector!"); if (!Lo) Lo = Hi; else if (!Hi) Hi = Lo; // The actual rotate instruction rotates bytes, so we need to scale the // rotation based on how many bytes are in the vector lane. int Scale = 16 / NumLaneElts; // SSSE3 targets can use the palignr instruction. if (Subtarget->hasSSSE3()) { // Cast the inputs to i8 vector of correct length to match PALIGNR. MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes); Lo = DAG.getBitcast(AlignVT, Lo); Hi = DAG.getBitcast(AlignVT, Hi); return DAG.getBitcast( VT, DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Lo, Hi, DAG.getConstant(Rotation * Scale, DL, MVT::i8))); } assert(VT.is128BitVector() && "Rotate-based lowering only supports 128-bit lowering!"); assert(Mask.size() <= 16 && "Can shuffle at most 16 bytes in a 128-bit vector!"); // Default SSE2 implementation int LoByteShift = 16 - Rotation * Scale; int HiByteShift = Rotation * Scale; // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ. Lo = DAG.getBitcast(MVT::v2i64, Lo); Hi = DAG.getBitcast(MVT::v2i64, Hi); SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo, DAG.getConstant(LoByteShift, DL, MVT::i8)); SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi, DAG.getConstant(HiByteShift, DL, MVT::i8)); return DAG.getBitcast(VT, DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift)); } /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros). /// /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function /// matches elements from one of the input vectors shuffled to the left or /// right with zeroable elements 'shifted in'. It handles both the strictly /// bit-wise element shifts and the byte shift across an entire 128-bit double /// quad word lane. /// /// PSHL : (little-endian) left bit shift. /// [ zz, 0, zz, 2 ] /// [ -1, 4, zz, -1 ] /// PSRL : (little-endian) right bit shift. /// [ 1, zz, 3, zz] /// [ -1, -1, 7, zz] /// PSLLDQ : (little-endian) left byte shift /// [ zz, 0, 1, 2, 3, 4, 5, 6] /// [ zz, zz, -1, -1, 2, 3, 4, -1] /// [ zz, zz, zz, zz, zz, zz, -1, 1] /// PSRLDQ : (little-endian) right byte shift /// [ 5, 6, 7, zz, zz, zz, zz, zz] /// [ -1, 5, 6, 7, zz, zz, zz, zz] /// [ 1, 2, -1, -1, -1, -1, zz, zz] static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Size = Mask.size(); assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); auto CheckZeros = [&](int Shift, int Scale, bool Left) { for (int i = 0; i < Size; i += Scale) for (int j = 0; j < Shift; ++j) if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) return false; return true; }; auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) { for (int i = 0; i != Size; i += Scale) { unsigned Pos = Left ? i + Shift : i; unsigned Low = Left ? i : i + Shift; unsigned Len = Scale - Shift; if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + (V == V1 ? 0 : Size))) return SDValue(); } int ShiftEltBits = VT.getScalarSizeInBits() * Scale; bool ByteShift = ShiftEltBits > 64; unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1); // Normalize the scale for byte shifts to still produce an i64 element // type. Scale = ByteShift ? Scale / 2 : Scale; // We need to round trip through the appropriate type for the shift. MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale); MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale); assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && "Illegal integer vector type"); V = DAG.getBitcast(ShiftVT, V); V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, DL, MVT::i8)); return DAG.getBitcast(VT, V); }; // SSE/AVX supports logical shifts up to 64-bit integers - so we can just // keep doubling the size of the integer elements up to that. We can // then shift the elements of the integer vector by whole multiples of // their width within the elements of the larger integer vector. Test each // multiple to see if we can find a match with the moved element indices // and that the shifted in elements are all zeroable. for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2) for (int Shift = 1; Shift != Scale; ++Shift) for (bool Left : {true, false}) if (CheckZeros(Shift, Scale, Left)) for (SDValue V : {V1, V2}) if (SDValue Match = MatchShift(Shift, Scale, Left, V)) return Match; // no match return SDValue(); } /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. static SDValue lowerVectorShuffleWithSSE4A(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); assert(!Zeroable.all() && "Fully zeroable shuffle mask"); int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) return SDValue(); // EXTRQ: Extract Len elements from lower half of source, starting at Idx. // Remainder of lower half result is zero and upper half is all undef. auto LowerAsEXTRQ = [&]() { // Determine the extraction length from the part of the // lower half that isn't zeroable. int Len = HalfSize; for (; Len > 0; --Len) if (!Zeroable[Len - 1]) break; assert(Len > 0 && "Zeroable shuffle mask"); // Attempt to match first Len sequential elements from the lower half. SDValue Src; int Idx = -1; for (int i = 0; i != Len; ++i) { int M = Mask[i]; if (M < 0) continue; SDValue &V = (M < Size ? V1 : V2); M = M % Size; // The extracted elements must start at a valid index and all mask // elements must be in the lower half. if (i > M || M >= HalfSize) return SDValue(); if (Idx < 0 || (Src == V && Idx == (M - i))) { Src = V; Idx = M - i; continue; } return SDValue(); } if (Idx < 0) return SDValue(); assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); }; if (SDValue ExtrQ = LowerAsEXTRQ()) return ExtrQ; // INSERTQ: Extract lowest Len elements from lower half of second source and // insert over first source, starting at Idx. // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } auto LowerAsInsertQ = [&]() { for (int Idx = 0; Idx != HalfSize; ++Idx) { SDValue Base; // Attempt to match first source from mask before insertion point. if (isUndefInRange(Mask, 0, Idx)) { /* EMPTY */ } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { Base = V1; } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { Base = V2; } else { continue; } // Extend the extraction length looking to match both the insertion of // the second source and the remaining elements of the first. for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { SDValue Insert; int Len = Hi - Idx; // Match insertion. if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { Insert = V1; } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { Insert = V2; } else { continue; } // Match the remaining elements of the lower half. if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ } else if ((!Base || (Base == V1)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; } else if ((!Base || (Base == V2)) && isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Size + Hi)) { Base = V2; } else { continue; } // We may not have a base (first source) - this can safely be undefined. if (!Base) Base = DAG.getUNDEF(VT); int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); } } return SDValue(); }; if (SDValue InsertQ = LowerAsInsertQ()) return InsertQ; return SDValue(); } /// \brief Lower a vector shuffle as a zero or any extension. /// /// Given a specific number of elements, element bit width, and extension /// stride, produce either a zero or any extension based on the available /// features of the subtarget. The extended elements are consecutive and /// begin and can start from an offseted element index in the input; to /// avoid excess shuffling the offset must either being in the bottom lane /// or at the start of a higher lane. All extended elements must be from /// the same lane. static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend( SDLoc DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, ArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Scale > 1 && "Need a scale to extend."); int EltBits = VT.getScalarSizeInBits(); int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = 128 / EltBits; int OffsetLane = Offset / NumEltsPerLane; assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && "Only 8, 16, and 32 bit elements can be extended."); assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); assert(0 <= Offset && "Extension offset must be positive."); assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && "Extension offset must be in the first lane or start an upper lane."); // Check that an index is in same lane as the base offset. auto SafeOffset = [&](int Idx) { return OffsetLane == (Idx / NumEltsPerLane); }; // Shift along an input so that the offset base moves to the first element. auto ShuffleOffset = [&](SDValue V) { if (!Offset) return V; SmallVector
ShMask((unsigned)NumElements, -1); for (int i = 0; i * Scale < NumElements; ++i) { int SrcIdx = i + Offset; ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; } return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); }; // Found a valid zext mask! Try various lowering strategies based on the // input type and available ISA extensions. if (Subtarget->hasSSE41()) { // Not worth offseting 128-bit vectors if scale == 2, a pattern using // PUNPCK will catch this in a later shuffle match. if (Offset && Scale == 2 && VT.is128BitVector()) return SDValue(); MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), NumElements / Scale); InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, ShuffleOffset(InputV)); return DAG.getBitcast(VT, InputV); } assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); // For any extends we can cheat for larger element sizes and use shuffle // instructions that can fold with a load and/or copy. if (AnyExt && EltBits == 32) { int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, -1}; return DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); } if (AnyExt && EltBits == 16 && Scale > 2) { int PSHUFDMask[4] = {Offset / 2, -1, SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1}; InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, InputV), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); int PSHUFWMask[4] = {1, -1, -1, -1}; unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW); return DAG.getBitcast( VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, InputV), getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG))); } // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes // to 64-bits. if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget->hasSSE4A()) { assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!"); assert(VT.is128BitVector() && "Unexpected vector width!"); int LoIdx = Offset * EltBits; SDValue Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(LoIdx, DL, MVT::i8))); if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) || !SafeOffset(Offset + 1)) return DAG.getNode(ISD::BITCAST, DL, VT, Lo); int HiIdx = (Offset + 1) * EltBits; SDValue Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV, DAG.getConstant(EltBits, DL, MVT::i8), DAG.getConstant(HiIdx, DL, MVT::i8))); return DAG.getNode(ISD::BITCAST, DL, VT, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi)); } // If this would require more than 2 unpack instructions to expand, use // pshufb when available. We can only use more than 2 unpack instructions // when zero extending i8 elements which also makes it easier to use pshufb. if (Scale > 4 && EltBits == 8 && Subtarget->hasSSSE3()) { assert(NumElements == 16 && "Unexpected byte vector width!"); SDValue PSHUFBMask[16]; for (int i = 0; i < 16; ++i) { int Idx = Offset + (i / Scale); PSHUFBMask[i] = DAG.getConstant( (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8); } InputV = DAG.getBitcast(MVT::v16i8, InputV); return DAG.getBitcast(VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV, DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, PSHUFBMask))); } // If we are extending from an offset, ensure we start on a boundary that // we can unpack from. int AlignToUnpack = Offset % (NumElements / Scale); if (AlignToUnpack) { SmallVector
ShMask((unsigned)NumElements, -1); for (int i = AlignToUnpack; i < NumElements; ++i) ShMask[i - AlignToUnpack] = i; InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask); Offset -= AlignToUnpack; } // Otherwise emit a sequence of unpacks. do { unsigned UnpackLoHi = X86ISD::UNPCKL; if (Offset >= (NumElements / 2)) { UnpackLoHi = X86ISD::UNPCKH; Offset -= (NumElements / 2); } MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements); SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT) : getZeroVector(InputVT, Subtarget, DAG, DL); InputV = DAG.getBitcast(InputVT, InputV); InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext); Scale /= 2; EltBits *= 2; NumElements /= 2; } while (Scale > 1); return DAG.getBitcast(VT, InputV); } /// \brief Try to lower a vector shuffle as a zero extension on any microarch. /// /// This routine will try to do everything in its power to cleverly lower /// a shuffle which happens to match the pattern of a zero extend. It doesn't /// check for the profitability of this lowering, it tries to aggressively /// match this pattern. It will use all of the micro-architectural details it /// can to emit an efficient lowering. It handles both blends with all-zero /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to /// masking out later). /// /// The reason we have dedicated lowering for zext-style shuffles is that they /// are both incredibly common and often quite performance sensitive. static SDValue lowerVectorShuffleAsZeroOrAnyExtend( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); int Bits = VT.getSizeInBits(); int NumLanes = Bits / 128; int NumElements = VT.getVectorNumElements(); int NumEltsPerLane = NumElements / NumLanes; assert(VT.getScalarSizeInBits() <= 32 && "Exceeds 32-bit integer zero extension limit"); assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size"); // Define a helper function to check a particular ext-scale and lower to it if // valid. auto Lower = [&](int Scale) -> SDValue { SDValue InputV; bool AnyExt = true; int Offset = 0; int Matches = 0; for (int i = 0; i < NumElements; ++i) { int M = Mask[i]; if (M == -1) continue; // Valid anywhere but doesn't tell us anything. if (i % Scale != 0) { // Each of the extended elements need to be zeroable. if (!Zeroable[i]) return SDValue(); // We no longer are in the anyext case. AnyExt = false; continue; } // Each of the base elements needs to be consecutive indices into the // same input vector. SDValue V = M < NumElements ? V1 : V2; M = M % NumElements; if (!InputV) { InputV = V; Offset = M - (i / Scale); } else if (InputV != V) return SDValue(); // Flip-flopping inputs. // Offset must start in the lowest 128-bit lane or at the start of an // upper lane. // FIXME: Is it ever worth allowing a negative base offset? if (!((0 <= Offset && Offset < NumEltsPerLane) || (Offset % NumEltsPerLane) == 0)) return SDValue(); // If we are offsetting, all referenced entries must come from the same // lane. if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane)) return SDValue(); if ((M % NumElements) != (Offset + (i / Scale))) return SDValue(); // Non-consecutive strided elements. Matches++; } // If we fail to find an input, we have a zero-shuffle which should always // have already been handled. // FIXME: Maybe handle this here in case during blending we end up with one? if (!InputV) return SDValue(); // If we are offsetting, don't extend if we only match a single input, we // can always do better by using a basic PSHUF or PUNPCK. if (Offset != 0 && Matches < 2) return SDValue(); return lowerVectorShuffleAsSpecificZeroOrAnyExtend( DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG); }; // The widest scale possible for extending is to a 64-bit integer. assert(Bits % 64 == 0 && "The number of bits in a vector must be divisible by 64 on x86!"); int NumExtElements = Bits / 64; // Each iteration, try extending the elements half as much, but into twice as // many elements. for (; NumExtElements < NumElements; NumExtElements *= 2) { assert(NumElements % NumExtElements == 0 && "The input vector size must be divisible by the extended size."); if (SDValue V = Lower(NumElements / NumExtElements)) return V; } // General extends failed, but 128-bit vectors may be able to use MOVQ. if (Bits != 128) return SDValue(); // Returns one of the source operands if the shuffle can be reduced to a // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits. auto CanZExtLowHalf = [&]() { for (int i = NumElements / 2; i != NumElements; ++i) if (!Zeroable[i]) return SDValue(); if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0)) return V1; if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements)) return V2; return SDValue(); }; if (SDValue V = CanZExtLowHalf()) { V = DAG.getBitcast(MVT::v2i64, V); V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V); return DAG.getBitcast(VT, V); } // No viable ext lowering found. return SDValue(); } /// \brief Try to get a scalar value for a specific element of a vector. /// /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar. static SDValue getScalarValueForVectorElement(SDValue V, int Idx, SelectionDAG &DAG) { MVT VT = V.getSimpleValueType(); MVT EltVT = VT.getVectorElementType(); while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); // If the bitcasts shift the element size, we can't extract an equivalent // element from it. MVT NewVT = V.getSimpleValueType(); if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) return SDValue(); if (V.getOpcode() == ISD::BUILD_VECTOR || (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) { // Ensure the scalar operand is the same size as the destination. // FIXME: Add support for scalar truncation where possible. SDValue S = V.getOperand(Idx); if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits()) return DAG.getNode(ISD::BITCAST, SDLoc(V), EltVT, S); } return SDValue(); } /// \brief Helper to test for a load that can be folded with x86 shuffles. /// /// This is particularly important because the set of instructions varies /// significantly based on whether the operand is a load or not. static bool isShuffleFoldableLoad(SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return ISD::isNON_EXTLoad(V.getNode()); } /// \brief Try to lower insertion of a single element into a zero vector. /// /// This is a common pattern that we have especially efficient patterns to lower /// across all subtarget feature sets. static SDValue lowerVectorShuffleAsElementInsertion( SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); MVT ExtVT = VT; MVT EltVT = VT.getVectorElementType(); int V2Index = std::find_if(Mask.begin(), Mask.end(), [&Mask](int M) { return M >= (int)Mask.size(); }) - Mask.begin(); bool IsV1Zeroable = true; for (int i = 0, Size = Mask.size(); i < Size; ++i) if (i != V2Index && !Zeroable[i]) { IsV1Zeroable = false; break; } // Check for a single input from a SCALAR_TO_VECTOR node. // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and // all the smarts here sunk into that routine. However, the current // lowering of BUILD_VECTOR makes that nearly impossible until the old // vector shuffle lowering is dead. SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(), DAG); if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) { // We need to zext the scalar if it is smaller than an i32. V2S = DAG.getBitcast(EltVT, V2S); if (EltVT == MVT::i8 || EltVT == MVT::i16) { // Using zext to expand a narrow element won't work for non-zero // insertions. if (!IsV1Zeroable) return SDValue(); // Zero-extend directly to i32. ExtVT = MVT::v4i32; V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 || EltVT == MVT::i16) { // Either not inserting from the low element of the input or the input // element size is too small to use VZEXT_MOVL to clear the high bits. return SDValue(); } if (!IsV1Zeroable) { // If V1 can't be treated as a zero vector we have fewer options to lower // this. We can't support integer vectors or non-zero targets cheaply, and // the V1 elements can't be permuted in any way. assert(VT == ExtVT && "Cannot change extended type when non-zeroable!"); if (!VT.isFloatingPoint() || V2Index != 0) return SDValue(); SmallVector
V1Mask(Mask.begin(), Mask.end()); V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); // This is essentially a special case blend operation, but if we have // general purpose blend operations, they are always faster. Bail and let // the rest of the lowering handle these as blends. if (Subtarget->hasSSE41()) return SDValue(); // Otherwise, use MOVSD or MOVSS. assert((EltVT == MVT::f32 || EltVT == MVT::f64) && "Only two types of floating point element types to handle!"); return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL, ExtVT, V1, V2); } // This lowering only works for the low element with floating point vectors. if (VT.isFloatingPoint() && V2Index != 0) return SDValue(); V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2); if (ExtVT != VT) V2 = DAG.getBitcast(VT, V2); if (V2Index != 0) { // If we have 4 or fewer lanes we can cheaply shuffle the element into // the desired position. Otherwise it is more efficient to do a vector // shift left. We know that we can do a vector shift left because all // the inputs are zero. if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) { SmallVector
V2Shuffle(Mask.size(), 1); V2Shuffle[V2Index] = 0; V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle); } else { V2 = DAG.getBitcast(MVT::v2i64, V2); V2 = DAG.getNode( X86ISD::VSHLDQ, DL, MVT::v2i64, V2, DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, DAG.getTargetLoweringInfo().getScalarShiftAmountTy( DAG.getDataLayout(), VT))); V2 = DAG.getBitcast(VT, V2); } } return V2; } /// \brief Try to lower broadcast of a single - truncated - integer element, /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements. /// /// This assumes we have AVX2. static SDValue lowerVectorShuffleAsTruncBroadcast(SDLoc DL, MVT VT, SDValue V0, int BroadcastIdx, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(Subtarget->hasAVX2() && "We can only lower integer broadcasts with AVX2!"); EVT EltVT = VT.getVectorElementType(); EVT V0VT = V0.getValueType(); assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!"); assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!"); EVT V0EltVT = V0VT.getVectorElementType(); if (!V0EltVT.isInteger()) return SDValue(); const unsigned EltSize = EltVT.getSizeInBits(); const unsigned V0EltSize = V0EltVT.getSizeInBits(); // This is only a truncation if the original element type is larger. if (V0EltSize <= EltSize) return SDValue(); assert(((V0EltSize % EltSize) == 0) && "Scalar type sizes must all be powers of 2 on x86!"); const unsigned V0Opc = V0.getOpcode(); const unsigned Scale = V0EltSize / EltSize; const unsigned V0BroadcastIdx = BroadcastIdx / Scale; if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) && V0Opc != ISD::BUILD_VECTOR) return SDValue(); SDValue Scalar = V0.getOperand(V0BroadcastIdx); // If we're extracting non-least-significant bits, shift so we can truncate. // Hopefully, we can fold away the trunc/srl/load into the broadcast. // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd. if (const int OffsetIdx = BroadcastIdx % Scale) Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar, DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType())); return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar)); } /// \brief Try to lower broadcast of a single element. /// /// For convenience, this code also bundles all of the subtarget feature set /// filtering. While a little annoying to re-dispatch on type here, there isn't /// a convenient way to factor it out. /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them? static SDValue lowerVectorShuffleAsBroadcast(SDLoc DL, MVT VT, SDValue V, ArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { if (!Subtarget->hasAVX()) return SDValue(); if (VT.isInteger() && !Subtarget->hasAVX2()) return SDValue(); // Check that the mask is a broadcast. int BroadcastIdx = -1; for (int M : Mask) if (M >= 0 && BroadcastIdx == -1) BroadcastIdx = M; else if (M >= 0 && M != BroadcastIdx) return SDValue(); assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with " "a sorted mask where the broadcast " "comes from V1."); // Go up the chain of (vector) values to find a scalar load that we can // combine with the broadcast. for (;;) { switch (V.getOpcode()) { case ISD::CONCAT_VECTORS: { int OperandSize = Mask.size() / V.getNumOperands(); V = V.getOperand(BroadcastIdx / OperandSize); BroadcastIdx %= OperandSize; continue; } case ISD::INSERT_SUBVECTOR: { SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1); auto ConstantIdx = dyn_cast
(V.getOperand(2)); if (!ConstantIdx) break; int BeginIdx = (int)ConstantIdx->getZExtValue(); int EndIdx = BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements(); if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) { BroadcastIdx -= BeginIdx; V = VInner; } else { V = VOuter; } continue; } } break; } // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. // First, look through bitcast: if the original value has a larger element // type than the shuffle, the broadcast element is in essence truncated. // Make that explicit to ease folding. if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast( DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG)) return TruncBroadcast; // Also check the simpler case, where we can directly reuse the scalar. if (V.getOpcode() == ISD::BUILD_VECTOR || (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) { V = V.getOperand(BroadcastIdx); // If the scalar isn't a load, we can't broadcast from it in AVX1. // Only AVX2 has register broadcasts. if (!Subtarget->hasAVX2() && !isShuffleFoldableLoad(V)) return SDValue(); } else if (MayFoldLoad(V) && !cast
(V)->isVolatile()) { // If we are broadcasting a load that is only used by the shuffle // then we can reduce the vector load to the broadcasted scalar load. LoadSDNode *Ld = cast
(V); SDValue BaseAddr = Ld->getOperand(1); EVT AddrVT = BaseAddr.getValueType(); EVT SVT = VT.getScalarType(); unsigned Offset = BroadcastIdx * SVT.getStoreSize(); SDValue NewAddr = DAG.getNode( ISD::ADD, DL, AddrVT, BaseAddr, DAG.getConstant(Offset, DL, AddrVT)); V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); } else if (BroadcastIdx != 0 || !Subtarget->hasAVX2()) { // We can't broadcast from a vector register without AVX2, and we can only // broadcast from the zero-element of a vector register. return SDValue(); } return DAG.getNode(X86ISD::VBROADCAST, DL, VT, V); } // Check for whether we can use INSERTPS to perform the shuffle. We only use // INSERTPS when the V1 elements are already in the correct locations // because otherwise we can just always use two SHUFPS instructions which // are much smaller to encode than a SHUFPS and an INSERTPS. We can also // perform INSERTPS if a single V1 element is out of place and all V2 // elements are zeroable. static SDValue lowerVectorShuffleAsInsertPS(SDValue Op, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); unsigned ZMask = 0; int V1DstIndex = -1; int V2DstIndex = -1; bool V1UsedInPlace = false; for (int i = 0; i < 4; ++i) { // Synthesize a zero mask from the zeroable elements (includes undefs). if (Zeroable[i]) { ZMask |= 1 << i; continue; } // Flag if we use any V1 inputs in place. if (i == Mask[i]) { V1UsedInPlace = true; continue; } // We can only insert a single non-zeroable element. if (V1DstIndex != -1 || V2DstIndex != -1) return SDValue(); if (Mask[i] < 4) { // V1 input out of place for insertion. V1DstIndex = i; } else { // V2 input for insertion. V2DstIndex = i; } } // Don't bother if we have no (non-zeroable) element for insertion. if (V1DstIndex == -1 && V2DstIndex == -1) return SDValue(); // Determine element insertion src/dst indices. The src index is from the // start of the inserted vector, not the start of the concatenated vector. unsigned V2SrcIndex = 0; if (V1DstIndex != -1) { // If we have a V1 input out of place, we use V1 as the V2 element insertion // and don't use the original V2 at all. V2SrcIndex = Mask[V1DstIndex]; V2DstIndex = V1DstIndex; V2 = V1; } else { V2SrcIndex = Mask[V2DstIndex] - 4; } // If no V1 inputs are used in place, then the result is created only from // the zero mask and the V2 insertion - so remove V1 dependency. if (!V1UsedInPlace) V1 = DAG.getUNDEF(MVT::v4f32); unsigned InsertPSMask = V2SrcIndex << 6 | V2DstIndex << 4 | ZMask; assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); // Insert the V2 element into the desired position. SDLoc DL(Op); return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } /// \brief Try to lower a shuffle as a permute of the inputs followed by an /// UNPCK instruction. /// /// This specifically targets cases where we end up with alternating between /// the two inputs, and so can permute them into something that feeds a single /// UNPCK instruction. Note that this routine only targets integer vectors /// because for floating point vectors we have a generalized SHUFPS lowering /// strategy that handles everything that doesn't *exactly* match an unpack, /// making this clever lowering unnecessary. static SDValue lowerVectorShuffleAsPermuteAndUnpack(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG) { assert(!VT.isFloatingPoint() && "This routine only supports integer vectors."); assert(!isSingleInputShuffleMask(Mask) && "This routine should only be used when blending two inputs."); assert(Mask.size() >= 2 && "Single element masks are invalid."); int Size = Mask.size(); int NumLoInputs = std::count_if(Mask.begin(), Mask.end(), [Size](int M) { return M >= 0 && M % Size < Size / 2; }); int NumHiInputs = std::count_if( Mask.begin(), Mask.end(), [Size](int M) { return M % Size >= Size / 2; }); bool UnpackLo = NumLoInputs >= NumHiInputs; auto TryUnpack = [&](MVT UnpackVT, int Scale) { SmallVector
V1Mask(Mask.size(), -1); SmallVector
V2Mask(Mask.size(), -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; // Each element of the unpack contains Scale elements from this mask. int UnpackIdx = i / Scale; // We only handle the case where V1 feeds the first slots of the unpack. // We rely on canonicalization to ensure this is the case. if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) return SDValue(); // Setup the mask for this input. The indexing is tricky as we have to // handle the unpack stride. SmallVectorImpl
&VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size; } // If we will have to shuffle both inputs to use the unpack, check whether // we can just unpack first and shuffle the result. If so, skip this unpack. if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) return SDValue(); // Shuffle the inputs into place. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); // Cast the inputs to the type we will use to unpack them. V1 = DAG.getBitcast(UnpackVT, V1); V2 = DAG.getBitcast(UnpackVT, V2); // Unpack the inputs and cast the result back to the desired type. return DAG.getBitcast( VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, UnpackVT, V1, V2)); }; // We try each unpack from the largest to the smallest to try and find one // that fits this mask. int OrigNumElements = VT.getVectorNumElements(); int OrigScalarSize = VT.getScalarSizeInBits(); for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) { int Scale = ScalarSize / OrigScalarSize; int NumElements = OrigNumElements / Scale; MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), NumElements); if (SDValue Unpack = TryUnpack(UnpackVT, Scale)) return Unpack; } // If none of the unpack-rooted lowerings worked (or were profitable) try an // initial unpack. if (NumLoInputs == 0 || NumHiInputs == 0) { assert((NumLoInputs > 0 || NumHiInputs > 0) && "We have to have *some* inputs!"); int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; // FIXME: We could consider the total complexity of the permute of each // possible unpacking. Or at the least we should consider how many // half-crossings are created. // FIXME: We could consider commuting the unpacks. SmallVector
PermMask; PermMask.assign(Size, -1); for (int i = 0; i < Size; ++i) { if (Mask[i] < 0) continue; assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); } return DAG.getVectorShuffle( VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, V1, V2), DAG.getUNDEF(VT), PermMask); } return SDValue(); } /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Use low duplicate instructions for masks that match their pattern. if (Subtarget->hasSSE3()) if (isShuffleEquivalent(V1, V2, Mask, {0, 0})) return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, V1); // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); // If we have a single input, insert that into V1 if we can do so cheaply. if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) { if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2f64, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; } // Try to use one of the special instruction patterns to handle two common // blend patterns if a zero-blend above didn't work. if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) || isShuffleEquivalent(V1, V2, Mask, {1, 3})) if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) // We can either use a special instruction to load over the low double or // to move just the low double. return DAG.getNode( isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD, DL, MVT::v2f64, V2, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); if (Subtarget->hasSSE41()) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) return V; unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, DL, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v2i64, V1, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getBitcast(MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getBitcast( MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); } assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); // If we have a blend of two PACKUS operations an the blend aligns with the // low and half halves, we can just merge the PACKUS operations. This is // particularly important as it lets us merge shuffles that this routine itself // creates. auto GetPackNode = [](SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); }; if (SDValue V1Pack = GetPackNode(V1)) if (SDValue V2Pack = GetPackNode(V2)) return DAG.getBitcast(MVT::v2i64, DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Mask[0] == 0 ? V1Pack.getOperand(0) : V1Pack.getOperand(1), Mask[1] == 2 ? V2Pack.getOperand(0) : V2Pack.getOperand(1))); // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, DAG)) return Shift; // When loading a scalar and then shuffling it into a vector we can often do // the insertion cheaply. if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Insertion; // Try inverting the insertion since for v2 masks it is easy to do and we // can't reliably sort the mask one way or the other. int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; if (SDValue Insertion = lowerVectorShuffleAsElementInsertion( DL, MVT::v2i64, V2, V1, InverseMask, Subtarget, DAG)) return Insertion; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget->hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2, Mask, DAG); // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getBitcast(MVT::v2f64, V1); V2 = DAG.getBitcast(MVT::v2f64, V2); return DAG.getBitcast(MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// \brief Test whether this can be lowered with a single SHUFPS instruction. /// /// This is used to disable more specialized lowerings when the shufps lowering /// will happen to be efficient. static bool isSingleSHUFPSMask(ArrayRef
Mask) { // This routine only handles 128-bit shufps. assert(Mask.size() == 4 && "Unsupported mask size!"); // To lower with a single SHUFPS we need to have the low half and high half // each requiring a single input. if (Mask[0] != -1 && Mask[1] != -1 && (Mask[0] < 4) != (Mask[1] < 4)) return false; if (Mask[2] != -1 && Mask[3] != -1 && (Mask[2] < 4) != (Mask[3] < 4)) return false; return true; } /// \brief Lower a vector shuffle using the SHUFPS instruction. /// /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. /// It makes no assumptions about whether this is the *best* lowering, it simply /// uses it. static SDValue lowerVectorShuffleWithSHUFPS(SDLoc DL, MVT VT, ArrayRef
Mask, SDValue V1, SDValue V2, SelectionDAG &DAG) { SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] == -1) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. NewMask[2] -= 4; NewMask[3] -= 4; } else if (Mask[2] < 4 && Mask[3] < 4) { // We also handle the reversed case because this utility may get called // when we detect a SHUFPS pattern but can't easily commute the shuffle to // arrange things in the right direction. NewMask[0] -= 4; NewMask[1] -= 4; HighV = V1; LowV = V2; } else { // We have a mixture of V1 and V2 in both low and high lanes. Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); } /// \brief Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4f32, V1, Mask, Subtarget, DAG)) return Broadcast; // Use even/odd duplicate instructions for masks that match their pattern. if (Subtarget->hasSSE3()) { if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); } if (Subtarget->hasAVX()) { // If we have AVX, we can use VPERMILPS which will allow folding a load // into the shuffle. return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // There are special ways we can lower some single-element blends. However, we // have custom ways we can lower more complex single-element blends below that // we defer to if both this and BLENDPS fail to match, so restrict this to // when the V2 input is targeting element 0 of the mask -- that is the fast // case here. if (NumV2Elements == 1 && Mask[0] >= 4) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return V; if (Subtarget->hasSSE41()) { if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG)) return Blend; // Use INSERTPS if we can complete the shuffle efficiently. if (SDValue V = lowerVectorShuffleAsInsertPS(Op, V1, V2, Mask, DAG)) return V; if (!isSingleSHUFPSMask(Mask)) if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute( DL, MVT::v4f32, V1, V2, Mask, DAG)) return BlendPerm; } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) return V; // Otherwise fall back to a SHUFPS lowering strategy. return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); } /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); // Whenever we can lower this as a zext, that instruction is strictly faster // than any alternative. It also allows us to fold memory operands into the // shuffle in many cases. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return ZExt; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) { // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) return Broadcast; // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We coerce the shuffle pattern to be compatible with UNPCK instructions // but we aren't actually going to use the UNPCK instruction because doing // so prevents folding a load into this instruction or making a copy. const int UnpackLoMask[] = {0, 0, 1, 1}; const int UnpackHiMask[] = {2, 2, 3, 3}; if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1})) Mask = UnpackLoMask; else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3})) Mask = UnpackHiMask; return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Shift; // There are special ways we can lower some single-element blends. if (NumV2Elements == 1) if (SDValue V = lowerVectorShuffleAsElementInsertion(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return V; // We have different paths for blend lowering, but they all must use the // *exact* same predicate. bool IsBlendSupported = Subtarget->hasSSE41(); if (IsBlendSupported) if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Blend; if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) return V; // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. if (Subtarget->hasSSSE3()) if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. if (IsBlendSupported) return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2, Mask, DAG); // Try to lower by permuting the inputs into an unpack instruction. if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, Mask, DAG)) return Unpack; // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. return DAG.getBitcast( MVT::v4i32, DAG.getVectorShuffle(MVT::v4f32, DL, DAG.getBitcast(MVT::v4f32, V1), DAG.getBitcast(MVT::v4f32, V2), Mask)); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. /// /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 /// vector, form the analogous 128-bit 8-element Mask. static SDValue lowerV8I16GeneralSingleInputVectorShuffle( SDLoc DL, MVT VT, SDValue V, MutableArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); assert(Mask.size() == 8 && "Shuffle mask length doen't match!"); MutableArrayRef
LoMask = Mask.slice(0, 4); MutableArrayRef
HiMask = Mask.slice(4, 4); SmallVector
LoInputs; std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector
HiInputs; std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef
LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef
LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef
HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef
HToHInputs(HiInputs.data() + NumLToH, NumHToH); // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half // and an existing 2-into-2 on the other half. In this case we may have to // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. // Fortunately, we don't have to handle anything but a 2-into-2 pattern // because any other situation (including a 3-into-1 or 1-into-3 in the other // half than the one we target for fixing) will be fixed when we re-enter this // path. We will also combine away any sequence of PSHUFD instructions that // result into a single instruction. Here is an example of the tricky case: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] // // This now has a 1-into-3 in the high half! Instead, we do two shuffles: // // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] // // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] // // The result is fine to be handled by the generic logic. auto balanceSides = [&](ArrayRef
AToAInputs, ArrayRef
BToAInputs, ArrayRef
BToBInputs, ArrayRef
AToBInputs, int AOffset, int BOffset) { assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && "Must call this with A having 3 or 1 inputs from the A half."); assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && "Must call this with B having 1 or 3 inputs from the B half."); assert(AToAInputs.size() + BToAInputs.size() == 4 && "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); bool ThreeAInputs = AToAInputs.size() == 3; // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int ADWord, BDWord; int &TripleDWord = ThreeAInputs ? ADWord : BDWord; int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; ArrayRef
TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); int TripleNonInputIdx = TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); TripleDWord = TripleNonInputIdx / 2; // We use xor with one to compute the adjacent DWord to whichever one the // OneInput is in. OneInputDWord = (OneInput / 2) ^ 1; // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA // and BToA inputs. If there is also such a problem with the BToB and AToB // inputs, we don't try to fix it necessarily -- we'll recurse and see it in // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it // is essential that we don't *create* a 3<-1 as then we might oscillate. if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { // Compute how many inputs will be flipped by swapping these DWords. We // need // to balance this to ensure we don't form a 3-1 shuffle in the other // half. int NumFlippedAToBInputs = std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) + std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1); int NumFlippedBToBInputs = std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) + std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1); if ((NumFlippedAToBInputs == 1 && (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || (NumFlippedBToBInputs == 1 && (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { // We choose whether to fix the A half or B half based on whether that // half has zero flipped inputs. At zero, we may not be able to fix it // with that half. We also bias towards fixing the B half because that // will more commonly be the high half, and we have to bias one way. auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, ArrayRef
Inputs) { int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. bool IsFixIdxInput = std::find(Inputs.begin(), Inputs.end(), PinnedIdx ^ 1) != Inputs.end(); // Determine whether the free index is in the flipped dword or the // unflipped dword based on where the pinned index is. We use this bit // in an xor to conditionally select the adjacent dword. int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); bool IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), FixFreeIdx) != Inputs.end(); if (IsFixIdxInput == IsFixFreeIdxInput) FixFreeIdx += 1; IsFixFreeIdxInput = std::find(Inputs.begin(), Inputs.end(), FixFreeIdx) != Inputs.end(); assert(IsFixIdxInput != IsFixFreeIdxInput && "We need to be changing the number of flipped inputs!"); int PSHUFHalfMask[] = {0, 1, 2, 3}; std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); for (int &M : Mask) if (M != -1 && M == FixIdx) M = FixFreeIdx; else if (M != -1 && M == FixFreeIdx) M = FixIdx; }; if (NumFlippedBToBInputs != 0) { int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); } else { assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); } } } int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[ADWord] = BDWord; PSHUFDMask[BDWord] = ADWord; V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M != -1 && M/2 == ADWord) M = 2 * BDWord + M % 2; else if (M != -1 && M/2 == BDWord) M = 2 * ADWord + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget, DAG); }; if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask](ArrayRef
InPlaceInputs, ArrayRef
IncomingInputs, MutableArrayRef
SourceHalfMask, MutableArrayRef
HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } if (IncomingInputs.empty()) { // Just fix all of the in place inputs. for (int Input : InPlaceInputs) { SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; PSHUFDMask[Input / 2] = Input / 2; } return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef
IncomingInputs, ArrayRef
ExistingInputs, MutableArrayRef
SourceHalfMask, MutableArrayRef
HalfMask, MutableArrayRef
FinalSourceHalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef
SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef
SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = std::find(std::begin(SourceHalfMask), std::end(SourceHalfMask), -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { // We have two non-adjacent or clobbered inputs we need to extract from // the source half. To do this, we need to map them into some adjacent // dword slot in the source mask. int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, IncomingInputs[1] - SourceOffset}; // If there is a free slot in the source half mask adjacent to one of // the inputs, place the other input in it. We use (Index XOR 1) to // compute an adjacent index. if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) && SourceHalfMask[InputsFixed[0] ^ 1] == -1) { SourceHalfMask[InputsFixed[0]] = InputsFixed[0]; SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; InputsFixed[1] = InputsFixed[0] ^ 1; } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) && SourceHalfMask[InputsFixed[1] ^ 1] == -1) { SourceHalfMask[InputsFixed[1]] = InputsFixed[1]; SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0]; InputsFixed[0] = InputsFixed[1] ^ 1; } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] == -1 && SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] == -1) { // The two inputs are in the same DWord but it is clobbered and the // adjacent DWord isn't used at all. Move both inputs to the free // slot. SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0]; SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1]; InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1); InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1; } else { // The only way we hit this point is if there is no clobbering // (because there are no off-half inputs to this half) and there is no // free slot adjacent to one of the inputs. In this case, we have to // swap an input with a non-input. for (int i = 0; i < 4; ++i) assert((SourceHalfMask[i] == -1 || SourceHalfMask[i] == i) && "We can't handle any clobbers here!"); assert(InputsFixed[1] != (InputsFixed[0] ^ 1) && "Cannot have adjacent inputs here!"); SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1]; SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1; // We also have to update the final source mask in this case because // it may need to undo the above swap. for (int &M : FinalSourceHalfMask) if (M == (InputsFixed[0] ^ 1) + SourceOffset) M = InputsFixed[1] + SourceOffset; else if (M == InputsFixed[1] + SourceOffset) M = (InputsFixed[0] ^ 1) + SourceOffset; InputsFixed[1] = InputsFixed[0] ^ 1; } // Point everything at the fixed inputs. for (int &M : HalfMask) if (M == IncomingInputs[0]) M = InputsFixed[0] + SourceOffset; else if (M == IncomingInputs[1]) M = InputsFixed[1] + SourceOffset; IncomingInputs[0] = InputsFixed[0] + SourceOffset; IncomingInputs[1] = InputsFixed[1] + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int &M : HalfMask) for (int Input : IncomingInputs) if (M == Input) M = FreeDWord * 2 + Input % 2; }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getBitcast( VT, DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(std::count_if(LoMask.begin(), LoMask.end(), [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(std::count_if(HiMask.begin(), HiMask.end(), [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); return V; } /// \brief Helper to form a PSHUFB-based shuffle+blend. static SDValue lowerVectorShuffleAsPSHUFB(SDLoc DL, MVT VT, SDValue V1, SDValue V2, ArrayRef
Mask, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) { SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2); SDValue V1Mask[16]; SDValue V2Mask[16]; V1InUse = false; V2InUse = false; int Size = Mask.size(); int Scale = 16 / Size; for (int i = 0; i < 16; ++i) { if (Mask[i / Scale] == -1) { V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8); } else { const int ZeroMask = 0x80; int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale : ZeroMask; int V2Idx = Mask[i / Scale] < Size ? ZeroMask : (Mask[i / Scale] - Size) * Scale + i % Scale; if (Zeroable[i / Scale]) V1Idx = V2Idx = ZeroMask; V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8); V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8); V1InUse |= (ZeroMask != V1Idx); V2InUse |= (ZeroMask != V2Idx); } } if (V1InUse) V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V1), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V1Mask)); if (V2InUse) V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, DAG.getBitcast(MVT::v16i8, V2), DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v16i8, V2Mask)); // If we need shuffled inputs from both, blend the two. SDValue V; if (V1InUse && V2InUse) V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2); else V = V1InUse ? V1 : V2; // Cast the result back to the correct type. return DAG.getBitcast(VT, V); } /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
OrigMask = SVOp->getMask(); int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; MutableArrayRef