//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//
//
// This file contains TargetLowering functions borrowed from AMDLI.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUISelLowering.h"
#include "AMDGPURegisterInfo.h"
#include "AMDILDevices.h"
#include "AMDILIntrinsicInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDILUtilityFunctions.h"
#include "llvm/CallingConv.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
#include "AMDGPUGenCallingConv.inc"
//===----------------------------------------------------------------------===//
// TargetLowering Implementation Help Functions End
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// TargetLowering Class Implementation Begins
//===----------------------------------------------------------------------===//
void AMDGPUTargetLowering::InitAMDILLowering()
{
int types[] =
{
(int)MVT::i8,
(int)MVT::i16,
(int)MVT::i32,
(int)MVT::f32,
(int)MVT::f64,
(int)MVT::i64,
(int)MVT::v2i8,
(int)MVT::v4i8,
(int)MVT::v2i16,
(int)MVT::v4i16,
(int)MVT::v4f32,
(int)MVT::v4i32,
(int)MVT::v2f32,
(int)MVT::v2i32,
(int)MVT::v2f64,
(int)MVT::v2i64
};
int IntTypes[] =
{
(int)MVT::i8,
(int)MVT::i16,
(int)MVT::i32,
(int)MVT::i64
};
int FloatTypes[] =
{
(int)MVT::f32,
(int)MVT::f64
};
int VectorTypes[] =
{
(int)MVT::v2i8,
(int)MVT::v4i8,
(int)MVT::v2i16,
(int)MVT::v4i16,
(int)MVT::v4f32,
(int)MVT::v4i32,
(int)MVT::v2f32,
(int)MVT::v2i32,
(int)MVT::v2f64,
(int)MVT::v2i64
};
size_t numTypes = sizeof(types) / sizeof(*types);
size_t numFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
size_t numIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
size_t numVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
// These are the current register classes that are
// supported
for (unsigned int x = 0; x < numTypes; ++x) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
//FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types
// We cannot sextinreg, expand to shifts
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
setOperationAction(ISD::SUBE, VT, Expand);
setOperationAction(ISD::SUBC, VT, Expand);
setOperationAction(ISD::ADDE, VT, Expand);
setOperationAction(ISD::ADDC, VT, Expand);
setOperationAction(ISD::BRCOND, VT, Custom);
setOperationAction(ISD::BR_JT, VT, Expand);
setOperationAction(ISD::BRIND, VT, Expand);
// TODO: Implement custom UREM/SREM routines
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
if (VT != MVT::i64 && VT != MVT::v2i64) {
setOperationAction(ISD::SDIV, VT, Custom);
}
}
for (unsigned int x = 0; x < numFloatTypes; ++x) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
// IL does not have these operations for floating point types
setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
setOperationAction(ISD::SETOLT, VT, Expand);
setOperationAction(ISD::SETOGE, VT, Expand);
setOperationAction(ISD::SETOGT, VT, Expand);
setOperationAction(ISD::SETOLE, VT, Expand);
setOperationAction(ISD::SETULT, VT, Expand);
setOperationAction(ISD::SETUGE, VT, Expand);
setOperationAction(ISD::SETUGT, VT, Expand);
setOperationAction(ISD::SETULE, VT, Expand);
}
for (unsigned int x = 0; x < numIntTypes; ++x) {
MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
// GPU also does not have divrem function for signed or unsigned
setOperationAction(ISD::SDIVREM, VT, Expand);
// GPU does not have [S|U]MUL_LOHI functions as a single instruction
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
// GPU doesn't have a rotl, rotr, or byteswap instruction
setOperationAction(ISD::ROTR, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
// GPU doesn't have any counting operators
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTLZ, VT, Expand);
}
for ( unsigned int ii = 0; ii < numVectorTypes; ++ii )
{
MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
// setOperationAction(ISD::VSETCC, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
}
if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
setOperationAction(ISD::MULHU, MVT::i64, Expand);
setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
setOperationAction(ISD::MULHS, MVT::i64, Expand);
setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
setOperationAction(ISD::ADD, MVT::v2i64, Expand);
setOperationAction(ISD::SREM, MVT::v2i64, Expand);
setOperationAction(ISD::Constant , MVT::i64 , Legal);
setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
}
if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
// we support loading/storing v2f64 but not operations on the type
setOperationAction(ISD::FADD, MVT::v2f64, Expand);
setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
setOperationAction(ISD::ConstantFP , MVT::f64 , Legal);
// We want to expand vector conversions into their scalar
// counterparts.
setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
setOperationAction(ISD::FABS, MVT::f64, Expand);
setOperationAction(ISD::FABS, MVT::v2f64, Expand);
}
// TODO: Fix the UDIV24 algorithm so it works for these
// types correctly. This needs vector comparisons
// for this to work correctly.
setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
setOperationAction(ISD::SUBC, MVT::Other, Expand);
setOperationAction(ISD::ADDE, MVT::Other, Expand);
setOperationAction(ISD::ADDC, MVT::Other, Expand);
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::BRIND, MVT::Other, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
setOperationAction(ISD::BUILD_VECTOR, MVT::Other, Custom);
// Use the default implementation.
setOperationAction(ISD::ConstantFP , MVT::f32 , Legal);
setOperationAction(ISD::Constant , MVT::i32 , Legal);
setSchedulingPreference(Sched::RegPressure);
setPow2DivIsCheap(false);
setPrefLoopAlignment(16);
setSelectIsExpensive(true);
setJumpIsExpensive(true);
maxStoresPerMemcpy = 4096;
maxStoresPerMemmove = 4096;
maxStoresPerMemset = 4096;
#undef numTypes
#undef numIntTypes
#undef numVectorTypes
#undef numFloatTypes
}
bool
AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I, unsigned Intrinsic) const
{
return false;
}
// The backend supports 32 and 64 bit floating point immediates
bool
AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const
{
if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
|| VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
return true;
} else {
return false;
}
}
bool
AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const
{
if (VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
|| VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64) {
return false;
} else {
return true;
}
}
// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
// be zero. Op is expected to be a target specific node. Used by DAG
// combiner.
void
AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
const SDValue Op,
APInt &KnownZero,
APInt &KnownOne,
const SelectionDAG &DAG,
unsigned Depth) const
{
APInt KnownZero2;
APInt KnownOne2;
KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything
switch (Op.getOpcode()) {
default: break;
case ISD::SELECT_CC:
DAG.ComputeMaskedBits(
Op.getOperand(1),
KnownZero,
KnownOne,
Depth + 1
);
DAG.ComputeMaskedBits(
Op.getOperand(0),
KnownZero2,
KnownOne2
);
assert((KnownZero & KnownOne) == 0
&& "Bits known to be one AND zero?");
assert((KnownZero2 & KnownOne2) == 0
&& "Bits known to be one AND zero?");
// Only known if known in both the LHS and RHS
KnownOne &= KnownOne2;
KnownZero &= KnownZero2;
break;
};
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
SDValue
AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const
{
EVT OVT = Op.getValueType();
SDValue DST;
if (OVT.getScalarType() == MVT::i64) {
DST = LowerSDIV64(Op, DAG);
} else if (OVT.getScalarType() == MVT::i32) {
DST = LowerSDIV32(Op, DAG);
} else if (OVT.getScalarType() == MVT::i16
|| OVT.getScalarType() == MVT::i8) {
DST = LowerSDIV24(Op, DAG);
} else {
DST = SDValue(Op.getNode(), 0);
}
return DST;
}
SDValue
AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const
{
EVT OVT = Op.getValueType();
SDValue DST;
if (OVT.getScalarType() == MVT::i64) {
DST = LowerSREM64(Op, DAG);
} else if (OVT.getScalarType() == MVT::i32) {
DST = LowerSREM32(Op, DAG);
} else if (OVT.getScalarType() == MVT::i16) {
DST = LowerSREM16(Op, DAG);
} else if (OVT.getScalarType() == MVT::i8) {
DST = LowerSREM8(Op, DAG);
} else {
DST = SDValue(Op.getNode(), 0);
}
return DST;
}
SDValue
AMDGPUTargetLowering::LowerBUILD_VECTOR( SDValue Op, SelectionDAG &DAG ) const
{
EVT VT = Op.getValueType();
SDValue Nodes1;
SDValue second;
SDValue third;
SDValue fourth;
DebugLoc DL = Op.getDebugLoc();
Nodes1 = DAG.getNode(AMDGPUISD::VBUILD,
DL,
VT, Op.getOperand(0));
#if 0
bool allEqual = true;
for (unsigned x = 1, y = Op.getNumOperands(); x < y; ++x) {
if (Op.getOperand(0) != Op.getOperand(x)) {
allEqual = false;
break;
}
}
if (allEqual) {
return Nodes1;
}
#endif
switch(Op.getNumOperands()) {
default:
case 1:
break;
case 4:
fourth = Op.getOperand(3);
if (fourth.getOpcode() != ISD::UNDEF) {
Nodes1 = DAG.getNode(
ISD::INSERT_VECTOR_ELT,
DL,
Op.getValueType(),
Nodes1,
fourth,
DAG.getConstant(7, MVT::i32));
}
case 3:
third = Op.getOperand(2);
if (third.getOpcode() != ISD::UNDEF) {
Nodes1 = DAG.getNode(
ISD::INSERT_VECTOR_ELT,
DL,
Op.getValueType(),
Nodes1,
third,
DAG.getConstant(6, MVT::i32));
}
case 2:
second = Op.getOperand(1);
if (second.getOpcode() != ISD::UNDEF) {
Nodes1 = DAG.getNode(
ISD::INSERT_VECTOR_ELT,
DL,
Op.getValueType(),
Nodes1,
second,
DAG.getConstant(5, MVT::i32));
}
break;
};
return Nodes1;
}
SDValue
AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
{
SDValue Data = Op.getOperand(0);
VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
DebugLoc DL = Op.getDebugLoc();
EVT DVT = Data.getValueType();
EVT BVT = BaseType->getVT();
unsigned baseBits = BVT.getScalarType().getSizeInBits();
unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
unsigned shiftBits = srcBits - baseBits;
if (srcBits < 32) {
// If the op is less than 32 bits, then it needs to extend to 32bits
// so it can properly keep the upper bits valid.
EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
shiftBits = 32 - baseBits;
DVT = IVT;
}
SDValue Shift = DAG.getConstant(shiftBits, DVT);
// Shift left by 'Shift' bits.
Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
// Signed shift Right by 'Shift' bits.
Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
if (srcBits < 32) {
// Once the sign extension is done, the op needs to be converted to
// its original type.
Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
}
return Data;
}
EVT
AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const
{
int iSize = (size * numEle);
int vEle = (iSize >> ((size == 64) ? 6 : 5));
if (!vEle) {
vEle = 1;
}
if (size == 64) {
if (vEle == 1) {
return EVT(MVT::i64);
} else {
return EVT(MVT::getVectorVT(MVT::i64, vEle));
}
} else {
if (vEle == 1) {
return EVT(MVT::i32);
} else {
return EVT(MVT::getVectorVT(MVT::i32, vEle));
}
}
}
SDValue
AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const
{
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Jump = Op.getOperand(2);
SDValue Result;
Result = DAG.getNode(
AMDGPUISD::BRANCH_COND,
Op.getDebugLoc(),
Op.getValueType(),
Chain, Jump, Cond);
return Result;
}
SDValue
AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const
{
DebugLoc DL = Op.getDebugLoc();
EVT OVT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
MVT INTTY;
MVT FLTTY;
if (!OVT.isVector()) {
INTTY = MVT::i32;
FLTTY = MVT::f32;
} else if (OVT.getVectorNumElements() == 2) {
INTTY = MVT::v2i32;
FLTTY = MVT::v2f32;
} else if (OVT.getVectorNumElements() == 4) {
INTTY = MVT::v4i32;
FLTTY = MVT::v4f32;
}
unsigned bitsize = OVT.getScalarType().getSizeInBits();
// char|short jq = ia ^ ib;
SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS);
// jq = jq >> (bitsize - 2)
jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT));
// jq = jq | 0x1
jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT));
// jq = (int)jq
jq = DAG.getSExtOrTrunc(jq, DL, INTTY);
// int ia = (int)LHS;
SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY);
// int ib, (int)RHS;
SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY);
// float fa = (float)ia;
SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia);
// float fb = (float)ib;
SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib);
// float fq = native_divide(fa, fb);
SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb);
// fq = trunc(fq);
fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq);
// float fqneg = -fq;
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq);
// float fr = mad(fqneg, fb, fa);
SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa);
// int iq = (int)fq;
SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq);
// fr = fabs(fr);
fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr);
// fb = fabs(fb);
fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb);
// int cv = fr >= fb;
SDValue cv;
if (INTTY == MVT::i32) {
cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
} else {
cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE);
}
// jq = (cv ? jq : 0);
jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq,
DAG.getConstant(0, OVT));
// dst = iq + jq;
iq = DAG.getSExtOrTrunc(iq, DL, OVT);
iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq);
return iq;
}
SDValue
AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const
{
DebugLoc DL = Op.getDebugLoc();
EVT OVT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// The LowerSDIV32 function generates equivalent to the following IL.
// mov r0, LHS
// mov r1, RHS
// ilt r10, r0, 0
// ilt r11, r1, 0
// iadd r0, r0, r10
// iadd r1, r1, r11
// ixor r0, r0, r10
// ixor r1, r1, r11
// udiv r0, r0, r1
// ixor r10, r10, r11
// iadd r0, r0, r10
// ixor DST, r0, r10
// mov r0, LHS
SDValue r0 = LHS;
// mov r1, RHS
SDValue r1 = RHS;
// ilt r10, r0, 0
SDValue r10 = DAG.getSelectCC(DL,
r0, DAG.getConstant(0, OVT),
DAG.getConstant(-1, MVT::i32),
DAG.getConstant(0, MVT::i32),
ISD::SETLT);
// ilt r11, r1, 0
SDValue r11 = DAG.getSelectCC(DL,
r1, DAG.getConstant(0, OVT),
DAG.getConstant(-1, MVT::i32),
DAG.getConstant(0, MVT::i32),
ISD::SETLT);
// iadd r0, r0, r10
r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
// iadd r1, r1, r11
r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
// ixor r0, r0, r10
r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
// ixor r1, r1, r11
r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
// udiv r0, r0, r1
r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
// ixor r10, r10, r11
r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
// iadd r0, r0, r10
r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
// ixor DST, r0, r10
SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
return DST;
}
SDValue
AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const
{
return SDValue(Op.getNode(), 0);
}
SDValue
AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const
{
DebugLoc DL = Op.getDebugLoc();
EVT OVT = Op.getValueType();
MVT INTTY = MVT::i32;
if (OVT == MVT::v2i8) {
INTTY = MVT::v2i32;
} else if (OVT == MVT::v4i8) {
INTTY = MVT::v4i32;
}
SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
return LHS;
}
SDValue
AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const
{
DebugLoc DL = Op.getDebugLoc();
EVT OVT = Op.getValueType();
MVT INTTY = MVT::i32;
if (OVT == MVT::v2i16) {
INTTY = MVT::v2i32;
} else if (OVT == MVT::v4i16) {
INTTY = MVT::v4i32;
}
SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
return LHS;
}
SDValue
AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const
{
DebugLoc DL = Op.getDebugLoc();
EVT OVT = Op.getValueType();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// The LowerSREM32 function generates equivalent to the following IL.
// mov r0, LHS
// mov r1, RHS
// ilt r10, r0, 0
// ilt r11, r1, 0
// iadd r0, r0, r10
// iadd r1, r1, r11
// ixor r0, r0, r10
// ixor r1, r1, r11
// udiv r20, r0, r1
// umul r20, r20, r1
// sub r0, r0, r20
// iadd r0, r0, r10
// ixor DST, r0, r10
// mov r0, LHS
SDValue r0 = LHS;
// mov r1, RHS
SDValue r1 = RHS;
// ilt r10, r0, 0
SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
// ilt r11, r1, 0
SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
// iadd r0, r0, r10
r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
// iadd r1, r1, r11
r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
// ixor r0, r0, r10
r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
// ixor r1, r1, r11
r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
// udiv r20, r0, r1
SDValue r20 = DAG.getNode(ISD::UREM, DL, OVT, r0, r1);
// umul r20, r20, r1
r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
// sub r0, r0, r20
r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
// iadd r0, r0, r10
r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
// ixor DST, r0, r10
SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
return DST;
}
SDValue
AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const
{
return SDValue(Op.getNode(), 0);
}