//===-- AMDILPeepholeOptimizer.cpp - AMDIL Peephole optimizations ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//==-----------------------------------------------------------------------===//

#include "AMDILDevices.h"
#include "AMDGPUInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Constants.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Module.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"

#include <sstream>

#if 0
STATISTIC(PointerAssignments, "Number of dynamic pointer "
    "assigments discovered");
STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
#endif

using namespace llvm;
// The Peephole optimization pass is used to do simple last minute optimizations
// that are required for correct code or to remove redundant functions
namespace {

class OpaqueType;

class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
public:
  TargetMachine &TM;
  static char ID;
  AMDGPUPeepholeOpt(TargetMachine &tm);
  ~AMDGPUPeepholeOpt();
  const char *getPassName() const;
  bool runOnFunction(Function &F);
  bool doInitialization(Module &M);
  bool doFinalization(Module &M);
  void getAnalysisUsage(AnalysisUsage &AU) const;
protected:
private:
  // Function to initiate all of the instruction level optimizations.
  bool instLevelOptimizations(BasicBlock::iterator *inst);
  // Quick check to see if we need to dump all of the pointers into the
  // arena. If this is correct, then we set all pointers to exist in arena. This
  // is a workaround for aliasing of pointers in a struct/union.
  bool dumpAllIntoArena(Function &F);
  // Because I don't want to invalidate any pointers while in the
  // safeNestedForEachFunction. I push atomic conversions to a vector and handle
  // it later. This function does the conversions if required.
  void doAtomicConversionIfNeeded(Function &F);
  // Because __amdil_is_constant cannot be properly evaluated if
  // optimizations are disabled, the call's are placed in a vector
  // and evaluated after the __amdil_image* functions are evaluated
  // which should allow the __amdil_is_constant function to be
  // evaluated correctly.
  void doIsConstCallConversionIfNeeded();
  bool mChanged;
  bool mDebug;
  bool mConvertAtomics;
  CodeGenOpt::Level optLevel;
  // Run a series of tests to see if we can optimize a CALL instruction.
  bool optimizeCallInst(BasicBlock::iterator *bbb);
  // A peephole optimization to optimize bit extract sequences.
  bool optimizeBitExtract(Instruction *inst);
  // A peephole optimization to optimize bit insert sequences.
  bool optimizeBitInsert(Instruction *inst);
  bool setupBitInsert(Instruction *base, 
                      Instruction *&src, 
                      Constant *&mask, 
                      Constant *&shift);
  // Expand the bit field insert instruction on versions of OpenCL that
  // don't support it.
  bool expandBFI(CallInst *CI);
  // Expand the bit field mask instruction on version of OpenCL that 
  // don't support it.
  bool expandBFM(CallInst *CI);
  // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
  // this case we need to expand them. These functions check for 24bit functions
  // and then expand.
  bool isSigned24BitOps(CallInst *CI);
  void expandSigned24BitOps(CallInst *CI);
  // One optimization that can occur is that if the required workgroup size is
  // specified then the result of get_local_size is known at compile time and
  // can be returned accordingly.
  bool isRWGLocalOpt(CallInst *CI);
  // On northern island cards, the division is slightly less accurate than on
  // previous generations, so we need to utilize a more accurate division. So we
  // can translate the accurate divide to a normal divide on all other cards.
  bool convertAccurateDivide(CallInst *CI);
  void expandAccurateDivide(CallInst *CI);
  // If the alignment is set incorrectly, it can produce really inefficient
  // code. This checks for this scenario and fixes it if possible.
  bool correctMisalignedMemOp(Instruction *inst);

  // If we are in no opt mode, then we need to make sure that
  // local samplers are properly propagated as constant propagation 
  // doesn't occur and we need to know the value of kernel defined
  // samplers at compile time.
  bool propagateSamplerInst(CallInst *CI);

  // Helper functions

  // Group of functions that recursively calculate the size of a structure based
  // on it's sub-types.
  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
  size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);

  LLVMContext *mCTX;
  Function *mF;
  const AMDGPUSubtarget *mSTM;
  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
  SmallVector<CallInst *, 16> isConstVec;
}; // class AMDGPUPeepholeOpt
  char AMDGPUPeepholeOpt::ID = 0;

// A template function that has two levels of looping before calling the
// function with a pointer to the current iterator.
template<class InputIterator, class SecondIterator, class Function>
Function safeNestedForEach(InputIterator First, InputIterator Last,
                              SecondIterator S, Function F)
{
  for ( ; First != Last; ++First) {
    SecondIterator sf, sl;
    for (sf = First->begin(), sl = First->end();
         sf != sl; )  {
      if (!F(&sf)) {
        ++sf;
      } 
    }
  }
  return F;
}

} // anonymous namespace

namespace llvm {
  FunctionPass *
  createAMDGPUPeepholeOpt(TargetMachine &tm) 
  {
    return new AMDGPUPeepholeOpt(tm);
  }
} // llvm namespace

AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
  : FunctionPass(ID), TM(tm) 
{
  mDebug = false;
  optLevel = TM.getOptLevel();

}

AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() 
{
}

const char *
AMDGPUPeepholeOpt::getPassName() const 
{
  return "AMDGPU PeepHole Optimization Pass";
}

bool 
containsPointerType(Type *Ty) 
{
  if (!Ty) {
    return false;
  }
  switch(Ty->getTypeID()) {
  default:
    return false;
  case Type::StructTyID: {
    const StructType *ST = dyn_cast<StructType>(Ty);
    for (StructType::element_iterator stb = ST->element_begin(),
           ste = ST->element_end(); stb != ste; ++stb) {
      if (!containsPointerType(*stb)) {
        continue;
      }
      return true;
    }
    break;
  }
  case Type::VectorTyID:
  case Type::ArrayTyID:
    return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
  case Type::PointerTyID:
    return true;
  };
  return false;
}

bool 
AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) 
{
  bool dumpAll = false;
  for (Function::const_arg_iterator cab = F.arg_begin(),
       cae = F.arg_end(); cab != cae; ++cab) {
    const Argument *arg = cab;
    const PointerType *PT = dyn_cast<PointerType>(arg->getType());
    if (!PT) {
      continue;
    }
    Type *DereferencedType = PT->getElementType();
    if (!dyn_cast<StructType>(DereferencedType) 
        ) {
      continue;
    }
    if (!containsPointerType(DereferencedType)) {
      continue;
    }
    // FIXME: Because a pointer inside of a struct/union may be aliased to
    // another pointer we need to take the conservative approach and place all
    // pointers into the arena until more advanced detection is implemented.
    dumpAll = true;
  }
  return dumpAll;
}
void
AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded()
{
  if (isConstVec.empty()) {
    return;
  }
  for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
    CallInst *CI = isConstVec[x];
    Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
      : ConstantInt::get(aType, 0);
    CI->replaceAllUsesWith(Val);
    CI->eraseFromParent();
  }
  isConstVec.clear();
}
void 
AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) 
{
  // Don't do anything if we don't have any atomic operations.
  if (atomicFuncs.empty()) {
    return;
  }
  // Change the function name for the atomic if it is required
  uint32_t size = atomicFuncs.size();
  for (uint32_t x = 0; x < size; ++x) {
    atomicFuncs[x].first->setOperand(
        atomicFuncs[x].first->getNumOperands()-1, 
        atomicFuncs[x].second);

  }
  mChanged = true;
  if (mConvertAtomics) {
    return;
  }
}

bool 
AMDGPUPeepholeOpt::runOnFunction(Function &MF) 
{
  mChanged = false;
  mF = &MF;
  mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
  if (mDebug) {
    MF.dump();
  }
  mCTX = &MF.getType()->getContext();
  mConvertAtomics = true;
  safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
     std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
                  this));

  doAtomicConversionIfNeeded(MF);
  doIsConstCallConversionIfNeeded();

  if (mDebug) {
    MF.dump();
  }
  return mChanged;
}

bool 
AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) 
{
  Instruction *inst = (*bbb);
  CallInst *CI = dyn_cast<CallInst>(inst);
  if (!CI) {
    return false;
  }
  if (isSigned24BitOps(CI)) {
    expandSigned24BitOps(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (propagateSamplerInst(CI)) {
    return false;
  }
  if (expandBFI(CI) || expandBFM(CI)) {
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  if (convertAccurateDivide(CI)) {
    expandAccurateDivide(CI);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }

  StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
  if (calleeName.startswith("__amdil_is_constant")) {
    // If we do not have optimizations, then this
    // cannot be properly evaluated, so we add the
    // call instruction to a vector and process
    // them at the end of processing after the
    // samplers have been correctly handled.
    if (optLevel == CodeGenOpt::None) {
      isConstVec.push_back(CI);
      return false;
    } else {
      Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
      Type *aType = Type::getInt32Ty(*mCTX);
      Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
        : ConstantInt::get(aType, 0);
      CI->replaceAllUsesWith(Val);
      ++(*bbb);
      CI->eraseFromParent();
      return true;
    }
  }

  if (calleeName.equals("__amdil_is_asic_id_i32")) {
    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
    Type *aType = Type::getInt32Ty(*mCTX);
    Value *Val = CV;
    if (Val) {
      Val = ConstantInt::get(aType, 
          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
    } else {
      Val = ConstantInt::get(aType, 0);
    }
    CI->replaceAllUsesWith(Val);
    ++(*bbb);
    CI->eraseFromParent();
    return true;
  }
  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
  if (!F) {
    return false;
  } 
  if (F->getName().startswith("__atom") && !CI->getNumUses() 
      && F->getName().find("_xchg") == StringRef::npos) {
    std::string buffer(F->getName().str() + "_noret");
    F = dyn_cast<Function>(
          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
    atomicFuncs.push_back(std::make_pair <CallInst*, Function*>(CI, F));
  }
  
  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
    return false;
  }
  if (!mConvertAtomics) {
    return false;
  }
  StringRef name = F->getName();
  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
    mConvertAtomics = false;
  }
  return false;
}

bool
AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, 
    Instruction *&src, 
    Constant *&mask, 
    Constant *&shift)
{
  if (!base) {
    if (mDebug) {
      dbgs() << "Null pointer passed into function.\n";
    }
    return false;
  }
  bool andOp = false;
  if (base->getOpcode() == Instruction::Shl) {
    shift = dyn_cast<Constant>(base->getOperand(1));
  } else if (base->getOpcode() == Instruction::And) {
    mask = dyn_cast<Constant>(base->getOperand(1));
    andOp = true;
  } else {
    if (mDebug) {
      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
    }
    // If the base is neither a Shl or a And, we don't fit any of the patterns above.
    return false;
  }
  src = dyn_cast<Instruction>(base->getOperand(0));
  if (!src) {
    if (mDebug) {
      dbgs() << "Failed setup since the base operand is not an instruction!\n";
    }
    return false;
  }
  // If we find an 'and' operation, then we don't need to
  // find the next operation as we already know the
  // bits that are valid at this point.
  if (andOp) {
    return true;
  }
  if (src->getOpcode() == Instruction::Shl && !shift) {
    shift = dyn_cast<Constant>(src->getOperand(1));
    src = dyn_cast<Instruction>(src->getOperand(0));
  } else if (src->getOpcode() == Instruction::And && !mask) {
    mask = dyn_cast<Constant>(src->getOperand(1));
  }
  if (!mask && !shift) {
    if (mDebug) {
      dbgs() << "Failed setup since both mask and shift are NULL!\n";
    }
    // Did not find a constant mask or a shift.
    return false;
  }
  return true;
}
bool
AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) 
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::Or) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do an optimization on a sequence of ops that in the end equals a
  // single ISA instruction.
  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
  // Some simplified versions of this pattern are as follows:
  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
  // (A & B) | (D << F) when (1 << F) >= B
  // (A << C) | (D & E) when (1 << C) >= E
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // The HD4XXX hardware doesn't support the ubit_insert instruction.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();
  int numEle = 1;
  // This optimization only works on 32bit integers.
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in a intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  // TODO: Handle vectors.
  if (isVector) {
    if (mDebug) {
      dbgs() << "!!! Vectors are not supported yet!\n";
    }
    return false;
  }
  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
  Constant *LHSMask = NULL, *RHSMask = NULL;
  Constant *LHSShift = NULL, *RHSShift = NULL;
  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (LHS) { LHS->dump(); }
      if (LHSSrc) { LHSSrc->dump(); }
      if (LHSMask) { LHSMask->dump(); }
      if (LHSShift) { LHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
    if (mDebug) {
      dbgs() << "Found an OR Operation that failed setup!\n";
      inst->dump();
      if (RHS) { RHS->dump(); }
      if (RHSSrc) { RHSSrc->dump(); }
      if (RHSMask) { RHSMask->dump(); }
      if (RHSShift) { RHSShift->dump(); }
    }
    // There was an issue with the setup for BitInsert.
    return false;
  }
  if (mDebug) {
    dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
    dbgs() << "Op:        "; inst->dump();
    dbgs() << "LHS:       "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Src:   "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Mask:  "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS:       "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Src:   "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Mask:  "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
  }
  Constant *offset = NULL;
  Constant *width = NULL;
  int32_t lhsMaskVal = 0, rhsMaskVal = 0;
  int32_t lhsShiftVal = 0, rhsShiftVal = 0;
  int32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
  int32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
  lhsMaskVal = (int32_t)(LHSMask 
      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
  rhsMaskVal = (int32_t)(RHSMask 
      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
  lhsShiftVal = (int32_t)(LHSShift 
      ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
  rhsShiftVal = (int32_t)(RHSShift 
      ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
  lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
  rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
  lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
  rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
  // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
  if (mDebug) {
      dbgs() << "Found pattern: \'((A" << (LHSMask ? " & B)" : ")");
      dbgs() << (LHSShift ? " << C)" : ")") << " | ((D" ;
      dbgs() << (RHSMask ? " & E)" : ")");
      dbgs() << (RHSShift ? " << F)\'\n" : ")\'\n");
      dbgs() << "A = LHSSrc\t\tD = RHSSrc \n";
      dbgs() << "B = " << lhsMaskVal << "\t\tE = " << rhsMaskVal << "\n";
      dbgs() << "C = " << lhsShiftVal << "\t\tF = " << rhsShiftVal << "\n";
      dbgs() << "width(B) = " << lhsMaskWidth;
      dbgs() << "\twidth(E) = " << rhsMaskWidth << "\n";
      dbgs() << "offset(B) = " << lhsMaskOffset;
      dbgs() << "\toffset(E) = " << rhsMaskOffset << "\n";
      dbgs() << "Constraints: \n";
      dbgs() << "\t(1) B ^ E == 0\n";
      dbgs() << "\t(2-LHS) B is a mask\n";
      dbgs() << "\t(2-LHS) E is a mask\n";
      dbgs() << "\t(3-LHS) (offset(B)) >= (width(E) + offset(E))\n";
      dbgs() << "\t(3-RHS) (offset(E)) >= (width(B) + offset(B))\n";
  }
  if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
    if (mDebug) {
      dbgs() << lhsMaskVal << " ^ " << rhsMaskVal;
      dbgs() << " = " << (lhsMaskVal ^ rhsMaskVal) << "\n";
      dbgs() << "Failed constraint 1!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "LHS = " << lhsMaskOffset << "";
    dbgs() << " >= (" << rhsMaskWidth << " + " << rhsMaskOffset << ") = ";
    dbgs() << (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset));
    dbgs() << "\nRHS = " << rhsMaskOffset << "";
    dbgs() << " >= (" << lhsMaskWidth << " + " << lhsMaskOffset << ") = ";
    dbgs() << (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset));
    dbgs() << "\n";
  }
  if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
    offset = ConstantInt::get(aType, lhsMaskOffset, false);
    width = ConstantInt::get(aType, lhsMaskWidth, false);
    RHSSrc = RHS;
    if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Value is not a Mask: " << lhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!LHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    } else if (lhsShiftVal != lhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", LHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing LHS!\n";
    }
  } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
    offset = ConstantInt::get(aType, rhsMaskOffset, false);
    width = ConstantInt::get(aType, rhsMaskWidth, false);
    LHSSrc = RHSSrc;
    RHSSrc = LHS;
    if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
      if (mDebug) {
        dbgs() << "Non-Mask: " << rhsMaskVal << "\n";
        dbgs() << "Failed constraint 2!\n";
      }
      return false;
    }
    if (!RHSShift) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    } else if (rhsShiftVal != rhsMaskOffset) {
      LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
          "MaskShr", RHS);
    }
    if (mDebug) {
      dbgs() << "Optimizing RHS!\n";
    }
  } else {
    if (mDebug) {
      dbgs() << "Failed constraint 3!\n";
    }
    return false;
  }
  if (mDebug) {
    dbgs() << "Width:  "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
    dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
  }
  if (!offset || !width) {
    if (mDebug) {
      dbgs() << "Either width or offset are NULL, failed detection!\n";
    }
    return false;
  }
  // Lets create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "__amdil_ubit_insert";
  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
  Function *Func = 
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
        getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[4] = {
    width,
    offset,
    LHSSrc,
    RHSSrc
  };
  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
  if (mDebug) {
    dbgs() << "Old Inst: ";
    inst->dump();
    dbgs() << "New Inst: ";
    CI->dump();
    dbgs() << "\n\n";
  }
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool 
AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) 
{
  if (!inst) {
    return false;
  }
  if (!inst->isBinaryOp()) {
    return false;
  }
  if (inst->getOpcode() != Instruction::And) {
    return false;
  }
  if (optLevel == CodeGenOpt::None) {
    return false;
  }
  // We want to do some simple optimizations on Shift right/And patterns. The
  // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
  // value smaller than 32 and C is a mask. If C is a constant value, then the
  // following transformation can occur. For signed integers, it turns into the
  // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
  // integers, it turns into the function call dst =
  // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
  // Evergreen hardware.
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
    // This does not work on HD4XXX hardware.
    return false;
  }
  Type *aType = inst->getType();
  bool isVector = aType->isVectorTy();

  // XXX Support vector types
  if (isVector) {
    return false;
  }
  int numEle = 1;
  // This only works on 32bit integers
  if (aType->getScalarType()
      != Type::getInt32Ty(inst->getContext())) {
    return false;
  }
  if (isVector) {
    const VectorType *VT = dyn_cast<VectorType>(aType);
    numEle = VT->getNumElements();
    // We currently cannot support more than 4 elements in a intrinsic and we
    // cannot support Vec3 types.
    if (numEle > 4 || numEle == 3) {
      return false;
    }
  }
  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
  // If the first operand is not a shift instruction, then we can return as it
  // doesn't match this pattern.
  if (!ShiftInst || !ShiftInst->isShift()) {
    return false;
  }
  // If we are a shift left, then we need don't match this pattern.
  if (ShiftInst->getOpcode() == Instruction::Shl) {
    return false;
  }
  bool isSigned = ShiftInst->isArithmeticShift();
  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
  // Lets make sure that the shift value and the and mask are constant integers.
  if (!AndMask || !ShrVal) {
    return false;
  }
  Constant *newMaskConst;
  Constant *shiftValConst;
  if (isVector) {
    // Handle the vector case
    std::vector<Constant *> maskVals;
    std::vector<Constant *> shiftVals;
    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
    Type *scalarType = AndMaskVec->getType()->getScalarType();
    assert(AndMaskVec->getNumOperands() ==
           ShrValVec->getNumOperands() && "cannot have a "
           "combination where the number of elements to a "
           "shift and an and are different!");
    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
      if (!AndCI || !ShiftIC) {
        return false;
      }
      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
      if (!isMask_32(maskVal)) {
        return false;
      }
      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
      // If the mask or shiftval is greater than the bitcount, then break out.
      if (maskVal >= 32 || shiftVal >= 32) {
        return false;
      }
      // If the mask val is greater than the the number of original bits left
      // then this optimization is invalid.
      if (maskVal > (32 - shiftVal)) {
        return false;
      }
      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
    }
    newMaskConst = ConstantVector::get(maskVals);
    shiftValConst = ConstantVector::get(shiftVals);
  } else {
    // Handle the scalar case
    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
    // This must be a mask value where all lower bits are set to 1 and then any
    // bit higher is set to 0.
    if (!isMask_32(maskVal)) {
      return false;
    }
    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
    // Count the number of bits set in the mask, this is the width of the
    // resulting bit set that is extracted from the source value.
    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
    // If the mask or shift val is greater than the bitcount, then break out.
    if (maskVal >= 32 || shiftVal >= 32) {
      return false;
    }
    // If the mask val is greater than the the number of original bits left then
    // this optimization is invalid.
    if (maskVal > (32 - shiftVal)) {
      return false;
    }
    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
  }
  // Lets create the function signature.
  std::vector<Type *> callTypes;
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  callTypes.push_back(aType);
  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
  std::string name = "llvm.AMDIL.bit.extract.u32";
  if (isVector) {
    name += ".v" + itostr(numEle) + "i32";
  } else {
    name += ".";
  }
  // Lets create the function.
  Function *Func = 
    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
  Value *Operands[3] = {
    ShiftInst->getOperand(0),
    shiftValConst,
    newMaskConst
  };
  // Lets create the Call with the operands
  CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
  CI->setDoesNotAccessMemory();
  CI->insertBefore(inst);
  inst->replaceAllUsesWith(CI);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFI(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfi")) {
    return false;
  }
  Type* type = CI->getOperand(0)->getType();
  Constant *negOneConst = NULL;
  if (type->isVectorTy()) {
    std::vector<Constant *> negOneVals;
    negOneConst = ConstantInt::get(CI->getContext(), 
        APInt(32, StringRef("-1"), 10));
    for (size_t x = 0,
        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      negOneVals.push_back(negOneConst);
    }
    negOneConst = ConstantVector::get(negOneVals);
  } else {
    negOneConst = ConstantInt::get(CI->getContext(), 
        APInt(32, StringRef("-1"), 10));
  }
  // __amdil_bfi => (A & B) | (~A & C)
  BinaryOperator *lhs = 
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
        CI->getOperand(1), "bfi_and", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
        "bfi_not", CI);
  rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
      "bfi_and", CI);
  lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::expandBFM(CallInst *CI)
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  if (!LHS->getName().startswith("__amdil_bfm")) {
    return false;
  }
  // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
  Constant *newMaskConst = NULL;
  Constant *newShiftConst = NULL;
  Type* type = CI->getOperand(0)->getType();
  if (type->isVectorTy()) {
    std::vector<Constant*> newMaskVals, newShiftVals;
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
    for (size_t x = 0,
        y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
      newMaskVals.push_back(newMaskConst);
      newShiftVals.push_back(newShiftConst);
    }
    newMaskConst = ConstantVector::get(newMaskVals);
    newShiftConst = ConstantVector::get(newShiftVals);
  } else {
    newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
    newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
  }
  BinaryOperator *lhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(0),
        newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
      lhs, "bfm_shl", CI);
  lhs = BinaryOperator::Create(Instruction::Sub, lhs,
      newShiftConst, "bfm_sub", CI);
  BinaryOperator *rhs =
    BinaryOperator::Create(Instruction::And, CI->getOperand(1),
        newMaskConst, "bfm_mask", CI);
  lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
  CI->replaceAllUsesWith(lhs);
  return true;
}

bool
AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) 
{
  Instruction *inst = (*bbb);
  if (optimizeCallInst(bbb)) {
    return true;
  }
  if (optimizeBitExtract(inst)) {
    return false;
  }
  if (optimizeBitInsert(inst)) {
    return false;
  }
  if (correctMisalignedMemOp(inst)) {
    return false;
  }
  return false;
}
bool
AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst)
{
  LoadInst *linst = dyn_cast<LoadInst>(inst);
  StoreInst *sinst = dyn_cast<StoreInst>(inst);
  unsigned alignment;
  Type* Ty = inst->getType();
  if (linst) {
    alignment = linst->getAlignment();
    Ty = inst->getType();
  } else if (sinst) {
    alignment = sinst->getAlignment();
    Ty = sinst->getValueOperand()->getType();
  } else {
    return false;
  }
  unsigned size = getTypeSize(Ty);
  if (size == alignment || size < alignment) {
    return false;
  }
  if (!Ty->isStructTy()) {
    return false;
  }
  if (alignment < 4) {
    if (linst) {
      linst->setAlignment(0);
      return true;
    } else if (sinst) {
      sinst->setAlignment(0);
      return true;
    }
  }
  return false;
}
bool 
AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) 
{
  if (!CI) {
    return false;
  }
  Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
  std::string namePrefix = LHS->getName().substr(0, 14);
  if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
      && namePrefix != "__amdil__imul24_high") {
    return false;
  }
  if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
    return false;
  }
  return true;
}

void 
AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) 
{
  assert(isSigned24BitOps(CI) && "Must be a "
      "signed 24 bit operation to call this function!");
  Value *LHS = CI->getOperand(CI->getNumOperands()-1);
  // On 7XX and 8XX we do not have signed 24bit, so we need to
  // expand it to the following:
  // imul24 turns into 32bit imul
  // imad24 turns into 32bit imad
  // imul24_high turns into 32bit imulhigh
  if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
    Type *aType = CI->getOperand(0)->getType();
    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    callTypes.push_back(CI->getOperand(2)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imad";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[3] = {
      CI->getOperand(0),
      CI->getOperand(1),
      CI->getOperand(2)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
    BinaryOperator *mulOp =
      BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
          CI->getOperand(1), "imul24", CI);
    CI->replaceAllUsesWith(mulOp);
  } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
    Type *aType = CI->getOperand(0)->getType();

    bool isVector = aType->isVectorTy();
    int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
    std::vector<Type*> callTypes;
    callTypes.push_back(CI->getOperand(0)->getType());
    callTypes.push_back(CI->getOperand(1)->getType());
    FunctionType *funcType =
      FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
    std::string name = "__amdil_imul_high";
    if (isVector) {
      name += "_v" + itostr(numEle) + "i32";
    } else {
      name += "_i32";
    }
    Function *Func = dyn_cast<Function>(
                       CI->getParent()->getParent()->getParent()->
                       getOrInsertFunction(llvm::StringRef(name), funcType));
    Value *Operands[2] = {
      CI->getOperand(0),
      CI->getOperand(1)
    };
    CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
    nCI->insertBefore(CI);
    CI->replaceAllUsesWith(nCI);
  }
}

bool 
AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) 
{
  return (CI != NULL
          && CI->getOperand(CI->getNumOperands() - 1)->getName() 
          == "__amdil_get_local_size_int");
}

bool 
AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) 
{
  if (!CI) {
    return false;
  }
  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
      && (mSTM->getDeviceName() == "cayman")) {
    return false;
  }
  return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) 
      == "__amdil_improved_div";
}

void 
AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) 
{
  assert(convertAccurateDivide(CI)
         && "expanding accurate divide can only happen if it is expandable!");
  BinaryOperator *divOp =
    BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
                           CI->getOperand(1), "fdiv32", CI);
  CI->replaceAllUsesWith(divOp);
}

bool
AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI)
{
  if (optLevel != CodeGenOpt::None) {
    return false;
  }

  if (!CI) {
    return false;
  }

  unsigned funcNameIdx = 0;
  funcNameIdx = CI->getNumOperands() - 1;
  StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
  if (calleeName != "__amdil_image2d_read_norm"
   && calleeName != "__amdil_image2d_read_unnorm"
   && calleeName != "__amdil_image3d_read_norm"
   && calleeName != "__amdil_image3d_read_unnorm") {
    return false;
  }

  unsigned samplerIdx = 2;
  samplerIdx = 1;
  Value *sampler = CI->getOperand(samplerIdx);
  LoadInst *lInst = dyn_cast<LoadInst>(sampler);
  if (!lInst) {
    return false;
  }

  if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
    return false;
  }

  GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
  // If we are loading from what is not a global value, then we
  // fail and return.
  if (!gv) {
    return false;
  }

  // If we don't have an initializer or we have an initializer and
  // the initializer is not a 32bit integer, we fail.
  if (!gv->hasInitializer() 
      || !gv->getInitializer()->getType()->isIntegerTy(32)) {
      return false;
  }

  // Now that we have the global variable initializer, lets replace
  // all uses of the load instruction with the samplerVal and
  // reparse the __amdil_is_constant() function.
  Constant *samplerVal = gv->getInitializer();
  lInst->replaceAllUsesWith(samplerVal);
  return true;
}

bool 
AMDGPUPeepholeOpt::doInitialization(Module &M) 
{
  return false;
}

bool 
AMDGPUPeepholeOpt::doFinalization(Module &M) 
{
  return false;
}

void 
AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const 
{
  AU.addRequired<MachineFunctionAnalysis>();
  FunctionPass::getAnalysisUsage(AU);
  AU.setPreservesAll();
}

size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
  size_t size = 0;
  if (!T) {
    return size;
  }
  switch (T->getTypeID()) {
  case Type::X86_FP80TyID:
  case Type::FP128TyID:
  case Type::PPC_FP128TyID:
  case Type::LabelTyID:
    assert(0 && "These types are not supported by this backend");
  default:
  case Type::FloatTyID:
  case Type::DoubleTyID:
    size = T->getPrimitiveSizeInBits() >> 3;
    break;
  case Type::PointerTyID:
    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
    break;
  case Type::IntegerTyID:
    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
    break;
  case Type::StructTyID:
    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
    break;
  case Type::ArrayTyID:
    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
    break;
  case Type::FunctionTyID:
    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
    break;
  case Type::VectorTyID:
    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
    break;
  };
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
    bool dereferencePtr) {
  size_t size = 0;
  if (!ST) {
    return size;
  }
  Type *curType;
  StructType::element_iterator eib;
  StructType::element_iterator eie;
  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
    curType = *eib;
    size += getTypeSize(curType, dereferencePtr);
  }
  return size;
}

size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
    bool dereferencePtr) {
  return IT ? (IT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
    bool dereferencePtr) {
    assert(0 && "Should not be able to calculate the size of an function type");
    return 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
    bool dereferencePtr) {
  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
                                    dereferencePtr) * AT->getNumElements())
                     : 0);
}

size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
    bool dereferencePtr) {
  return VT ? (VT->getBitWidth() >> 3) : 0;
}

size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
    bool dereferencePtr) {
  if (!PT) {
    return 0;
  }
  Type *CT = PT->getElementType();
  if (CT->getTypeID() == Type::StructTyID &&
      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
    return getTypeSize(dyn_cast<StructType>(CT));
  } else if (dereferencePtr) {
    size_t size = 0;
    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
    }
    return size;
  } else {
    return 4;
  }
}

size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
    bool dereferencePtr) {
  //assert(0 && "Should not be able to calculate the size of an opaque type");
  return 4;
}