//===-- EfficiencySanitizer.cpp - performance tuner -----------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file is a part of EfficiencySanitizer, a family of performance tuners
// that detects multiple performance issues via separate sub-tools.
//
// The instrumentation phase is straightforward:
//   - Take action on every memory access: either inlined instrumentation,
//     or inserted calls to our run-time library.
//   - Optimizations may apply to avoid instrumenting some of the accesses.
//   - Turn mem{set,cpy,move} intrinsics into library calls.
// The rest is handled by the run-time library.
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Instrumentation.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"

using namespace llvm;

#define DEBUG_TYPE "esan"

// Tool-selection flags.
// The tool type must be just one of these ClTool* options, as the tools
// cannot be combined due to shadow memory constraints.
// OverrideOptionsFromCL() consults them in declaration order, so if both are
// given, -esan-cache-frag takes effect.
static cl::opt<bool>
    ClToolCacheFrag("esan-cache-frag", cl::init(false),
                    cl::desc("Detect data cache fragmentation"), cl::Hidden);
static cl::opt<bool>
    ClToolWorkingSet("esan-working-set", cl::init(false),
                    cl::desc("Measure the working set size"), cl::Hidden);
// Each new tool will get its own opt flag here.
// These are converted to EfficiencySanitizerOptions for use
// in the code.

// Flags that switch individual kinds of instrumentation on or off. All of
// them default to enabled.

// Gates the instrumentation of the loads, stores, and atomic operations
// collected by runOnFunction().
static cl::opt<bool> ClInstrumentLoadsAndStores(
    "esan-instrument-loads-and-stores", cl::init(true),
    cl::desc("Instrument loads and stores"), cl::Hidden);
// Gates the replacement of mem{set,cpy,move} intrinsics with library calls.
static cl::opt<bool> ClInstrumentMemIntrinsics(
    "esan-instrument-memintrinsics", cl::init(true),
    cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
// When set, instrumentLoadOrStore() tries the tool-specific fastpath before
// falling back to a run-time library callout.
static cl::opt<bool> ClInstrumentFastpath(
    "esan-instrument-fastpath", cl::init(true),
    cl::desc("Instrument fastpath"), cl::Hidden);
// When set, the cache-fragmentation tool also emits per-field type name,
// offset, and size arrays for each struct (see createCacheFragAuxGV()).
static cl::opt<bool> ClAuxFieldInfo(
    "esan-aux-field-info", cl::init(true),
    cl::desc("Generate binary with auxiliary struct field information"),
    cl::Hidden);

// Experiments show that the performance difference can be 2x or more,
// and accuracy loss is typically negligible, so we turn this on by default.
// NOTE(review): the consumer of this flag is not in this part of the file;
// presumably it is read by the per-tool fastpath routines -- confirm.
static cl::opt<bool> ClAssumeIntraCacheLine(
    "esan-assume-intra-cache-line", cl::init(true),
    cl::desc("Assume each memory access touches just one cache line, for "
             "better performance but with a potential loss of accuracy."),
    cl::Hidden);

// Pass statistics, bumped by the instrumentation routines below as they
// process each memory access, struct type, and GEP instruction.
STATISTIC(NumInstrumentedLoads, "Number of instrumented loads");
STATISTIC(NumInstrumentedStores, "Number of instrumented stores");
STATISTIC(NumFastpaths, "Number of instrumented fastpaths");
STATISTIC(NumAccessesWithIrregularSize,
          "Number of accesses with a size outside our targeted callout sizes");
STATISTIC(NumIgnoredStructs, "Number of ignored structs");
STATISTIC(NumIgnoredGEPs, "Number of ignored GEP instructions");
STATISTIC(NumInstrumentedGEPs, "Number of instrumented GEP instructions");
STATISTIC(NumAssumedIntraCacheLine,
          "Number of accesses assumed to be intra-cache-line");

// Priority used when registering the module constructor and destructor in
// the global ctor/dtor lists.
static const uint64_t EsanCtorAndDtorPriority = 0;
// Names of the per-module constructor and destructor functions this pass
// creates.
static const char *const EsanModuleCtorName = "esan.module_ctor";
static const char *const EsanModuleDtorName = "esan.module_dtor";
// Run-time library entry points called from the module constructor and
// destructor, respectively.
static const char *const EsanInitName = "__esan_init";
static const char *const EsanExitName = "__esan_exit";

// We need to specify the tool to the runtime earlier than
// the ctor is called in some cases, so we set a global variable.
static const char *const EsanWhichToolName = "__esan_which_tool";

// We must keep these Shadow* constants consistent with the esan runtime.
// FIXME: Try to place these shadow constants, the names of the __esan_*
// interface functions, and the ToolType enum into a header shared between
// llvm and compiler-rt.
// appToShadow() combines them as: Shadow = ((App & Mask) + Offs) >> Scale.
static const uint64_t ShadowMask = 0x00000fffffffffffull;
// appToShadow() reads ShadowOffs[Scale] directly for scales 0 through 2; for
// larger scales it uses ShadowOffs[0] shifted left by the scale.
static const uint64_t ShadowOffs[3] = { // Indexed by scale
  0x0000130000000000ull,
  0x0000220000000000ull,
  0x0000440000000000ull,
};
// This array is indexed by the ToolType enum.
// Each entry is the right-shift applied when mapping an application address
// to its shadow address.
static const int ShadowScale[] = {
  0, // ESAN_None.
  2, // ESAN_CacheFrag: 4B:1B, so 4 to 1 == >>2.
  6, // ESAN_WorkingSet: 64B:1B, so 64 to 1 == >>6.
};

// MaxStructCounterNameSize is a soft size limit to avoid insanely long
// names for those extremely large structs.
// It is "soft" because createStructCounterName() only stops appending field
// type ids once the limit is reached, so a name may end up slightly longer.
static const unsigned MaxStructCounterNameSize = 512;

namespace {

// Resolves which tool to run. A tool flag given on the command line wins over
// the ToolType supplied by the caller (cache-frag is checked first), and a
// ToolType still left at ESAN_None falls back to the cache-frag tool.
static EfficiencySanitizerOptions
OverrideOptionsFromCL(EfficiencySanitizerOptions Options) {
  // An explicit command-line selection settles the question immediately.
  if (ClToolCacheFrag) {
    Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
    return Options;
  }
  if (ClToolWorkingSet) {
    Options.ToolType = EfficiencySanitizerOptions::ESAN_WorkingSet;
    return Options;
  }

  // Direct opt invocation with no params leaves the default ESAN_None, in
  // which case we run the default tool.
  if (Options.ToolType == EfficiencySanitizerOptions::ESAN_None)
    Options.ToolType = EfficiencySanitizerOptions::ESAN_CacheFrag;
  return Options;
}

// Emits Str into M as an unnamed, private, constant string global so that its
// address can be passed to the run-time library. When AllowMerging is set the
// global is marked unnamed_addr so identical strings may be merged.
static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
                                                    bool AllowMerging) {
  Constant *Initializer = ConstantDataArray::getString(M.getContext(), Str);
  // Module-local strings get private linkage.
  auto *StrGV = new GlobalVariable(M, Initializer->getType(),
                                   /*isConstant=*/true,
                                   GlobalValue::PrivateLinkage, Initializer,
                                   "");
  if (AllowMerging)
    StrGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  // Strings may not be merged w/o setting align 1.
  StrGV->setAlignment(1);
  return StrGV;
}

/// EfficiencySanitizer: instrument each module to find performance issues.
///
/// The pass creates a module constructor/destructor pair that calls into the
/// run-time library, then walks every function and instruments its memory
/// accesses, memory intrinsics, and (for the cache-fragmentation tool) its
/// struct-related GEP instructions.
class EfficiencySanitizer : public ModulePass {
public:
  /// Builds the pass; the tool in \p Opts may be overridden by command-line
  /// flags (see OverrideOptionsFromCL).
  EfficiencySanitizer(
      const EfficiencySanitizerOptions &Opts = EfficiencySanitizerOptions())
      : ModulePass(ID), Options(OverrideOptionsFromCL(Opts)) {}
  const char *getPassName() const override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnModule(Module &M) override;
  static char ID;

private:
  // Creates the module ctor/dtor and the tool-identifying global.
  bool initOnModule(Module &M);
  // Declares the run-time library functions that instrumentation calls.
  void initializeCallbacks(Module &M);
  // Returns true for null or opaque struct types.
  bool shouldIgnoreStructType(StructType *StructTy);
  // Appends the counter-array name for StructTy to NameStr.
  void createStructCounterName(
      StructType *StructTy, SmallString<MaxStructCounterNameSize> &NameStr);
  // Creates the per-field type name, offset, and size arrays for StructTy and
  // returns them through the three reference parameters.
  void createCacheFragAuxGV(
    Module &M, const DataLayout &DL, StructType *StructTy,
    GlobalVariable *&TypeNames, GlobalVariable *&Offsets, GlobalVariable *&Size);
  // Creates the CacheFragInfo global describing every tracked struct type.
  GlobalVariable *createCacheFragInfoGV(Module &M, const DataLayout &DL,
                                        Constant *UnitName);
  // Creates the tool-specific argument passed to the init and exit routines.
  Constant *createEsanInitToolInfoArg(Module &M, const DataLayout &DL);
  // Creates the module destructor that calls the run-time exit routine.
  void createDestructor(Module &M, Constant *ToolInfoArg);
  // Instruments a single function; returns true if anything was changed.
  bool runOnFunction(Function &F, Module &M);
  bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL);
  bool instrumentMemIntrinsic(MemIntrinsic *MI);
  bool instrumentGetElementPtr(Instruction *I, Module &M);
  // Inserts an increment of counter CounterIdx of StructTy before I.
  bool insertCounterUpdate(Instruction *I, StructType *StructTy,
                           unsigned CounterIdx);
  // Index of the first per-field counter within a struct's counter array.
  unsigned getFieldCounterIdx(StructType *StructTy) {
    return 0;
  }
  // Index of the array-access counter, which follows the per-field counters.
  unsigned getArrayCounterIdx(StructType *StructTy) {
    return StructTy->getNumElements();
  }
  // Total number of counters kept for StructTy.
  unsigned getStructCounterSize(StructType *StructTy) {
    // The struct counter array includes:
    // - one counter for each struct field,
    // - one counter for the struct access within an array.
    return (StructTy->getNumElements()/*field*/ + 1/*array*/);
  }
  bool shouldIgnoreMemoryAccess(Instruction *I);
  // Maps the access size behind Addr to an index into the per-size callout
  // arrays, or returns -1 for sizes without a dedicated callout.
  int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
  // Emits IR that translates an application address into a shadow address.
  Value *appToShadow(Value *Shadow, IRBuilder<> &IRB);
  bool instrumentFastpath(Instruction *I, const DataLayout &DL, bool IsStore,
                          Value *Addr, unsigned Alignment);
  // Each tool has its own fastpath routine:
  bool instrumentFastpathCacheFrag(Instruction *I, const DataLayout &DL,
                                   Value *Addr, unsigned Alignment);
  bool instrumentFastpathWorkingSet(Instruction *I, const DataLayout &DL,
                                    Value *Addr, unsigned Alignment);

  // Options after command-line overrides have been applied.
  EfficiencySanitizerOptions Options;
  // Context of the module being instrumented; set by initOnModule().
  LLVMContext *Ctx;
  // Pointer-sized integer type for the module; set by initOnModule().
  Type *IntptrTy;
  // Our slowpath involves callouts to the runtime library.
  // Access sizes are powers of two: 1, 2, 4, 8, 16.
  static const size_t NumberOfAccessSizes = 5;
  // Per-size callouts, indexed by log2 of the access size in bytes.
  Function *EsanAlignedLoad[NumberOfAccessSizes];
  Function *EsanAlignedStore[NumberOfAccessSizes];
  Function *EsanUnalignedLoad[NumberOfAccessSizes];
  Function *EsanUnalignedStore[NumberOfAccessSizes];
  // For irregular sizes of any alignment:
  Function *EsanUnalignedLoadN, *EsanUnalignedStoreN;
  // Library functions that replace the corresponding memory intrinsics.
  Function *MemmoveFn, *MemcpyFn, *MemsetFn;
  // The module constructor and destructor created by this pass.
  Function *EsanCtorFunction;
  Function *EsanDtorFunction;
  // Remember the counter variable for each struct type to avoid
  // recomputing the variable name later during instrumentation.
  std::map<Type *, GlobalVariable *> StructTyMap;
};
} // namespace

// Definition of the pass ID declared in the class; it is handed to the
// ModulePass base-class constructor.
char EfficiencySanitizer::ID = 0;
// Register the pass under the name "esan" and declare its dependency on
// TargetLibraryInfoWrapperPass.
INITIALIZE_PASS_BEGIN(
    EfficiencySanitizer, "esan",
    "EfficiencySanitizer: finds performance issues.", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(
    EfficiencySanitizer, "esan",
    "EfficiencySanitizer: finds performance issues.", false, false)

// Returns the human-readable name of this pass.
const char *EfficiencySanitizer::getPassName() const {
  return "EfficiencySanitizer";
}

// Declares the analyses this pass needs: runOnFunction() queries the target
// library info for every function it visits.
void EfficiencySanitizer::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetLibraryInfoWrapperPass>();
}

// Public factory: returns a newly allocated EfficiencySanitizer pass
// configured with Options (subject to command-line overrides applied in the
// pass constructor).
ModulePass *
llvm::createEfficiencySanitizerPass(const EfficiencySanitizerOptions &Options) {
  return new EfficiencySanitizer(Options);
}

// Declares (or looks up) in M every library function the instrumentation may
// call and caches the resulting Function pointers in the pass members:
//   - per-size aligned/unaligned load/store callouts for sizes 1..16 bytes,
//   - the variable-size unaligned load/store callouts,
//   - memmove/memcpy/memset, which replace the memory intrinsics.
// IntptrTy must already be set (initOnModule() does that).
void EfficiencySanitizer::initializeCallbacks(Module &M) {
  IRBuilder<> IRB(M.getContext());

  // Declares "void <Prefix><SizeSuffix>(i8*)" and returns the function.
  auto DeclareSizedCallout = [&](const char *Prefix,
                                 const std::string &SizeSuffix) -> Function * {
    SmallString<32> CalloutName(Prefix + SizeSuffix);
    return checkSanitizerInterfaceFunction(M.getOrInsertFunction(
        CalloutName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
  };

  // We'll inline the most common (i.e., aligned and frequent sizes)
  // load + store instrumentation: these callouts are for the slowpath.
  for (size_t SizeLog2 = 0; SizeLog2 != NumberOfAccessSizes; ++SizeLog2) {
    const unsigned NumBytes = 1U << SizeLog2;
    const std::string Suffix = utostr(NumBytes);
    EsanAlignedLoad[SizeLog2] =
        DeclareSizedCallout("__esan_aligned_load", Suffix);
    EsanAlignedStore[SizeLog2] =
        DeclareSizedCallout("__esan_aligned_store", Suffix);
    EsanUnalignedLoad[SizeLog2] =
        DeclareSizedCallout("__esan_unaligned_load", Suffix);
    EsanUnalignedStore[SizeLog2] =
        DeclareSizedCallout("__esan_unaligned_store", Suffix);
  }

  // Callouts taking an explicit size, used for irregular access sizes.
  EsanUnalignedLoadN = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
      "__esan_unaligned_loadN", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
      nullptr));
  EsanUnalignedStoreN = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
      "__esan_unaligned_storeN", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
      nullptr));

  // Library routines that stand in for the memory intrinsics.
  MemmoveFn = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
      "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
      IntptrTy, nullptr));
  MemcpyFn = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
      "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
      IntptrTy, nullptr));
  MemsetFn = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
      "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
      IntptrTy, nullptr));
}

// Returns true when StructTy cannot be tracked: it is null, or it is opaque
// and therefore has no struct body.
bool EfficiencySanitizer::shouldIgnoreStructType(StructType *StructTy) {
  return StructTy == nullptr || StructTy->isOpaque();
}

// Appends to NameStr the counter-array name for StructTy, of the form
//   <name>#<NumFields>#<type id of last field>#...#<type id of first field>
// where <name> is "struct.anon" for unnamed structs. The field count and type
// ids are included to avoid conflicts between structs that share a name but
// differ in their fields. Literal structs get a trailing '#'.
void EfficiencySanitizer::createStructCounterName(
    StructType *StructTy, SmallString<MaxStructCounterNameSize> &NameStr) {
  NameStr +=
      StructTy->hasName() ? StructTy->getName() : StringRef("struct.anon");

  // We allow the actual size of the name to be larger than
  // MaxStructCounterNameSize: #NumFields and at least one field type id are
  // always appended.
  const unsigned NumFields = StructTy->getNumElements();
  NameStr += "#";
  Twine(NumFields).toVector(NameStr);

  // Field type ids go in reverse order, stopping once the soft limit is hit.
  for (unsigned Remaining = NumFields; Remaining != 0; --Remaining) {
    const unsigned Field = Remaining - 1;
    NameStr += "#";
    Twine(StructTy->getElementType(Field)->getTypeID()).toVector(NameStr);
    if (NameStr.size() >= MaxStructCounterNameSize)
      break;
  }

  // End with # for literal struct.
  if (StructTy->isLiteral())
    NameStr += "#";
}

// Create global variables with auxiliary information (e.g., struct field size,
// offset, and type name) for better user report.
//
// For StructTy this creates three internal, constant arrays in M, each with
// one entry per struct field, and returns them through the out-parameters:
//   TypeName - i8* pointers to the printed type of each field,
//   Offset   - i32 byte offset of each field within the struct,
//   Size     - i32 allocation size of each field.
void EfficiencySanitizer::createCacheFragAuxGV(
    Module &M, const DataLayout &DL, StructType *StructTy,
    GlobalVariable *&TypeName, GlobalVariable *&Offset,
    GlobalVariable *&Size) {
  auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
  auto *Int32Ty = Type::getInt32Ty(*Ctx);
  const unsigned NumFields = StructTy->getNumElements();
  // The struct layout is the same for every field, so look it up once rather
  // than on each loop iteration.
  const auto *Layout = DL.getStructLayout(StructTy);

  // The arrays are created without initializers first; the initializers are
  // attached once the per-field constants have been collected.
  // FieldTypeName.
  auto *TypeNameArrayTy = ArrayType::get(Int8PtrTy, NumFields);
  TypeName = new GlobalVariable(M, TypeNameArrayTy, true,
                                GlobalVariable::InternalLinkage, nullptr);
  // FieldOffset.
  auto *OffsetArrayTy = ArrayType::get(Int32Ty, NumFields);
  Offset = new GlobalVariable(M, OffsetArrayTy, true,
                              GlobalVariable::InternalLinkage, nullptr);
  // FieldSize.
  auto *SizeArrayTy = ArrayType::get(Int32Ty, NumFields);
  Size = new GlobalVariable(M, SizeArrayTy, true,
                            GlobalVariable::InternalLinkage, nullptr);

  SmallVector<Constant *, 16> TypeNameVec;
  SmallVector<Constant *, 16> OffsetVec;
  SmallVector<Constant *, 16> SizeVec;
  TypeNameVec.reserve(NumFields);
  OffsetVec.reserve(NumFields);
  SizeVec.reserve(NumFields);

  for (unsigned i = 0; i < NumFields; ++i) {
    Type *Ty = StructTy->getElementType(i);
    // Render the field's type as text for the report.
    std::string Str;
    raw_string_ostream StrOS(Str);
    Ty->print(StrOS);
    TypeNameVec.push_back(
        ConstantExpr::getPointerCast(
            createPrivateGlobalForString(M, StrOS.str(), true),
            Int8PtrTy));
    OffsetVec.push_back(
        ConstantInt::get(Int32Ty, Layout->getElementOffset(i)));
    SizeVec.push_back(ConstantInt::get(Int32Ty,
                                       DL.getTypeAllocSize(Ty)));
  }

  TypeName->setInitializer(ConstantArray::get(TypeNameArrayTy, TypeNameVec));
  Offset->setInitializer(ConstantArray::get(OffsetArrayTy, OffsetVec));
  Size->setInitializer(ConstantArray::get(SizeArrayTy, SizeVec));
}

// Create the global variable for the cache-fragmentation tool.
//
// For every identified, non-opaque struct type in M this creates a weak
// counter array (one u64 per field plus one for array accesses), records it in
// StructTyMap, and adds a StructInfo entry describing the struct. The entries
// are gathered into an internal array referenced from the returned
// CacheFragInfo global, together with UnitName and the number of structs.
GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV(
    Module &M, const DataLayout &DL, Constant *UnitName) {
  assert(Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag);

  auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
  auto *Int8PtrPtrTy = Int8PtrTy->getPointerTo();
  auto *Int32Ty = Type::getInt32Ty(*Ctx);
  auto *Int32PtrTy = Type::getInt32PtrTy(*Ctx);
  auto *Int64Ty = Type::getInt64Ty(*Ctx);
  auto *Int64PtrTy = Type::getInt64PtrTy(*Ctx);
  // This structure should be kept consistent with the StructInfo struct
  // in the runtime library.
  // struct StructInfo {
  //   const char *StructName;
  //   u32 Size;
  //   u32 NumFields;
  //   u32 *FieldOffset;           // auxiliary struct field info.
  //   u32 *FieldSize;             // auxiliary struct field info.
  //   const char **FieldTypeName; // auxiliary struct field info.
  //   u64 *FieldCounters;
  //   u64 *ArrayCounter;
  // };
  auto *StructInfoTy =
    StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy,
                    Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr);
  auto *StructInfoPtrTy = StructInfoTy->getPointerTo();
  // This structure should be kept consistent with the CacheFragInfo struct
  // in the runtime library.
  // struct CacheFragInfo {
  //   const char *UnitName;
  //   u32 NumStructs;
  //   StructInfo *Structs;
  // };
  auto *CacheFragInfoTy =
    StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr);

  std::vector<StructType *> Vec = M.getIdentifiedStructTypes();
  // Number of struct types that actually get a StructInfo entry.
  unsigned NumStructs = 0;
  // One StructInfo constant per tracked struct type.
  SmallVector<Constant *, 16> Initializers;

  for (auto &StructTy : Vec) {
    if (shouldIgnoreStructType(StructTy)) {
      ++NumIgnoredStructs;
      continue;
    }
    ++NumStructs;

    // StructName.
    SmallString<MaxStructCounterNameSize> CounterNameStr;
    createStructCounterName(StructTy, CounterNameStr);
    GlobalVariable *StructCounterName = createPrivateGlobalForString(
        M, CounterNameStr, /*AllowMerging*/true);

    // Counters.
    // We create the counter array with StructCounterName and weak linkage
    // so that the structs with the same name and layout from different
    // compilation units will be merged into one.
    auto *CounterArrayTy = ArrayType::get(Int64Ty,
                                          getStructCounterSize(StructTy));
    GlobalVariable *Counters =
      new GlobalVariable(M, CounterArrayTy, false,
                         GlobalVariable::WeakAnyLinkage,
                         ConstantAggregateZero::get(CounterArrayTy),
                         CounterNameStr);

    // Remember the counter variable for each struct type.
    StructTyMap.insert(std::pair<Type *, GlobalVariable *>(StructTy, Counters));

    // We pass the field type name array, offset array, and size array to
    // the runtime for better reporting.
    // They stay null (and are emitted as null pointers below) when the
    // auxiliary field info is disabled.
    GlobalVariable *TypeName = nullptr, *Offset = nullptr, *Size = nullptr;
    if (ClAuxFieldInfo)
      createCacheFragAuxGV(M, DL, StructTy, TypeName, Offset, Size);

    // GEP indices addressing the first field counter and the array counter
    // inside the counter array.
    Constant *FieldCounterIdx[2];
    FieldCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
    FieldCounterIdx[1] = ConstantInt::get(Int32Ty,
                                          getFieldCounterIdx(StructTy));
    Constant *ArrayCounterIdx[2];
    ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0);
    ArrayCounterIdx[1] = ConstantInt::get(Int32Ty,
                                          getArrayCounterIdx(StructTy));
    // The operands below follow the StructInfo field order documented above.
    Initializers.push_back(
        ConstantStruct::get(
            StructInfoTy,
            ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy),
            ConstantInt::get(Int32Ty,
                             DL.getStructLayout(StructTy)->getSizeInBytes()),
            ConstantInt::get(Int32Ty, StructTy->getNumElements()),
            Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
                ConstantExpr::getPointerCast(Offset, Int32PtrTy),
            Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) :
                ConstantExpr::getPointerCast(Size, Int32PtrTy),
            TypeName == nullptr ? ConstantPointerNull::get(Int8PtrPtrTy) :
                ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy),
            ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
                                           FieldCounterIdx),
            ConstantExpr::getGetElementPtr(CounterArrayTy, Counters,
                                           ArrayCounterIdx),
            nullptr));
  }
  // Structs.
  // With no tracked structs the Structs pointer is null; otherwise it points
  // at an internal array holding the collected StructInfo entries.
  Constant *StructInfo;
  if (NumStructs == 0) {
    StructInfo = ConstantPointerNull::get(StructInfoPtrTy);
  } else {
    auto *StructInfoArrayTy = ArrayType::get(StructInfoTy, NumStructs);
    StructInfo = ConstantExpr::getPointerCast(
        new GlobalVariable(M, StructInfoArrayTy, false,
                           GlobalVariable::InternalLinkage,
                           ConstantArray::get(StructInfoArrayTy, Initializers)),
        StructInfoPtrTy);
  }

  auto *CacheFragInfoGV = new GlobalVariable(
      M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage,
      ConstantStruct::get(CacheFragInfoTy,
                          UnitName,
                          ConstantInt::get(Int32Ty, NumStructs),
                          StructInfo,
                          nullptr));
  return CacheFragInfoGV;
}

// Builds the i8* argument handed to both the init and exit routines of the
// run-time library. It points at a tool-specific structure describing this
// compilation unit (module); only the cache-fragmentation tool has one, so
// every other tool gets a null pointer.
Constant *EfficiencySanitizer::createEsanInitToolInfoArg(Module &M,
                                                         const DataLayout &DL) {
  auto *Int8PtrTy = Type::getInt8PtrTy(*Ctx);

  // Compilation unit name.
  GlobalVariable *UnitNameGV =
      createPrivateGlobalForString(M, M.getModuleIdentifier(), true);
  auto *UnitName = ConstantExpr::getPointerCast(UnitNameGV, Int8PtrTy);

  // No tool-specific variable: pass a null pointer.
  if (Options.ToolType != EfficiencySanitizerOptions::ESAN_CacheFrag)
    return ConstantPointerNull::get(Int8PtrTy);

  GlobalVariable *ToolInfoGV = createCacheFragInfoGV(M, DL, UnitName);
  if (ToolInfoGV == nullptr)
    return ConstantPointerNull::get(Int8PtrTy);
  return ConstantExpr::getPointerCast(ToolInfoGV, Int8PtrTy);
}

// Creates the internal module destructor, whose body is a single call to the
// run-time exit routine with ToolInfoArg, and registers it in the module's
// global destructor list.
void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
  PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
  Type *VoidTy = Type::getVoidTy(*Ctx);

  // void esan.module_dtor() { ret void }
  FunctionType *DtorTy = FunctionType::get(VoidTy, /*isVarArg=*/false);
  EsanDtorFunction = Function::Create(DtorTy, GlobalValue::InternalLinkage,
                                      EsanModuleDtorName, &M);
  BasicBlock *EntryBB = BasicBlock::Create(*Ctx, "", EsanDtorFunction);
  ReturnInst *Ret = ReturnInst::Create(*Ctx, EntryBB);

  // Insert the exit call right before the return.
  IRBuilder<> IRB_Dtor(Ret);
  Function *EsanExit = checkSanitizerInterfaceFunction(
      M.getOrInsertFunction(EsanExitName, VoidTy, Int8PtrTy, nullptr));
  EsanExit->setLinkage(Function::ExternalLinkage);
  IRB_Dtor.CreateCall(EsanExit, {ToolInfoArg});

  appendToGlobalDtors(M, EsanDtorFunction, EsanCtorAndDtorPriority);
}

// Per-module setup: caches the context and pointer-sized integer type, creates
// the tool-info argument, the module constructor and destructor, and the
// global that names the selected tool. Always reports the module as changed.
bool EfficiencySanitizer::initOnModule(Module &M) {
  Ctx = &M.getContext();
  const DataLayout &DL = M.getDataLayout();
  IntegerType *OrdTy = Type::getInt32Ty(*Ctx);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx);
  IntptrTy = DL.getIntPtrType(*Ctx);

  // Create the variable passed to EsanInit and EsanExit.
  Constant *ToolInfoArg = createEsanInitToolInfoArg(M, DL);
  Constant *ToolTypeVal =
      ConstantInt::get(OrdTy, static_cast<int>(Options.ToolType));

  // Constructor.
  // We specify the tool type both in the EsanWhichToolName global
  // and as an arg to the init routine as a sanity check.
  Type *InitArgTypes[] = {OrdTy, Int8PtrTy};
  Value *InitArgs[] = {ToolTypeVal, ToolInfoArg};
  std::tie(EsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
      M, EsanModuleCtorName, EsanInitName, InitArgTypes, InitArgs);
  appendToGlobalCtors(M, EsanCtorFunction, EsanCtorAndDtorPriority);

  // Destructor.
  createDestructor(M, ToolInfoArg);

  // Weak constant global through which the runtime can learn the tool type
  // before the constructor runs.
  new GlobalVariable(M, OrdTy, /*isConstant=*/true,
                     GlobalValue::WeakAnyLinkage, ToolTypeVal,
                     EsanWhichToolName);

  return true;
}

// Emits, through IRB, the instructions that translate the application address
// in Shadow (already an IntptrTy integer) into its shadow address:
//   Shadow = ((App & Mask) + Offs) >> Scale
// Scale comes from ShadowScale for the selected tool. Offs is ShadowOffs[Scale]
// for scales up to 2, and ShadowOffs[0] << Scale for larger scales.
Value *EfficiencySanitizer::appToShadow(Value *Shadow, IRBuilder<> &IRB) {
  const int Scale = ShadowScale[Options.ToolType];
  const uint64_t Offs =
      (Scale <= 2) ? ShadowOffs[Scale] : (ShadowOffs[0] << Scale);

  Value *Masked =
      IRB.CreateAnd(Shadow, ConstantInt::get(IntptrTy, ShadowMask));
  Value *Biased = IRB.CreateAdd(Masked, ConstantInt::get(IntptrTy, Offs));
  // A zero scale needs no shift at all.
  if (Scale <= 0)
    return Biased;
  return IRB.CreateLShr(Biased, Scale);
}

// Tells whether the memory access I should be left uninstrumented for the
// selected tool. No tool currently ignores any access, so this always returns
// false.
bool EfficiencySanitizer::shouldIgnoreMemoryAccess(Instruction *I) {
  switch (Options.ToolType) {
  case EfficiencySanitizerOptions::ESAN_CacheFrag:
    // We'd like to know about cache fragmentation in vtable accesses and
    // constant data references, so we do not currently ignore anything.
    return false;
  case EfficiencySanitizerOptions::ESAN_WorkingSet:
    // TODO: the instrumentation disturbs the data layout on the stack, so we
    // may want to add an option to ignore stack references (if we can
    // distinguish them) to reduce overhead.
    return false;
  default:
    // TODO(bruening): future tools will be returning true for some cases.
    return false;
  }
}

// Pass entry point: performs the per-module setup, declares the run-time
// callbacks, then instruments every function in M. Returns true if the module
// was modified.
bool EfficiencySanitizer::runOnModule(Module &M) {
  bool Changed = initOnModule(M);
  initializeCallbacks(M);
  for (Function &Fn : M)
    Changed |= runOnFunction(Fn, M);
  return Changed;
}

// Instruments one function. The instructions of interest are first collected
// into work lists and only then instrumented, so the instrumentation itself is
// never revisited. Returns true if F was modified.
bool EfficiencySanitizer::runOnFunction(Function &F, Module &M) {
  // This is required to prevent instrumenting the call to __esan_init from
  // within the module constructor.
  if (&F == EsanCtorFunction)
    return false;

  const DataLayout &DL = M.getDataLayout();
  const TargetLibraryInfo *TLI =
      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();

  SmallVector<Instruction *, 8> MemAccesses;
  SmallVector<Instruction *, 8> MemIntrinsics;
  SmallVector<Instruction *, 8> Geps;

  // Collection phase.
  for (BasicBlock &Block : F) {
    for (Instruction &Cur : Block) {
      const bool IsMemAccess = isa<LoadInst>(Cur) || isa<StoreInst>(Cur) ||
                               isa<AtomicRMWInst>(Cur) ||
                               isa<AtomicCmpXchgInst>(Cur);
      if (IsMemAccess && !shouldIgnoreMemoryAccess(&Cur)) {
        MemAccesses.push_back(&Cur);
        continue;
      }
      if (isa<MemIntrinsic>(Cur)) {
        MemIntrinsics.push_back(&Cur);
        continue;
      }
      if (isa<GetElementPtrInst>(Cur)) {
        Geps.push_back(&Cur);
        continue;
      }
      if (auto *Call = dyn_cast<CallInst>(&Cur))
        maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
    }
  }

  // Instrumentation phase.
  bool Changed = false;

  if (ClInstrumentLoadsAndStores)
    for (Instruction *Access : MemAccesses)
      Changed |= instrumentLoadOrStore(Access, DL);

  if (ClInstrumentMemIntrinsics)
    for (Instruction *Intrin : MemIntrinsics)
      Changed |= instrumentMemIntrinsic(cast<MemIntrinsic>(Intrin));

  // Struct counters are only maintained by the cache-fragmentation tool.
  if (Options.ToolType == EfficiencySanitizerOptions::ESAN_CacheFrag)
    for (Instruction *Gep : Geps)
      Changed |= instrumentGetElementPtr(Gep, M);

  return Changed;
}

// Instruments a single load, store, atomic RMW, or cmpxchg instruction.
// Atomic operations are treated as stores. Accesses whose size has no
// dedicated callout go to the variable-size unaligned callout; all others
// first try the inlined fastpath (if enabled) and otherwise call the aligned
// or unaligned callout for their size. Always returns true.
bool EfficiencySanitizer::instrumentLoadOrStore(Instruction *I,
                                                const DataLayout &DL) {
  IRBuilder<> IRB(I);

  // Atomics carry no explicit alignment here, so they start at 0 and pick up
  // the default below.
  bool IsStore = true;
  unsigned Alignment = 0;
  Value *Addr = nullptr;
  if (auto *Load = dyn_cast<LoadInst>(I)) {
    IsStore = false;
    Alignment = Load->getAlignment();
    Addr = Load->getPointerOperand();
  } else if (auto *Store = dyn_cast<StoreInst>(I)) {
    Alignment = Store->getAlignment();
    Addr = Store->getPointerOperand();
  } else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) {
    Addr = RMW->getPointerOperand();
  } else if (auto *Xchg = dyn_cast<AtomicCmpXchgInst>(I)) {
    Addr = Xchg->getPointerOperand();
  } else {
    llvm_unreachable("Unsupported mem access type");
  }

  Type *AccessTy = cast<PointerType>(Addr->getType())->getElementType();
  const uint32_t AccessBytes = DL.getTypeStoreSizeInBits(AccessTy) / 8;

  // Convert 0 to the default alignment.
  if (Alignment == 0)
    Alignment = DL.getPrefTypeAlignment(AccessTy);

  if (IsStore)
    ++NumInstrumentedStores;
  else
    ++NumInstrumentedLoads;

  const int SizeIdx = getMemoryAccessFuncIndex(Addr, DL);
  if (SizeIdx < 0) {
    // Irregular size: pass the size explicitly to the generic callout.
    Value *Callout = IsStore ? EsanUnalignedStoreN : EsanUnalignedLoadN;
    Value *BytePtr = IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy());
    IRB.CreateCall(Callout,
                   {BytePtr, ConstantInt::get(IntptrTy, AccessBytes)});
    return true;
  }

  if (ClInstrumentFastpath &&
      instrumentFastpath(I, DL, IsStore, Addr, Alignment)) {
    ++NumFastpaths;
    return true;
  }

  // Slowpath: call out to the per-size routine matching the alignment.
  const bool IsAligned = Alignment == 0 || (Alignment % AccessBytes) == 0;
  Value *Callout = nullptr;
  if (IsAligned)
    Callout = IsStore ? EsanAlignedStore[SizeIdx] : EsanAlignedLoad[SizeIdx];
  else
    Callout =
        IsStore ? EsanUnalignedStore[SizeIdx] : EsanUnalignedLoad[SizeIdx];
  IRB.CreateCall(Callout, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
  return true;
}

// It's simplest to replace the memset/memmove/memcpy intrinsics with
// calls that the runtime library intercepts.
// Our pass is late enough that calls should not turn back into intrinsics.
//
// The replacement call is inserted in front of MI with its operands cast to
// the library function's parameter types, and MI is then erased. Always
// returns true.
bool EfficiencySanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
  const bool IsMemSet = isa<MemSetInst>(MI);
  if (!IsMemSet && !isa<MemTransferInst>(MI))
    llvm_unreachable("Unsupported mem intrinsic type");

  IRBuilder<> IRB(MI);
  // Operand 0 is the destination for all three intrinsics.
  Value *Dest =
      IRB.CreatePointerCast(MI->getArgOperand(0), IRB.getInt8PtrTy());
  // Operand 1 is the fill value for memset and the source pointer otherwise.
  Value *Second =
      IsMemSet
          ? IRB.CreateIntCast(MI->getArgOperand(1), IRB.getInt32Ty(), false)
          : IRB.CreatePointerCast(MI->getArgOperand(1), IRB.getInt8PtrTy());
  // Operand 2 is the length.
  Value *Length = IRB.CreateIntCast(MI->getArgOperand(2), IntptrTy, false);

  Function *Callee = MemsetFn;
  if (!IsMemSet)
    Callee = isa<MemCpyInst>(MI) ? MemcpyFn : MemmoveFn;

  IRB.CreateCall(Callee, {Dest, Second, Length});
  MI->eraseFromParent();
  return true;
}

// Inserts struct counter updates for a GEP instruction (cache-fragmentation
// tool). Two kinds of accesses are counted:
//   - stepping through an array of structs (the struct's array counter), and
//   - selecting a field of a struct (that field's counter).
// GEPs with a single index and GEPs touching no tracked struct are ignored.
// Returns true if at least one counter update was inserted.
bool EfficiencySanitizer::instrumentGetElementPtr(Instruction *I, Module &M) {
  GetElementPtrInst *GepInst = dyn_cast<GetElementPtrInst>(I);
  if (GepInst == nullptr || GepInst->getNumIndices() == 1) {
    ++NumIgnoredGEPs;
    return false;
  }
  bool Res = false;
  Type *SourceTy = GepInst->getSourceElementType();
  // Check if GEP calculates address from a struct array: the source type is
  // a struct and the first index is not the constant 0.
  if (auto *SourceStructTy = dyn_cast<StructType>(SourceTy)) {
    auto *FirstIdx = dyn_cast<ConstantInt>(GepInst->getOperand(1));
    if ((FirstIdx == nullptr || FirstIdx->getSExtValue() != 0) &&
        !shouldIgnoreStructType(SourceStructTy) &&
        StructTyMap.count(SourceStructTy) != 0)
      Res |= insertCounterUpdate(I, SourceStructTy,
                                 getArrayCounterIdx(SourceStructTy));
  }
  // Iterate all (except the first and the last) idx within each GEP instruction
  // for possible nested struct field address calculation.
  for (unsigned i = 1; i < GepInst->getNumIndices(); ++i) {
    SmallVector<Value *, 8> IdxVec(GepInst->idx_begin(),
                                   GepInst->idx_begin() + i);
    Type *Ty = GetElementPtrInst::getIndexedType(SourceTy, IdxVec);
    // Scoped to the iteration so that a struct found at an earlier index can
    // never be charged for this one.
    StructType *StructTy = nullptr;
    unsigned CounterIdx = 0;
    if (auto *ArrayTy = dyn_cast<ArrayType>(Ty)) {
      StructTy = dyn_cast<StructType>(ArrayTy->getElementType());
      if (shouldIgnoreStructType(StructTy) || StructTyMap.count(StructTy) == 0)
        continue;
      // The last counter for struct array access.
      CounterIdx = getArrayCounterIdx(StructTy);
    } else if ((StructTy = dyn_cast<StructType>(Ty))) {
      if (shouldIgnoreStructType(StructTy) || StructTyMap.count(StructTy) == 0)
        continue;
      // Get the StructTy's subfield index. It is a plain ConstantInt for
      // scalar GEPs; skip anything else (e.g., a vector index) instead of
      // asserting.
      auto *FieldIdx = dyn_cast<ConstantInt>(GepInst->getOperand(i + 1));
      if (FieldIdx == nullptr)
        continue;
      assert(FieldIdx->getSExtValue() >= 0 &&
             FieldIdx->getSExtValue() < StructTy->getNumElements());
      CounterIdx = getFieldCounterIdx(StructTy) + FieldIdx->getSExtValue();
    } else {
      // Neither an array nor a struct (e.g., a vector): there is no struct
      // counter to update for this index.
      continue;
    }
    Res |= insertCounterUpdate(I, StructTy, CounterIdx);
  }
  if (Res)
    ++NumInstrumentedGEPs;
  else
    ++NumIgnoredGEPs;
  return Res;
}

// Emits, right before instruction I, a plain (non-atomic) load/add/store
// sequence that increments by one the 64-bit counter at index CounterIdx of
// the counter array associated with StructTy in StructTyMap.
//
// Returns true if the increment was emitted.  Returns false, emitting
// nothing, when StructTyMap holds no (or a null) counter array for StructTy.
bool EfficiencySanitizer::insertCounterUpdate(Instruction *I,
                                              StructType *StructTy,
                                              unsigned CounterIdx) {
  // Use find() rather than operator[]: operator[] would default-insert a
  // null entry for a type that is not tracked, growing the map and making
  // later StructTyMap.count(StructTy) checks report the type as present.
  auto Entry = StructTyMap.find(StructTy);
  if (Entry == StructTyMap.end() || Entry->second == nullptr)
    return false;
  GlobalVariable *CounterArray = Entry->second;
  IRBuilder<> IRB(I);
  Constant *Indices[2];
  // Xref http://llvm.org/docs/LangRef.html#i-getelementptr and
  // http://llvm.org/docs/GetElementPtr.html.
  // The first index of the GEP instruction steps through the first operand,
  // i.e., the array itself.
  Indices[0] = ConstantInt::get(IRB.getInt32Ty(), 0);
  // The second index is the index within the array.
  Indices[1] = ConstantInt::get(IRB.getInt32Ty(), CounterIdx);
  // Constant address of the selected counter slot inside the counter array.
  Constant *Counter =
    ConstantExpr::getGetElementPtr(
        ArrayType::get(IRB.getInt64Ty(), getStructCounterSize(StructTy)),
        CounterArray, Indices);
  // Counter = Counter + 1.
  Value *Load = IRB.CreateLoad(Counter);
  IRB.CreateStore(IRB.CreateAdd(Load, ConstantInt::get(IRB.getInt64Ty(), 1)),
                  Counter);
  return true;
}

// Maps the pointee type of Addr to an access-size index.
// Returns log2 of the store size in bytes for sizes of 1, 2, 4, 8 and 16
// bytes.  Any other size is counted in NumAccessesWithIrregularSize and
// yields -1.
int EfficiencySanitizer::getMemoryAccessFuncIndex(Value *Addr,
                                                  const DataLayout &DL) {
  Type *AccessTy = cast<PointerType>(Addr->getType())->getElementType();
  assert(AccessTy->isSized());
  // The size in bits is always a multiple of 8, so this division is exact.
  uint32_t NumBytes = DL.getTypeStoreSizeInBits(AccessTy) / 8;
  switch (NumBytes) {
  case 1:
  case 2:
  case 4:
  case 8:
  case 16:
    // A regular size: fall out of the switch and compute its index.
    break;
  default:
    // Irregular sizes do not have per-size call targets.
    ++NumAccessesWithIrregularSize;
    return -1;
  }
  // Every size accepted above is a power of two, so the number of trailing
  // zero bits equals log2(NumBytes).
  size_t SizeIdx = countTrailingZeros(NumBytes);
  assert(SizeIdx < NumberOfAccessSizes);
  return SizeIdx;
}

// Dispatches fastpath instrumentation of the memory access I to the handler
// of the currently selected tool and forwards that handler's result.
// Returns false when the selected tool has no fastpath handler here.
// IsStore is accepted but not consulted by this dispatcher.
bool EfficiencySanitizer::instrumentFastpath(Instruction *I,
                                             const DataLayout &DL, bool IsStore,
                                             Value *Addr, unsigned Alignment) {
  switch (Options.ToolType) {
  case EfficiencySanitizerOptions::ESAN_CacheFrag:
    return instrumentFastpathCacheFrag(I, DL, Addr, Alignment);
  case EfficiencySanitizerOptions::ESAN_WorkingSet:
    return instrumentFastpathWorkingSet(I, DL, Addr, Alignment);
  default:
    return false;
  }
}

// Fastpath handler for the cache fragmentation tool.
// Emits no instrumentation for the access and ignores all of its arguments.
// It unconditionally reports success so that the caller does not fall back
// to slowpath instrumentation for this access.
bool EfficiencySanitizer::instrumentFastpathCacheFrag(Instruction *I,
                                                      const DataLayout &DL,
                                                      Value *Addr,
                                                      unsigned Alignment) {
  // Do nothing.
  return true; // Return true to avoid slowpath instrumentation.
}

// Fastpath handler for the working set tool.
// Inlines, before the access I, code that marks the shadow byte of the cache
// line containing Addr.  Returns true when the inline sequence was emitted,
// or false (emitting nothing) when the access might span two cache lines and
// ClAssumeIntraCacheLine is not set, so the caller must use the slowpath.
bool EfficiencySanitizer::instrumentFastpathWorkingSet(
    Instruction *I, const DataLayout &DL, Value *Addr, unsigned Alignment) {
  // The inlined sequence below is written for this particular shadow scale.
  assert(ShadowScale[Options.ToolType] == 6);
  IRBuilder<> IRB(I);
  Type *AccessTy = cast<PointerType>(Addr->getType())->getElementType();
  const uint32_t AccessBits = DL.getTypeStoreSizeInBits(AccessTy);
  // getMemoryAccessFuncIndex has already ruled out a size larger than 16
  // bytes, and thus larger than a cache line for platforms this tool targets
  // (and our shadow memory setup assumes 64-byte cache lines).
  assert(AccessBits <= 128);
  // A single-byte access, or one aligned to its own size, is guaranteed to
  // stay inside one cache line.  Anything else might touch two lines.
  const bool MaySpanCacheLines =
      AccessBits != 8 && (Alignment % (AccessBits / 8)) != 0;
  if (MaySpanCacheLines) {
    // Bail to the slowpath unless the user asked us to assume otherwise.
    if (!ClAssumeIntraCacheLine)
      return false;
    ++NumAssumedIntraCacheLine;
  }

  // From here on the access is treated as touching exactly one cache line,
  // so only a single shadow byte needs updating.  Races on shadow values are
  // acceptable in our shadow memory model.  The emitted code is:
  //
  //   const char BitMask = 0x81;
  //   char *ShadowAddr = appToShadow(AppAddr);
  //   if ((*ShadowAddr & BitMask) != BitMask)
  //     *ShadowAddr |= BitMask;
  //
  Value *AppAddrInt = IRB.CreatePointerCast(Addr, IntptrTy);
  Value *ShadowAddrInt = appToShadow(AppAddrInt, IRB);
  Type *ShadowByteTy = IntegerType::get(*Ctx, 8U);
  Type *ShadowBytePtrTy = PointerType::get(ShadowByteTy, 0);
  // Bit 0 tracks the working set of the current sampling period and bit 7
  // tracks the total working set.  Both are set on every access unless they
  // are already set.
  Value *BitMask = ConstantInt::get(ShadowByteTy, 0x81); // 10000001B

  Value *ShadowLoadPtr = IRB.CreateIntToPtr(ShadowAddrInt, ShadowBytePtrTy);
  Value *OldShadow = IRB.CreateLoad(ShadowLoadPtr);
  // The AND and CMP will be turned into a TEST instruction by the compiler.
  Value *MaskedShadow = IRB.CreateAnd(OldShadow, BitMask);
  Value *NeedsUpdate = IRB.CreateICmpNE(MaskedShadow, BitMask);
  TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(NeedsUpdate, I, false);
  // FIXME: do I need to call SetCurrentDebugLocation?
  IRB.SetInsertPoint(ThenTerm);
  // OR the mask in rather than overwriting the byte: the middle 6 bits are
  // used by the runtime library and must not be corrupted.
  Value *NewShadow = IRB.CreateOr(OldShadow, BitMask);
  Value *ShadowStorePtr = IRB.CreateIntToPtr(ShadowAddrInt, ShadowBytePtrTy);
  IRB.CreateStore(NewShadow, ShadowStorePtr);
  // Restore the insertion point to the original access.
  IRB.SetInsertPoint(I);

  return true;
}