#include "rsCpuScriptGroup2.h"

#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include <set>
#include <sstream>
#include <string>
#include <vector>

#ifndef RS_COMPATIBILITY_LIB
#include "bcc/Config.h"
#endif

#include "cpu_ref/rsCpuCore.h"
#include "rsClosure.h"
#include "rsContext.h"
#include "rsCpuCore.h"
#include "rsCpuExecutable.h"
#include "rsCpuScript.h"
#include "rsScript.h"
#include "rsScriptGroup2.h"
#include "rsScriptIntrinsic.h"

using std::string;
using std::vector;

namespace android {
namespace renderscript {

namespace {

const size_t DefaultKernelArgCount = 2;

// Fallback kernel used when a batch could not be fused into a single compiled
// function: runs every closure of the batch, one after another, over the cell
// range [xstart, xend) of the current row.
//
// kinfo->usr must point at the batch's List<CPUClosure*>. The driver info is
// modified in place for each closure (input pointers/strides, input count,
// output pointer, usr); the input count, input strides and usr are put back
// before returning.
void groupRoot(const RsExpandKernelDriverInfo *kinfo, uint32_t xstart,
               uint32_t xend, uint32_t outstep) {
    const List<CPUClosure*>& batchClosures = *(List<CPUClosure*>*)kinfo->usr;
    RsExpandKernelDriverInfo *info = const_cast<RsExpandKernelDriverInfo *>(kinfo);

    // Remember the fields that are overwritten per closure and restored below.
    const size_t savedInLen = info->inLen;
    decltype(info->inStride) savedInStride;
    memcpy(&savedInStride, &info->inStride, sizeof(savedInStride));

    // Address of cell xstart of the current row in LOD 0 of an allocation.
    const auto firstCell = [kinfo, xstart](const Allocation* alloc,
                                           const uint32_t elemBytes) -> const uint8_t* {
        const uint8_t* cell =
                (uint8_t*)(alloc->mHal.drvState.lod[0].mallocPtr) + elemBytes * xstart;
        if (kinfo->dim.y > 1) {
            cell += alloc->mHal.drvState.lod[0].stride * kinfo->current.y;
        }
        return cell;
    };

    for (CPUClosure* cpuClosure : batchClosures) {
        const Closure* closure = cpuClosure->mClosure;

        // The per-input arrays in the driver info must be able to hold every
        // argument of this closure.
        rsAssert(closure->mNumArg <= RS_KERNEL_INPUT_LIMIT);

        for (size_t argIdx = 0; argIdx < closure->mNumArg; argIdx++) {
            const Allocation* input = (const Allocation*)closure->mArgs[argIdx];
            const uint32_t inputElemBytes = input->mHal.state.elementSizeBytes;
            info->inPtr[argIdx] = firstCell(input, inputElemBytes);
            info->inStride[argIdx] = inputElemBytes;
        }
        info->inLen = closure->mNumArg;

        const Allocation* output = closure->mReturnValue;
        const uint32_t ostep = output->mHal.state.elementSizeBytes;
        info->outPtr[0] = const_cast<uint8_t*>(firstCell(output, ostep));

        // An intrinsic's implementation expects kinfo->usr to be its "this"
        // pointer (an RsdCpuScriptIntrinsic object).
        info->usr = cpuClosure->mSi;

        cpuClosure->mFunc(kinfo, xstart, xend, ostep);
    }

    info->inLen = savedInLen;
    info->usr = &batchClosures;
    memcpy(&info->inStride, &savedInStride, sizeof(savedInStride));
}

}  // namespace

// Creates an empty batch owned by `group`; `name` is copied.
Batch::Batch(CpuScriptGroup2Impl* group, const char* name) :
    mGroup(group), mFunc(nullptr) {
    mName = strndup(name, strlen(name));
}

// Deletes every CPUClosure held by the batch and frees the copied name.
Batch::~Batch() {
    for (CPUClosure* owned : mClosures) {
        delete owned;
    }
    free(mName);
}

// Returns true when `cpuClosure` must not be appended to this batch and a new
// batch has to be started for it.
bool Batch::conflict(CPUClosure* cpuClosure) const {
    // Anything may start an empty batch.
    if (mClosures.empty()) {
        return false;
    }

    const Closure* candidate = cpuClosure->mClosure;

    // An invoke has to sit in a batch by itself, so it conflicts with any
    // other closure (and any closure conflicts with a batched invoke).
    const bool bothKernels =
            candidate->mIsKernel && mClosures.front()->mClosure->mIsKernel;
    if (!bothKernels) {
        return true;
    }

    const auto& globalDeps = candidate->mGlobalDeps;
    const auto& argDeps = candidate->mArgDeps;

    for (CPUClosure* member : mClosures) {
        const Closure* batched = member->mClosure;

        // Depending on a global set by a batched closure is a conflict.
        if (globalDeps.find(batched) != globalDeps.end()) {
            return true;
        }

        const auto& depIt = argDeps.find(batched);
        if (depIt == argDeps.end()) {
            continue;
        }

        // An argument fed from a field (non-null field ID) of a batched
        // closure is a conflict.
        const auto& depArgs = (*depIt).second;
        for (const auto& dep : *depArgs) {
            if (dep.second.get() != nullptr) {
                return true;
            }
        }
    }

    // The compiler fusion pass in bcc expects kernels to be chained up through
    // the (1st) input and the output.
    const Closure* lastBatched = mClosures.back()->mClosure;
    const auto& lastIt = argDeps.find(lastBatched);
    if (lastIt == argDeps.end()) {
        return true;
    }

    const auto& lastArgs = (*lastIt).second;
    for (const auto& dep : *lastArgs) {
        const bool firstArgFromReturnValue =
                dep.first == 0 && dep.second.get() == nullptr;
        if (firstArgFromReturnValue) {
            // The new closure takes the last batched closure's return value
            // (null field ID) as its first argument (argument 0).
            return false;
        }
    }

    return true;
}

// Splits the group's closures into batches ("Batch0", "Batch1", ...), then,
// outside the compatibility library, tries to compile the group and resolve
// one function pointer per batch from the resulting shared object. The CPU
// reference implementation's mutex is held for the whole construction.
CpuScriptGroup2Impl::CpuScriptGroup2Impl(RsdCpuReferenceImpl *cpuRefImpl,
                                         const ScriptGroupBase *sg) :
    mCpuRefImpl(cpuRefImpl), mGroup((const ScriptGroup2*)(sg)),
    mExecutable(nullptr), mScriptObj(nullptr) {
    rsAssert(!mGroup->mClosures.empty());

    mCpuRefImpl->lockMutex();

    Batch* current = new Batch(this, "Batch0");
    int batchIndex = 0;

    for (Closure* closure : mGroup->mClosures) {
        const IDBase* funcID = closure->mFunctionID.get();
        RsdCpuScriptImpl* scriptImpl =
                (RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(funcID->mScript);

        CPUClosure* cpuClosure;
        if (closure->mIsKernel) {
            // Kernel closures carry the expanded kernel function pointer.
            MTLaunchStructForEach mtls;
            scriptImpl->forEachKernelSetup(funcID->mSlot, &mtls);
            cpuClosure = new CPUClosure(closure, scriptImpl, (ExpandFuncTy)mtls.kernel);
        } else {
            cpuClosure = new CPUClosure(closure, scriptImpl);
        }

        if (current->conflict(cpuClosure)) {
            // Seal the current batch and open the next one.
            mBatches.push_back(current);
            std::stringstream nameStream;
            nameStream << "Batch" << ++batchIndex;
            std::string nextName(nameStream.str());
            current = new Batch(this, nextName.c_str());
        }

        current->mClosures.push_back(cpuClosure);
    }

    rsAssert(!current->mClosures.empty());
    mBatches.push_back(current);

#ifndef RS_COMPATIBILITY_LIB
    compile(mGroup->mCacheDir);
    if (mScriptObj != nullptr && mExecutable != nullptr) {
        for (Batch* b : mBatches) {
            b->resolveFuncPtr(mScriptObj);
        }
    }
#endif  // RS_COMPATIBILITY_LIB

    mCpuRefImpl->unlockMutex();
}

// Looks up this batch's symbol in `sharedObj`: the batch name, with ".expand"
// appended when the batch holds kernels.
void Batch::resolveFuncPtr(void* sharedObj) {
    std::string symbol(mName);
    if (mClosures.front()->mClosure->mIsKernel) {
        symbol.append(".expand");
    }
    mFunc = dlsym(sharedObj, symbol.c_str());
    rsAssert (mFunc != nullptr);
}

// Deletes all batches and the executable, then closes the shared object if
// one was loaded.
CpuScriptGroup2Impl::~CpuScriptGroup2Impl() {
    for (Batch* b : mBatches) {
        delete b;
    }
    delete mExecutable;
    // TODO: move this dlclose into ~ScriptExecutable().
    if (mScriptObj != nullptr) {
        dlclose(mScriptObj);
    }
}

namespace {

#ifndef RS_COMPATIBILITY_LIB

// Returns the path of the core bitcode library to link against and stores the
// path of the relaxed-precision variant (or "") in *coreLibRelaxedPath.
string getCoreLibPath(Context* context, string* coreLibRelaxedPath) {
    *coreLibRelaxedPath = "";

    // A debug context always uses the debug library.
    if (context->getContextType() == RS_CONTEXT_TYPE_DEBUG) {
        return SYSLIBPATH_BC"/libclcore_debug.bc";
    }

    // Otherwise look for a platform specific library.

#if defined(ARCH_ARM_HAVE_NEON) && !defined(DISABLE_CLCORE_NEON)
    // NEON-capable ARMv7a devices can use an accelerated math library
    // for all reduced precision scripts.
    // ARMv8 does not use NEON, as ASIMD can be used with all precision
    // levels.
    *coreLibRelaxedPath = SYSLIBPATH_BC"/libclcore_neon.bc";
#endif

#if defined(__i386__) || defined(__x86_64__)
    // x86 devices will use an optimized library.
    return SYSLIBPATH_BC"/libclcore_x86.bc";
#else
    return SYSLIBPATH_BC"/libclcore.bc";
#endif
}

// Appends the full bcc command line to *args. The output file name is always
// the final element so that a caller can pop it and substitute another name.
void setupCompileArguments(
        const vector<const char*>& inputs, const vector<string>& kernelBatches,
        const vector<string>& invokeBatches,
        const char* outputDir, const char* outputFileName,
        const char* coreLibPath, const char* coreLibRelaxedPath,
        const bool emitGlobalInfo, const bool emitGlobalInfoSkipConstant,
        int optLevel, vector<const char*>* args) {
    args->push_back(RsdCpuScriptImpl::BCC_EXE_PATH);
    args->push_back("-fPIC");
    args->push_back("-embedRSInfo");

    if (emitGlobalInfo) {
        args->push_back("-rs-global-info");
        if (emitGlobalInfoSkipConstant) {
            args->push_back("-rs-global-info-skip-constant");
        }
    }

    args->push_back("-mtriple");
    args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
    args->push_back("-bclib");
    args->push_back(coreLibPath);
    args->push_back("-bclib_relaxed");
    args->push_back(coreLibRelaxedPath);

    for (const char* bitcodeFile : inputs) {
        args->push_back(bitcodeFile);
    }

    for (const string& kernelBatch : kernelBatches) {
        args->push_back("-merge");
        args->push_back(kernelBatch.c_str());
    }

    for (const string& invokeBatch : invokeBatches) {
        args->push_back("-invoke");
        args->push_back(invokeBatch.c_str());
    }

    args->push_back("-output_path");
    args->push_back(outputDir);

    // Only levels 0 and 3 are expected; anything else is reported and treated
    // as 3.
    args->push_back("-O");
    if (optLevel == 0) {
        args->push_back("0");
    } else {
        if (optLevel != 3) {
            ALOGW("Expected optimization level of 0 or 3. Received %d", optLevel);
        }
        args->push_back("3");
    }

    // The output filename has to be the last, in case we need to pop it out
    // and replace with a different name.
    args->push_back("-o");
    args->push_back(outputFileName);
}

// Writes "<index>,<slot>." to `ss`, where <index> is the position in `inputs`
// of the bitcode file of the closure's script and <slot> is the slot of the
// closure's function ID. The script must not be an intrinsic.
void generateSourceSlot(RsdCpuReferenceImpl* ctxt,
                        const Closure& closure,
                        const std::vector<const char*>& inputs,
                        std::stringstream& ss) {
    const IDBase* funcID = (const IDBase*)closure.mFunctionID.get();
    const Script* script = funcID->mScript;

    rsAssert (!script->isIntrinsic());

    const RsdCpuScriptImpl *cpuScript =
            (const RsdCpuScriptImpl *)ctxt->lookupScript(script);
    const string& bitcodePath = cpuScript->getBitcodeFilePath();

    const int inputIndex =
            find(inputs.begin(), inputs.end(), bitcodePath) - inputs.begin();

    ss << inputIndex << "," << funcID->mSlot << ".";
}

#endif  // RS_COMPATIBILITY_LIB

}  // anonymous namespace

// This function is used by the debugger to inspect ScriptGroup
// compilations.
//
// "__attribute__((noinline))" and "__asm__" are used to prevent the
// function call from being eliminated as a no-op (see the "noinline"
// attribute in gcc documentation).
//
// "__attribute__((weak))" is used to prevent callers from recognizing
// that this is guaranteed to be the function definition, recognizing
// that certain arguments are unused, and optimizing away the passing
// of those arguments (see the LLVM optimization
// DeadArgumentElimination).  Theoretically, the compiler could get
// aggressive enough with link-time optimization that even marking the
// entry point as a weak definition wouldn't solve the problem.
// extern __attribute__((noinline)) __attribute__((weak)) void debugHintScriptGroup2(const char* groupName, const uint32_t groupNameSize, const ExpandFuncTy* kernel, const uint32_t kernelCount) { ALOGV("group name: %d:%s\n", groupNameSize, groupName); for (uint32_t i=0; i < kernelCount; ++i) { const char* f1 = (const char*)(kernel[i]); __asm__ __volatile__(""); ALOGV(" closure: %p\n", (const void*)f1); } // do nothing, this is just a hook point for the debugger. return; } void CpuScriptGroup2Impl::compile(const char* cacheDir) { #ifndef RS_COMPATIBILITY_LIB if (mGroup->mClosures.size() < 2) { return; } const int optLevel = getCpuRefImpl()->getContext()->getOptLevel(); if (optLevel == 0) { std::vector<ExpandFuncTy> kernels; for (const Batch* b : mBatches) for (const CPUClosure* c : b->mClosures) kernels.push_back(c->mFunc); if (kernels.size()) { // pass this information on to the debugger via a hint function. debugHintScriptGroup2(mGroup->mName, strlen(mGroup->mName), kernels.data(), kernels.size()); } // skip script group compilation forcing the driver to use the fallback // execution path which currently has better support for debugging. return; } auto comparator = [](const char* str1, const char* str2) -> bool { return strcmp(str1, str2) < 0; }; std::set<const char*, decltype(comparator)> inputSet(comparator); for (Closure* closure : mGroup->mClosures) { const Script* script = closure->mFunctionID.get()->mScript; // If any script is an intrinsic, give up trying fusing the kernels. 
if (script->isIntrinsic()) { return; } const RsdCpuScriptImpl *cpuScript = (const RsdCpuScriptImpl *)mCpuRefImpl->lookupScript(script); const char* bitcodeFilename = cpuScript->getBitcodeFilePath(); inputSet.insert(bitcodeFilename); } std::vector<const char*> inputs(inputSet.begin(), inputSet.end()); std::vector<string> kernelBatches; std::vector<string> invokeBatches; int i = 0; for (const auto& batch : mBatches) { rsAssert(batch->size() > 0); std::stringstream ss; ss << batch->mName << ":"; if (!batch->mClosures.front()->mClosure->mIsKernel) { rsAssert(batch->size() == 1); generateSourceSlot(mCpuRefImpl, *batch->mClosures.front()->mClosure, inputs, ss); invokeBatches.push_back(ss.str()); } else { for (const auto& cpuClosure : batch->mClosures) { generateSourceSlot(mCpuRefImpl, *cpuClosure->mClosure, inputs, ss); } kernelBatches.push_back(ss.str()); } } rsAssert(cacheDir != nullptr); string objFilePath(cacheDir); objFilePath.append("/"); objFilePath.append(mGroup->mName); objFilePath.append(".o"); const char* resName = mGroup->mName; string coreLibRelaxedPath; const string& coreLibPath = getCoreLibPath(getCpuRefImpl()->getContext(), &coreLibRelaxedPath); vector<const char*> arguments; bool emitGlobalInfo = getCpuRefImpl()->getEmbedGlobalInfo(); bool emitGlobalInfoSkipConstant = getCpuRefImpl()->getEmbedGlobalInfoSkipConstant(); setupCompileArguments(inputs, kernelBatches, invokeBatches, cacheDir, resName, coreLibPath.c_str(), coreLibRelaxedPath.c_str(), emitGlobalInfo, emitGlobalInfoSkipConstant, optLevel, &arguments); std::unique_ptr<const char> cmdLine(rsuJoinStrings(arguments.size() - 1, arguments.data())); inputs.push_back(coreLibPath.c_str()); inputs.push_back(coreLibRelaxedPath.c_str()); uint32_t checksum = constructBuildChecksum(nullptr, 0, cmdLine.get(), inputs.data(), inputs.size()); if (checksum == 0) { return; } std::stringstream ss; ss << std::hex << checksum; std::string checksumStr(ss.str()); 
//===--------------------------------------------------------------------===// // Try to load a shared lib from code cache matching filename and checksum //===--------------------------------------------------------------------===// bool alreadyLoaded = false; std::string cloneName; const bool useRSDebugContext = (mCpuRefImpl->getContext()->getContextType() == RS_CONTEXT_TYPE_DEBUG); const bool reuse = !is_force_recompile() && !useRSDebugContext; if (reuse) { mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName, nullptr, &alreadyLoaded); } if (mScriptObj != nullptr) { // A shared library named resName is found in code cache directory // cacheDir, and loaded with the handle stored in mScriptObj. mExecutable = ScriptExecutable::createFromSharedObject( mScriptObj, checksum); if (mExecutable != nullptr) { // The loaded shared library in mScriptObj has a matching checksum. // An executable object has been created. return; } ALOGV("Failed to create an executable object from so file due to " "mismatching checksum"); if (alreadyLoaded) { // The shared object found in code cache has already been loaded. // A different file name is needed for the new shared library, to // avoid corrupting the currently loaded instance. cloneName.append(resName); cloneName.append("#"); cloneName.append(SharedLibraryUtils::getRandomString(6).c_str()); // The last element in arguments is the output filename. 
arguments.pop_back(); arguments.push_back(cloneName.c_str()); } dlclose(mScriptObj); mScriptObj = nullptr; } //===--------------------------------------------------------------------===// // Fuse the input kernels and generate native code in an object file //===--------------------------------------------------------------------===// arguments.push_back("-build-checksum"); arguments.push_back(checksumStr.c_str()); arguments.push_back(nullptr); bool compiled = rsuExecuteCommand(RsdCpuScriptImpl::BCC_EXE_PATH, arguments.size()-1, arguments.data()); if (!compiled) { return; } //===--------------------------------------------------------------------===// // Create and load the shared lib //===--------------------------------------------------------------------===// std::string SOPath; if (!SharedLibraryUtils::createSharedLibrary( getCpuRefImpl()->getContext()->getDriverName(), cacheDir, resName, reuse, &SOPath)) { ALOGE("Failed to link object file '%s'", resName); unlink(objFilePath.c_str()); return; } unlink(objFilePath.c_str()); if (reuse) { mScriptObj = SharedLibraryUtils::loadSharedLibrary(cacheDir, resName); } else { mScriptObj = SharedLibraryUtils::loadAndDeleteSharedLibrary(SOPath.c_str()); } if (mScriptObj == nullptr) { ALOGE("Unable to load '%s'", resName); return; } if (alreadyLoaded) { // Delete the temporary, random-named file that we created to avoid // interfering with an already loaded shared library. 
string cloneFilePath(cacheDir); cloneFilePath.append("/"); cloneFilePath.append(cloneName.c_str()); cloneFilePath.append(".so"); unlink(cloneFilePath.c_str()); } mExecutable = ScriptExecutable::createFromSharedObject(mScriptObj); #endif // RS_COMPATIBILITY_LIB } void CpuScriptGroup2Impl::execute() { for (auto batch : mBatches) { batch->setGlobalsForBatch(); batch->run(); } } void Batch::setGlobalsForBatch() { for (CPUClosure* cpuClosure : mClosures) { const Closure* closure = cpuClosure->mClosure; const IDBase* funcID = closure->mFunctionID.get(); Script* s = funcID->mScript;; for (const auto& p : closure->mGlobals) { const int64_t value = p.second.first; int size = p.second.second; if (value == 0 && size == 0) { // This indicates the current closure depends on another closure for a // global in their shared module (script). In this case we don't need to // copy the value. For example, an invoke intializes a global variable // which a kernel later reads. continue; } rsAssert(p.first != nullptr); Script* script = p.first->mScript; rsAssert(script == s); RsdCpuReferenceImpl* ctxt = mGroup->getCpuRefImpl(); const RsdCpuScriptImpl *cpuScript = (const RsdCpuScriptImpl *)ctxt->lookupScript(script); int slot = p.first->mSlot; ScriptExecutable* exec = mGroup->getExecutable(); if (exec != nullptr) { const char* varName = cpuScript->getFieldName(slot); void* addr = exec->getFieldAddress(varName); if (size < 0) { rsrSetObject(mGroup->getCpuRefImpl()->getContext(), (rs_object_base*)addr, (ObjectBase*)value); } else { memcpy(addr, (const void*)&value, size); } } else { // We use -1 size to indicate an ObjectBase rather than a primitive type if (size < 0) { s->setVarObj(slot, (ObjectBase*)value); } else { s->setVar(slot, (const void*)&value, size); } } } } } void Batch::run() { if (!mClosures.front()->mClosure->mIsKernel) { rsAssert(mClosures.size() == 1); // This batch contains a single closure for an invoke function CPUClosure* cc = mClosures.front(); const Closure* c = 
cc->mClosure; if (mFunc != nullptr) { // TODO: Need align pointers for x86_64. // See RsdCpuScriptImpl::invokeFunction in rsCpuScript.cpp ((InvokeFuncTy)mFunc)(c->mParams, c->mParamLength); } else { const ScriptInvokeID* invokeID = (const ScriptInvokeID*)c->mFunctionID.get(); rsAssert(invokeID != nullptr); cc->mSi->invokeFunction(invokeID->mSlot, c->mParams, c->mParamLength); } return; } if (mFunc != nullptr) { MTLaunchStructForEach mtls; const CPUClosure* firstCpuClosure = mClosures.front(); const CPUClosure* lastCpuClosure = mClosures.back(); firstCpuClosure->mSi->forEachMtlsSetup( (const Allocation**)firstCpuClosure->mClosure->mArgs, firstCpuClosure->mClosure->mNumArg, lastCpuClosure->mClosure->mReturnValue, nullptr, 0, nullptr, &mtls); mtls.script = nullptr; mtls.fep.usr = nullptr; mtls.kernel = (ForEachFunc_t)mFunc; mGroup->getCpuRefImpl()->launchForEach( (const Allocation**)firstCpuClosure->mClosure->mArgs, firstCpuClosure->mClosure->mNumArg, lastCpuClosure->mClosure->mReturnValue, nullptr, &mtls); return; } for (CPUClosure* cpuClosure : mClosures) { const Closure* closure = cpuClosure->mClosure; const ScriptKernelID* kernelID = (const ScriptKernelID*)closure->mFunctionID.get(); cpuClosure->mSi->preLaunch(kernelID->mSlot, (const Allocation**)closure->mArgs, closure->mNumArg, closure->mReturnValue, nullptr, 0, nullptr); } const CPUClosure* cpuClosure = mClosures.front(); const Closure* closure = cpuClosure->mClosure; MTLaunchStructForEach mtls; if (cpuClosure->mSi->forEachMtlsSetup((const Allocation**)closure->mArgs, closure->mNumArg, closure->mReturnValue, nullptr, 0, nullptr, &mtls)) { mtls.script = nullptr; mtls.kernel = &groupRoot; mtls.fep.usr = &mClosures; mGroup->getCpuRefImpl()->launchForEach(nullptr, 0, nullptr, nullptr, &mtls); } for (CPUClosure* cpuClosure : mClosures) { const Closure* closure = cpuClosure->mClosure; const ScriptKernelID* kernelID = (const ScriptKernelID*)closure->mFunctionID.get(); cpuClosure->mSi->postLaunch(kernelID->mSlot, 
(const Allocation**)closure->mArgs, closure->mNumArg, closure->mReturnValue, nullptr, 0, nullptr); } } } // namespace renderscript } // namespace android