/*
 * Copyright (C) 2017 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "ExecutionPlan"

#include "ExecutionPlan.h"

#include "BurstBuilder.h"
#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
#include "GraphDump.h"
#include "Manager.h"
#include "ModelBuilder.h"
#include "OperationsUtils.h"
#include "TokenHasher.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"

#include <cutils/native_handle.h>
#include <fcntl.h>
#include <openssl/sha.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <algorithm>
#include <functional>
#include <map>
#include <mutex>
#include <optional>
#include <queue>
#include <set>
#include <string>
#include <strstream>
#include <type_traits>
#include <unordered_set>
#include <utility>
#include <vector>

using HidlToken = hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;

namespace android {
namespace nn {

namespace {

// Opens cache file by filename and sets the handle to the opened fd. Returns false on fail. The
// handle is expected to come in as empty, and is only set to a fd when the function returns true.
// The file descriptor is always opened with both read and write permission.
bool createCacheHandle(const std::string& cache, bool createIfNotExist, hidl_handle* handle) {
    CHECK(handle->getNativeHandle() == nullptr);
    int fd = open(cache.c_str(), createIfNotExist ? (O_RDWR | O_CREAT) : O_RDWR,
                  S_IRUSR | S_IWUSR);
    NN_RET_CHECK_GE(fd, 0);
    native_handle_t* cacheNativeHandle = native_handle_create(1, 0);
    if (cacheNativeHandle == nullptr) {
        close(fd);
        return false;
    }
    cacheNativeHandle->data[0] = fd;
    handle->setTo(cacheNativeHandle, /*shouldOwn=*/true);
    return true;
}

// Opens a list of cache files and returns the handle vector. Returns empty vector on fail.
// The file descriptors are always opened with both read and write permission.
hidl_vec<hidl_handle> createCacheHandleVec(uint32_t numCacheFiles, const std::string& baseFileName,
                                           bool createIfNotExist) {
    CHECK(numCacheFiles <= static_cast<uint32_t>(Constant::MAX_NUMBER_OF_CACHE_FILES));
    hidl_vec<hidl_handle> handles(numCacheFiles);
    for (uint32_t i = 0; i < numCacheFiles; i++) {
        std::string filename = baseFileName + std::to_string(i);
        VLOG(COMPILATION) << "Cache " << i << ": " << filename;
        if (!createCacheHandle(filename, createIfNotExist, &handles[i])) {
            return hidl_vec<hidl_handle>();
        }
    }
    return handles;
}

// Maps token to cache file names and sets the handle vectors to the opened fds. Returns false on
// fail and leaves the vectors empty. Each vector is expected to come in as empty.
bool getCacheHandles(const std::string& cacheDir, const uint8_t* token,
                     const std::pair<uint32_t, uint32_t>& numCacheFiles, bool createIfNotExist,
                     hidl_vec<hidl_handle>* modelCache, hidl_vec<hidl_handle>* dataCache) {
    // The filename includes ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2 characters for token,
    // and 1 character for model/data cache identifier.
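    // Each token byte is written low nibble first: for example, byte 0x3A becomes 'K'
    // ('A' + 0xA) followed by 'D' ('A' + 0x3), so every token character is in 'A'..'P'.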
    std::string filename(ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2 + 1, '0');
    for (uint32_t i = 0; i < ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN; i++) {
        filename[i * 2] = 'A' + (token[i] & 0x0F);
        filename[i * 2 + 1] = 'A' + (token[i] >> 4);
    }
    CHECK(cacheDir.empty() || cacheDir.back() == '/');
    std::string cacheFileName = cacheDir + filename;

    // The identifier character sits after the cacheDir prefix and the token characters.
    cacheFileName[cacheDir.size() + ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2] = '1';
    *modelCache = createCacheHandleVec(numCacheFiles.first, cacheFileName, createIfNotExist);
    if (modelCache->size() != numCacheFiles.first) {
        return false;
    }
    cacheFileName[cacheDir.size() + ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN * 2] = '2';
    *dataCache = createCacheHandleVec(numCacheFiles.second, cacheFileName, createIfNotExist);
    if (dataCache->size() != numCacheFiles.second) {
        modelCache->resize(0);
        return false;
    }
    return true;
}

// Tries to compile directly from cache, returns false on fail.
bool compileFromCache(const std::shared_ptr<Device>& device, const std::string& cacheDir,
                      const uint8_t* token,
                      std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
    CHECK(token != nullptr && device != nullptr);
    VLOG(COMPILATION) << "compileFromCache";
    *preparedModel = nullptr;
    HidlToken cacheToken(token);
    hidl_vec<hidl_handle> modelCache, dataCache;
    NN_RET_CHECK(getCacheHandles(cacheDir, token, device->getNumberOfCacheFilesNeeded(),
                                 /*createIfNotExist=*/false, &modelCache, &dataCache));
    int ret = device->prepareModelFromCache(modelCache, dataCache, cacheToken, preparedModel);
    return ret == ANEURALNETWORKS_NO_ERROR;
}

int compileModelAndCache(const std::shared_ptr<Device>& device, const ModelBuilder* model,
                         int32_t executionPreference, const std::string& cacheDir,
                         const uint8_t* token,
                         std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
    CHECK(device != nullptr);
    *preparedModel = nullptr;
    uint8_t dummyToken[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN] = {0};
    HidlToken cacheToken(token == nullptr ? dummyToken : token);
    hidl_vec<hidl_handle> modelCache, dataCache;
    if (token == nullptr || !getCacheHandles(cacheDir, token, device->getNumberOfCacheFilesNeeded(),
                                             /*createIfNotExist=*/true, &modelCache, &dataCache)) {
        modelCache.resize(0);
        dataCache.resize(0);
    }
    Model hidlModel;
    model->setHidlModel(&hidlModel);
    return device->prepareModel(hidlModel, static_cast<ExecutionPreference>(executionPreference),
                                modelCache, dataCache, cacheToken, preparedModel);
}

// Compiles the model on device.
// If compilation caching is available, depending on ExecutionPlan::mState, the token may only have
// been initialized by the user provided token (SIMPLE body), or is already re-hashed by the
// operation indices to be executed (COMPOUND body). The token will be re-hashed further by the
// device name, device version string, and the execution preference in this function.
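// In effect, the cache token the driver sees is
//     finish(update(update(update(token, deviceName), deviceVersionString), executionPreference)),
// so two compilations can share cache files only when all of these inputs match.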
int compile(std::shared_ptr<Device> device, const ModelBuilder* model, int32_t executionPreference,
            const std::string& cacheDir, TokenHasher* token,
            std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
    CHECK(device != nullptr);
    const uint8_t* tokenData = nullptr;
    if (device->isCachingSupported() && token->ok() &&
        token->updateFromString(device->getName()) &&
        token->updateFromString(device->getVersionString()) &&
        token->update(&executionPreference, sizeof(executionPreference)) && token->finish()) {
        tokenData = token->getCacheToken();
    }
    if (tokenData != nullptr && compileFromCache(device, cacheDir, tokenData, preparedModel)) {
        return ANEURALNETWORKS_NO_ERROR;
    }
    return compileModelAndCache(device, model, executionPreference, cacheDir, tokenData,
                                preparedModel);
}

typedef std::function<void(uint32_t)> OperationReadyCallback;

int copyOperandExtraParams(ModelBuilder& model, uint32_t toOperandIndex,
                           const Operand& fromOperand) {
    if (fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL &&
        fromOperand.extraParams.getDiscriminator() ==
                Operand::ExtraParams::hidl_discriminator::channelQuant) {
        auto& fromChannelQuant = fromOperand.extraParams.channelQuant();
        ANeuralNetworksSymmPerChannelQuantParams toChannelQuant = {
                .channelDim = fromChannelQuant.channelDim,
                .scaleCount = static_cast<uint32_t>(fromChannelQuant.scales.size()),
                .scales = fromChannelQuant.scales.data(),
        };
        return model.setOperandSymmPerChannelQuantParams(toOperandIndex, toChannelQuant);
    } else if (isExtensionOperandType(fromOperand.type) &&
               fromOperand.extraParams.getDiscriminator() ==
                       Operand::ExtraParams::hidl_discriminator::extension) {
        hidl_vec<uint8_t> extensionData = fromOperand.extraParams.extension();
        return model.setOperandExtensionData(toOperandIndex, extensionData.data(),
                                             extensionData.size());
    } else if (fromOperand.extraParams.getDiscriminator() !=
                       Operand::ExtraParams::hidl_discriminator::none ||
               fromOperand.type == OperandType::TENSOR_QUANT8_SYMM_PER_CHANNEL) {
        LOG(ERROR) << "Type " << toString(fromOperand.type)
                   << " has an unexpected extraParams discriminator: "
                   << static_cast<int>(fromOperand.extraParams.getDiscriminator());
        return ANEURALNETWORKS_BAD_DATA;
    } else {
        return ANEURALNETWORKS_NO_ERROR;
    }
}

// This class tracks whether we know the value of an operand as operations
// are processed.
class OperandTracker {
   public:
    // Creates the tracker for this model. Figures out which operations can be
    // executed right away and calls cb for each of them.
    OperandTracker(const ModelBuilder* model, OperationReadyCallback cb);
    // Marks the specified operation as having been processed. The outputs
    // of the operation now being known, this may enable new operations to
    // run. Calls cb for each of them.
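    // Note that cb may be invoked several times per call: once for every
    // operation whose unknown-input count drops to zero as a result.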
    void markProcessed(uint32_t operationIndex, OperationReadyCallback cb);

   private:
    const ModelBuilder* mModel;
    std::multimap<uint32_t, uint32_t> mOperandToOperations;
    std::vector<uint32_t> mUnknownInputCount;  // For each operation
};

OperandTracker::OperandTracker(const ModelBuilder* model, OperationReadyCallback cb)
    : mModel(model) {
    const auto& operations = mModel->getOperations();
    mUnknownInputCount.resize(operations.size());
    for (uint32_t operationIndex = 0; operationIndex < operations.size(); operationIndex++) {
        const Operation& operation = operations[operationIndex];
        uint32_t count = 0;
        for (uint32_t operandIndex : operation.inputs) {
            auto lifetime = mModel->getOperand(operandIndex).lifetime;
            if (lifetime == OperandLifeTime::TEMPORARY_VARIABLE ||
                lifetime == OperandLifeTime::MODEL_OUTPUT) {
                count++;
                mOperandToOperations.insert(
                        std::pair<uint32_t, uint32_t>(operandIndex, operationIndex));
            }
        }
        if (count == 0) {
            cb(operationIndex);
        }
        mUnknownInputCount[operationIndex] = count;
    }
}

void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallback cb) {
    // Mark all its outputs as known.
    const Operation& operation = mModel->getOperations()[operationIndex];
    for (uint32_t operandIndex : operation.outputs) {
        auto range = mOperandToOperations.equal_range(operandIndex);
        for (auto i = range.first; i != range.second; i++) {
            uint32_t& count = mUnknownInputCount[i->second];
            if (--count == 0) {
                cb(i->second);
            }
        }
    }
}

}  // namespace

ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex,
                             std::shared_ptr<Device> device)
    : mPlan(plan), mIndex(stepIndex), mSubModel(), mDevice(device), mToken(plan->getCacheToken()) {}

// Adds an operand if it has not been added already.
// Sets the index in the submodel for the corresponding operand.
int ExecutionStep::addOperand(uint32_t fromOperandIndex, uint32_t* toOperandIndex,
                              const ModelBuilder& fromModel, OperandKind kind) {
    // Have we added this operand already?
    auto i = mOperandMap.find(fromOperandIndex);
    if (i != mOperandMap.end()) {
        nnAssert(kind == INPUT);
        *toOperandIndex = i->second;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // First time we add this operand.
    *toOperandIndex = mSubModel.operandCount();
    mOperandMap.insert(std::pair<uint32_t, uint32_t>(fromOperandIndex, *toOperandIndex));

    // Add the operand to the submodel.
    const Operand& operand = fromModel.getOperand(fromOperandIndex);
    ANeuralNetworksOperandType type = {
            .type = static_cast<int32_t>(operand.type),
            .dimensionCount = static_cast<uint32_t>(operand.dimensions.size()),
            .dimensions = operand.dimensions.size() > 0 ? operand.dimensions.data() : nullptr,
            .scale = operand.scale,
            .zeroPoint = operand.zeroPoint,
    };
    int n = mSubModel.addOperand(type);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Previous error occurred when partitioning the graph";
        return n;
    }
    n = copyOperandExtraParams(mSubModel, *toOperandIndex, operand);
    if (n != ANEURALNETWORKS_NO_ERROR) {
        LOG(ERROR) << "Error when copying extra parameters to the operand";
        return n;
    }

    // Sets its value.
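    // CONSTANT_* and NO_VALUE operands are copied into the submodel directly.
    // MODEL_INPUT operands are recorded in mModelInputs; TEMPORARY_VARIABLE and
    // MODEL_OUTPUT operands first seen as inputs must have been defined by another
    // partition, so they are recorded as cross-partition (submodel) inputs.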
    switch (operand.lifetime) {
        case OperandLifeTime::CONSTANT_COPY: {
            const uint8_t* data = fromModel.getPointerToOperandValue(operand.location.offset);
            n = mSubModel.setOperandValue(*toOperandIndex, data, operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::CONSTANT_REFERENCE: {
            const Memory* memory = fromModel.getMemories()[operand.location.poolIndex];
            n = mSubModel.setOperandValueFromMemory(*toOperandIndex, memory,
                                                    operand.location.offset,
                                                    operand.location.length);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::NO_VALUE: {
            n = mSubModel.setOperandValue(*toOperandIndex, nullptr, 0);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "Previous error occurred when partitioning the graph";
                return n;
            }
        } break;
        case OperandLifeTime::TEMPORARY_VARIABLE:  // handled similarly to MODEL_OUTPUT
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mTempsAsSubModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output. It may be an input to a different
                // partition, so keep track of it.
                mPlan->recordTemporaryDef(fromOperandIndex, mIndex);
            }
            break;
        case OperandLifeTime::MODEL_INPUT:
            mModelInputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            break;
        case OperandLifeTime::MODEL_OUTPUT:  // handled similarly to TEMPORARY_VARIABLE
            if (kind == INPUT) {
                // The first time we've seen this operand is as an
                // input. That means it must be defined by a
                // different partition, and is an input to this one.
                mOutputsAsSubModelInputs.push_back(
                        std::make_pair(fromOperandIndex, *toOperandIndex));
            } else {
                // The first time we've seen this operand is as an
                // output.
                mModelOutputs.push_back(std::make_pair(fromOperandIndex, *toOperandIndex));
            }
            break;
        default:
            nnAssert(false);
            break;
    }

    return ANEURALNETWORKS_NO_ERROR;
}

int ExecutionStep::addOperation(int operationIndex, const ModelBuilder& fromModel) {
    const Operation& operation = fromModel.getOperation(operationIndex);
    if (mToken.ok()) {
        mToken.update(&operationIndex, sizeof(operationIndex));
    }

    // Convert the input and output operand indexes.
    //
    // We expect operations to be added in topological order. Therefore:
    //
    // - We may not have seen an input if it is a model input, a
    //   constant, or an operand written by a different partition.
    //
    // - We should not have seen any outputs.
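    //   (Seeing an output here would mean the model is not topologically
    //   sorted, or that the same operand is written by more than one operation.)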
    const uint32_t inputCount = static_cast<uint32_t>(operation.inputs.size());
    const uint32_t outputCount = static_cast<uint32_t>(operation.outputs.size());
    std::vector<uint32_t> inputs(inputCount);
    std::vector<uint32_t> outputs(outputCount);

    auto addOperands = [this, &fromModel](const hidl_vec<uint32_t>& globalOperands,
                                          std::vector<uint32_t>& localOperands,
                                          OperandKind kind) -> int {
        const uint32_t operandCount = static_cast<uint32_t>(globalOperands.size());
        for (uint32_t i = 0; i < operandCount; i++) {
            uint32_t localOperand = ~0U;
            int n = addOperand(globalOperands[i], &localOperand, fromModel, kind);
            if (n != ANEURALNETWORKS_NO_ERROR) return n;
            localOperands[i] = localOperand;
        }
        return ANEURALNETWORKS_NO_ERROR;
    };

    int n;
    if ((n = addOperands(operation.inputs, inputs, INPUT)) != ANEURALNETWORKS_NO_ERROR ||
        (n = addOperands(operation.outputs, outputs, OUTPUT)) != ANEURALNETWORKS_NO_ERROR) {
        return n;
    }

    return mSubModel.addOperation(static_cast<uint32_t>(operation.type), inputCount, inputs.data(),
                                  outputCount, outputs.data());
}

void ExecutionStep::mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const {
    for (uint32_t i = 0, e = mInputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapInput(mInputIndexSubModelToFromModel[i], i);
    }
    for (uint32_t i = 0, e = mOutputIndexSubModelToFromModel.size(); i < e; i++) {
        stepExecutor->mapOutput(mOutputIndexSubModelToFromModel[i], i);
    }
}

void ExecutionPlan::CompoundBody::findTempsAsSubModelOutputs() {
    for (const auto& step : mSteps) {
        for (const auto& input : step->getTempsAsSubModelInputs()) {
            const uint32_t fromModelIndex = input.first;
            const auto it = mTemporaryToDefiningStep.find(fromModelIndex);
            nnAssert(it != mTemporaryToDefiningStep.end());
            const uint32_t stepIndex = it->second;
            nnAssert(stepIndex < mSteps.size());
            mSteps[stepIndex]->recordTempAsSubModelOutput(fromModelIndex);
        }
    }
}

void ExecutionStep::logSubModel() const {
    VLOG(COMPILATION) << "ExecutionStep::finishSubModel, step " << mIndex;

    auto logRemapEntry = [](std::string& toLog, const std::pair<uint32_t, uint32_t>& e) {
        if (!toLog.empty()) {
            toLog += ", ";
        }
        toLog += "(";
        toLog += std::to_string(e.first);
        toLog += "->";
        toLog += std::to_string(e.second);
        toLog += ")";
    };

    auto logRemapVector = [&logRemapEntry](const char* name, const RemapVectorType& map) {
        std::string toLog;
        for (const auto& e : map) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };
    auto logRemapSet = [&logRemapEntry](const char* name, const SubModelOutputSetType& set) {
        std::string toLog;
        for (const auto& e : set) {
            logRemapEntry(toLog, e);
        }
        VLOG(COMPILATION) << name << ": " << toLog;
    };

    logRemapVector("model inputs", mModelInputs);
    logRemapVector("model outputs", mModelOutputs);
    logRemapVector("temps as submodel inputs", mTempsAsSubModelInputs);
    logRemapSet("temps as submodel outputs", mTempsAsSubModelOutputs);
    logRemapVector("outputs as submodel inputs", mOutputsAsSubModelInputs);
}

static void convertModelInputsOrOutputs(
        // IN: mModel{Inputs|Outputs}
        const ExecutionStep::RemapVectorType& myModelInputsOrOutputs,
        // IN: fromModel->{input|output}Count()
        uint32_t fromModelInputOrOutputCount,
        // IN: fromModel->get{Input|Output}OperandIndex
        std::function<uint32_t(uint32_t)> fromModelGetInputOrOutputOperandIndex,
        // OUT: for v : mModel{Inputs|Outputs} : v.second
        std::vector<uint32_t>* inputsOrOutputs,
        // OUT: submodel input-or-output index to original model input-or-output index
        std::vector<uint32_t>* inputOrOutputIndexSubModelToFromModel) {
    std::map<uint32_t, uint32_t> fromModelIndexMap;  // operand index to input-or-output index
    for (uint32_t i = 0; i < fromModelInputOrOutputCount; i++) {
        fromModelIndexMap[fromModelGetInputOrOutputOperandIndex(i)] = i;
    }
    for (const auto& myInputOrOutput : myModelInputsOrOutputs) {
        inputsOrOutputs->push_back(myInputOrOutput.second);
        const uint32_t fromModelInputOrOutputIndex = fromModelIndexMap[myInputOrOutput.first];
        inputOrOutputIndexSubModelToFromModel->push_back(fromModelInputOrOutputIndex);
    }
}

int ExecutionStep::finishSubModel(const ModelBuilder* fromModel, bool* hasOutputOfUnknownSize,
                                  int32_t executionPreference) {
    nnAssert(mDevice != nullptr);
    if (VLOG_IS_ON(COMPILATION)) {
        logSubModel();
    }

    mSubModel.relaxComputationFloat32toFloat16(fromModel->isComputationFloat32RelaxedToFloat16());

    // Input order: mModelInputs, mTempsAsSubModelInputs, mOutputsAsSubModelInputs
    // Output order: mModelOutputs, mTempsAsSubModelOutputs
    //
    // ExecutionPlan::next() depends on these orderings.

    std::vector<uint32_t> inputs;
    convertModelInputsOrOutputs(mModelInputs, fromModel->inputCount(),
                                [=](uint32_t i) { return fromModel->getInputOperandIndex(i); },
                                &inputs, &mInputIndexSubModelToFromModel);
    for (const auto& subModelInput : mTempsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }
    for (const auto& subModelInput : mOutputsAsSubModelInputs) {
        inputs.push_back(subModelInput.second);
    }

    std::vector<uint32_t> outputs;
    convertModelInputsOrOutputs(mModelOutputs, fromModel->outputCount(),
                                [=](uint32_t i) { return fromModel->getOutputOperandIndex(i); },
                                &outputs, &mOutputIndexSubModelToFromModel);
    for (const auto& subModelOutput : mTempsAsSubModelOutputs) {
        outputs.push_back(subModelOutput.second);
        const Operand& operand = mSubModel.getOperand(subModelOutput.second);
        if (operand.dimensions.size() == 0) {
            *hasOutputOfUnknownSize = true;
        } else {
            for (uint32_t dimension : operand.dimensions) {
                if (dimension == 0) {
                    *hasOutputOfUnknownSize = true;
                    break;
                }
            }
        }
        if (*hasOutputOfUnknownSize) {
            VLOG(COMPILATION) << "SubModelOutput (operand#" << subModelOutput.first
                              << " of original graph) has unknown size: " << toString(operand);
        }
    }

    {
        int n = mSubModel.identifyInputsAndOutputs(inputs.size(), &inputs[0], outputs.size(),
                                                   &outputs[0]);
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
        n = mSubModel.finish();
        if (n != ANEURALNETWORKS_NO_ERROR) {
            return n;
        }
    }

    {
        // Compute mOutputsAsSubModelInputsIndexToFromModel.

        std::map<uint32_t, uint32_t> fromModelOperandIndexToOutputIndex;
        for (unsigned i = 0, e = fromModel->outputCount(); i < e; ++i) {
            fromModelOperandIndexToOutputIndex[fromModel->getOutputOperandIndex(i)] = i;
        }

        for (unsigned i = 0, e = mOutputsAsSubModelInputs.size(); i < e; i++) {
            const uint32_t fromModelOperandIndex = mOutputsAsSubModelInputs[i].first;
            const auto it = fromModelOperandIndexToOutputIndex.find(fromModelOperandIndex);
            if (it == fromModelOperandIndexToOutputIndex.end()) {
                LOG(ERROR) << "Could not find main model output operand " << fromModelOperandIndex
                           << " in main model output operand list";
                return ANEURALNETWORKS_BAD_STATE;
            }
            mOutputsAsSubModelInputsIndexToFromModel.push_back(it->second);
        }
    }

    // TODO: Move compilation elsewhere?
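    // Note that because each step is compiled here, partitioning cost currently
    // includes each driver's prepareModel() time for its submodel.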
VLOG(COMPILATION) << "ExecutionStep::finishSubModel, compilation on " << mDevice->getName(); return compile(mDevice, &mSubModel, executionPreference, *mPlan->getCacheDir(), &mToken, &mPreparedSubModel); } void ExecutionStep::dump() const { Model model; mSubModel.setHidlModel(&model); if (VLOG_IS_ON(COMPILATION)) { VLOG(COMPILATION) << "ExecutionStep#" << mIndex << " for " << mDevice->getName(); logModelToInfo(model); } } int ExecutionPlan::CompoundBody::finish(const ModelBuilder* fromModel, int32_t executionPreference) { findTempsAsSubModelOutputs(); for (const auto& step : mSteps) { int n = step->finishSubModel(fromModel, &mHasSubModelOutputOfUnknownSize, executionPreference); if (n != ANEURALNETWORKS_NO_ERROR) { VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- finishSubModel failed"; return n; } } if (mHasSubModelOutputOfUnknownSize) { VLOG(COMPILATION) << "ExecutionPlan::CompoundBody::finish -- mHasSubModelOutputOfUnknownSize"; return ANEURALNETWORKS_OP_FAILED; } mSuccessfulFinish = true; return ANEURALNETWORKS_NO_ERROR; } int ExecutionPlan::SimpleBody::finish([[maybe_unused]] const ModelBuilder* fromModel, int32_t executionPreference) { nnAssert(mDevice != nullptr); VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation"; const int n = compile(mDevice, mModel, executionPreference, *mCacheDir, &mToken, &mPreparedModel); mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR); return n; } int ExecutionPlan::finish(const ModelBuilder* fromModel, int32_t executionPreference) { nnAssert(mBody != nullptr); return mBody->finish(fromModel, executionPreference); } ExecutionPlan::Controller::Controller( const ExecutionPlan* plan, ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder, std::shared_ptr<const SubModelInputsAndOutputsType> subModelInputsAndOutputs, uint32_t totalSizeOfTemporaries) : mPlan(plan), mExecutionBuilder(executionBuilder), mBurstBuilder(burstBuilder), mSubModelInputsAndOutputs(subModelInputsAndOutputs), mNextStepIndex(0) { if (totalSizeOfTemporaries) { if (mTemporaries.create(totalSizeOfTemporaries) != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << "ExecutionPlan::Controller failed to allocate temporaries"; mNextStepIndex = kBadStepIndex; } } } // Attempt to create a burst object for each PreparedModel/Partition. If the // burst controller object cannot be made, return a nullptr in its place to // indicate the regular execution path should be used. This can occur either // because PreparedModel was nullptr (cpu was best choice), or because the // IPreparedModel was of insufficient version or failed to configure the burst. 
std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts() const {
    switch (mState) {
        // burst object for each partition in the compound case
        case COMPOUND: {
            std::vector<std::shared_ptr<ExecutionBurstController>> bursts;
            bursts.reserve(compound()->mSteps.size());
            for (const auto& step : compound()->mSteps) {
                if (const auto preparedModel = step->getPreparedSubModel()) {
                    bursts.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
                } else {
                    bursts.push_back(nullptr);
                }
            }
            return bursts;
        }
        // single burst object for the simple case
        case SIMPLE: {
            std::vector<std::shared_ptr<ExecutionBurstController>> burst;
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            if (const auto preparedModel = simpleBody->mPreparedModel) {
                burst.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
            } else {
                burst.push_back(nullptr);
            }
            return burst;
        }
        // no burst objects made
        default:
            return {};
    }
}

std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
        ExecutionBuilder* executionBuilder, const BurstBuilder* burstBuilder) const {
    nnAssert(isValid());

    // Create the layout for a Memory object big enough to hold
    // every TEMPORARY in the original model that is live across
    // partition boundaries.
    //
    // TODO: Rethink this approach for managing temporaries. Some
    // alternatives:
    //
    // 1) Adopt a memory layout scheme analogous to stack allocation,
    // where objects of non-overlapping lifetime can occupy the same
    // storage. We would still have a single Memory object in this
    // case.
    //
    // 2) Do something like what CpuExecutor does, and do allocations
    // and deallocations on the fly (during execution) before first
    // reference and after last reference, respectively. This would
    // mean having one Memory object per TEMPORARY; or, in a more
    // complicated implementation, one Memory object per set of
    // temporaries that have the same lifetime. Note that the Android
    // system limits the number of shared memory objects, which are
    // what our Memory objects represent.
    //
    uint32_t totalSizeOfTemporaries = 0;
    std::shared_ptr<Controller::SubModelInputsAndOutputsType> subModelInputsAndOutputs;
    if (mState == COMPOUND) {
        const ModelBuilder* fromModel = executionBuilder->getModel();
        for (const auto& step : compound()->mSteps) {
            for (const auto& output : step->getTempsAsSubModelOutputs()) {
                const uint32_t fromModelOperandIndex = output.first;
                const Operand& fromModelOperand = fromModel->getOperand(fromModelOperandIndex);
                if (subModelInputsAndOutputs == nullptr) {
                    subModelInputsAndOutputs =
                            std::make_shared<Controller::SubModelInputsAndOutputsType>();
                }
                const uint32_t size = TypeManager::get()->getSizeOfData(fromModelOperand);
                totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
                subModelInputsAndOutputs->insert(
                        std::make_pair(fromModelOperandIndex, totalSizeOfTemporaries));
                totalSizeOfTemporaries += size;
            }
        }
        if (VLOG_IS_ON(EXECUTION) && (subModelInputsAndOutputs != nullptr)) {
            for (const auto& io : *subModelInputsAndOutputs) {
                VLOG(EXECUTION) << "temp: origOpndIdx = " << io.first
                                << ", offset = " << io.second;
            }
        }
    }

    return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder,
                                                      subModelInputsAndOutputs,
                                                      totalSizeOfTemporaries));
}

// TODO: Find a better way to provide this functionality.
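// Rewinds the Controller by one step and re-runs next(), producing a fresh
// StepExecutor for the step that just failed so the caller can retry it
// (e.g., on the CPU). Returns ANEURALNETWORKS_OP_FAILED if next() has not
// been called yet, or if the last call to next() did not produce an executor.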
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
                            std::shared_ptr<StepExecutor>* executor) const {
    *executor = nullptr;

    VLOG(EXECUTION) << "ExecutionPlan::fallback(" << controller << ", " << executor
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == 0) {
        // We haven't called next().
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        // The last call to next() did not produce an executor.
        return ANEURALNETWORKS_OP_FAILED;
    }

    --controller->mNextStepIndex;
    return next(controller, executor);
}

int ExecutionPlan::next(std::shared_ptr<Controller> controller,
                        std::shared_ptr<StepExecutor>* executor,
                        std::shared_ptr<ExecutionBurstController>* burstController) const {
    *executor = nullptr;
    if (burstController != nullptr) {
        *burstController = nullptr;
    }

    VLOG(EXECUTION) << "ExecutionPlan::next(" << SHOW_IF_DEBUG(controller << ", " << executor)
                    << "): mNextStepIndex = " << controller->mNextStepIndex;

    if (controller->mNextStepIndex == Controller::kBadStepIndex) {
        return ANEURALNETWORKS_OP_FAILED;
    }

    if (mState == EMPTY) {
        nnAssert(controller->mNextStepIndex == 0);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    if (mState == SIMPLE) {
        if (controller->mNextStepIndex == 0) {
            // First (and only) step.
            auto simpleBody = static_cast<const SimpleBody*>(mBody);
            *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder,
                                                       simpleBody->mModel, simpleBody->mDevice,
                                                       simpleBody->mPreparedModel);
            (*executor)->mapInputsAndOutputsTrivially();
            if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
                *burstController = controller->mBurstBuilder->getControllerAt(0);
            }
            controller->mNextStepIndex = 1;
            return ANEURALNETWORKS_NO_ERROR;
        }

        nnAssert(controller->mNextStepIndex == 1);  // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    auto compoundBody = compound();

    if (controller->mNextStepIndex == compoundBody->mSteps.size()) {
        // end
        controller->mNextStepIndex = Controller::kBadStepIndex;
        return ANEURALNETWORKS_NO_ERROR;
    }

    // Input order: model inputs, temps as submodel inputs, outputs as submodel inputs
    // Output order: model outputs, temps as submodel outputs
    //
    // ExecutionStep::finishSubModel() establishes these orderings.

    const auto step = compoundBody->mSteps[controller->mNextStepIndex];
    *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getSubModel(),
                                               step->getDevice(), step->getPreparedSubModel());
    (*executor)->setExecutionStep(step);
    step->mapInputsAndOutputs(*executor);
    if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
        *burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
    }
    if (controller->mSubModelInputsAndOutputs != nullptr) {
        {
            // Tell executor about temps as submodel outputs.
            const size_t firstSubModelOutputIndex = step->getModelOutputs().size();
            const auto& subModelOutputs = step->getTempsAsSubModelOutputs();

            uint32_t idx = 0;
            for (auto I = subModelOutputs.begin(), E = subModelOutputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setOutputFromTemporaryMemory(firstSubModelOutputIndex + idx,
                                                                  &controller->mTemporaries,
                                                                  offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
        {
            // Tell executor about temps as submodel inputs.

            const size_t firstSubModelInputIndex = step->getModelInputs().size();
            const auto& subModelInputs = step->getTempsAsSubModelInputs();

            uint32_t idx = 0;
            for (auto I = subModelInputs.begin(), E = subModelInputs.end(); I != E; I++, idx++) {
                const uint32_t fromModelOperandIndex = I->first;
                const uint32_t offsetOfTemporary =
                        controller->mSubModelInputsAndOutputs->at(fromModelOperandIndex);
                int n = (*executor)->setInputFromTemporaryMemory(firstSubModelInputIndex + idx,
                                                                 &controller->mTemporaries,
                                                                 offsetOfTemporary);
                if (n != ANEURALNETWORKS_NO_ERROR) {
                    controller->mNextStepIndex = Controller::kBadStepIndex;
                    return n;
                }
            }
        }
    }
    {
        // Tell executor about outputs as submodel inputs.

        const size_t firstOutputsAsSubModelInputIndex =
                step->getModelInputs().size() + step->getTempsAsSubModelInputs().size();
        const auto& outputsAsSubModelInputsIndexToFromModel =
                step->getOutputsAsSubModelInputsIndexToFromModel();
        for (uint32_t i = 0, e = outputsAsSubModelInputsIndexToFromModel.size(); i < e; i++) {
            uint32_t o = outputsAsSubModelInputsIndexToFromModel[i];
            (*executor)->mapOutputToInput(o, firstOutputsAsSubModelInputIndex + i);
        }
    }

    controller->mNextStepIndex++;
    return ANEURALNETWORKS_NO_ERROR;
}

std::shared_ptr<ExecutionStep> ExecutionPlan::createNewStep(const std::shared_ptr<Device> device) {
    nnAssert(mState != SIMPLE);
    if (mState == EMPTY) {
        mBody = new CompoundBody();
        mState = COMPOUND;
    }
    auto& steps = compound()->mSteps;
    auto step = std::make_shared<ExecutionStep>(this, steps.size(), device);
    steps.push_back(step);
    return step;
}

void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
                                     const ModelBuilder* model) {
    nnAssert(mState == EMPTY);
    mBody = new SimpleBody(device, model, mCacheDir, mToken);
    mState = SIMPLE;
}

void ExecutionPlan::dump() const {
    if (mBody) {
        mBody->dump();
    } else {
        VLOG(COMPILATION) << "EMPTY";
    }
}

void ExecutionPlan::reset() {
    if (mBody) {
        delete mBody;
        mBody = nullptr;
    }
    mState = EMPTY;
}

ExecutionPlan::Kind ExecutionPlan::forTest_getKind() const {
    switch (mState) {
        case EMPTY:
            return Kind::EMPTY;
        case SIMPLE:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::SIMPLE : Kind::ERROR;
        case COMPOUND:
            nnAssert(mBody);
            return mBody->mSuccessfulFinish ? Kind::COMPOUND : Kind::ERROR;
        default:
            nnAssert(!"unexpected state");
            return Kind::ERROR;
    }
}

std::shared_ptr<const Device> ExecutionPlan::forTest_simpleGetDevice() const {
    nnAssert(mState == SIMPLE);
    return static_cast<const SimpleBody*>(mBody)->mDevice;
}

const std::vector<std::shared_ptr<ExecutionStep>>& ExecutionPlan::forTest_compoundGetSteps()
        const {
    return compound()->mSteps;
}

bool ExecutionPlan::forTest_hasSubModelOutputsOfUnknownSize() const {
    return mBody->hasSubModelOutputsOfUnknownSize();
}

const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
    CHECK(mState == SIMPLE)
            << "Calling forTest_simpleGetCacheToken from execution plan with a non-SIMPLE body";
    return static_cast<const SimpleBody*>(mBody)->mToken.getCacheToken();
}

void ExecutionPlan::SimpleBody::dump() const {
    VLOG(COMPILATION) << "SIMPLE for " << mDevice->getName();
}

void ExecutionPlan::CompoundBody::dump() const {
    for (const auto& step : mSteps) {
        step->dump();
    }
}

int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
                                   uint32_t preference, ExecutionPlan* plan) const {
    // This function uses a heuristic approach to partitioning the graph.
    // It should be good enough for the first release.

    const size_t deviceCount = devices.size();
    const size_t operationCount = mOperations.size();

    VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: deviceCount = " << deviceCount
                      << ", operationCount = " << operationCount;

    // Figure out where each operation will best execute.
    // The value of the vector is the index in the devices vector.
    std::vector<int> bestDeviceForOperation(operationCount);
    NN_RETURN_IF_ERROR(
            findBestDeviceForEachOperation(preference, devices, &bestDeviceForOperation));

    // If one device will run all the operations, we don't need to split the work.
    if (std::adjacent_find(bestDeviceForOperation.begin(), bestDeviceForOperation.end(),
                           std::not_equal_to<int>()) == bestDeviceForOperation.end()) {
        const int bestDeviceIndex = bestDeviceForOperation[0];
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: only one best device: "
                          << bestDeviceIndex << " = " << devices[bestDeviceIndex]->getName();
        plan->becomeSingleStep(devices[bestDeviceIndex], this);
        return plan->finish(this, preference);
    }

    // No easy solution, we need to split the work.

    // We keep track of the operations that are ready to run for each device.
    std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount);

    // This helper function enqueues the operation on the appropriate queue.
    auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
        int deviceIndex = bestDeviceForOperation[operationIndex];
        perDeviceQueue[deviceIndex].push(operationIndex);
        VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
                          << deviceIndex;
    };

    // This helper function finds a device that has operations ready to process.
    // We start by looking at the CPU. We do this to try to maximize the
    // size of the graph we'll send to non-CPU devices. If the CPU runs first,
    // it will have the chance to prepare more of the inputs required by the
    // other devices. This function returns -1 if all queues are empty.
    auto findNextDeviceToProcess = [&]() -> int {
        for (int i = deviceCount - 1; i >= 0; i--) {
            if (!perDeviceQueue[i].empty()) {
                return i;
            }
        }
        return -1;
    };

    OperandTracker tracker(this, enqueueOnAppropriateDevice);
    // For each iteration of this loop, we'll create an execution step.
    while (true) {
        // Find the device we'll do this step for.
        int deviceIndex = findNextDeviceToProcess();
        VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
        if (deviceIndex < 0) {
            break;
        }

        // Assign as much as possible to this device.
        std::shared_ptr<ExecutionStep> step = plan->createNewStep(devices[deviceIndex]);
        auto& queue = perDeviceQueue[deviceIndex];
        while (!queue.empty()) {
            uint32_t operationIndex = queue.front();
            queue.pop();
            int n = step->addOperation(operationIndex, *this);
            if (n != ANEURALNETWORKS_NO_ERROR) {
                LOG(ERROR) << "failed to add operation " << operationIndex << " to step";
                return n;
            }
            tracker.markProcessed(operationIndex, enqueueOnAppropriateDevice);
        }
    }

    int n = plan->finish(this, preference);
    if (VLOG_IS_ON(COMPILATION)) {
        Model model;
        setHidlModel(&model);
        VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: original model: ";
        logModelToInfo(model);
        plan->dump();
    }
    return n;
}

PerformanceInfo ModelBuilder::getPerformanceInfo(const std::shared_ptr<Device> device,
                                                 uint32_t operationIndex) const {
    const Operation& operation = getOperation(operationIndex);
    // TODO This assumes that the type is dictated by the first operand. This is
    // currently the case but is not a safe assumption to make in the long term.
    const uint32_t operandIndex = operation.inputs[0];
    const OperandType operandType = mOperands[operandIndex].type;
    switch (operandType) {
        case OperandType::FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return device->getRelaxedFloat32toFloat16PerformanceScalar();
            }
            break;
        case OperandType::TENSOR_FLOAT32:
            if (mRelaxComputationFloat32toFloat16) {
                return device->getRelaxedFloat32toFloat16PerformanceTensor();
            }
            break;
        default:
            break;
    }
    return device->getPerformance(operandType);
}

namespace {

// Add an element to the end of the vector and return a pair consisting of the
// index of the new element and a pointer to the new element.
template <class T>
std::pair<uint32_t, T*> extend(hidl_vec<T>* vec) {
    size_t nextIndex = vec->size();
    vec->resize(nextIndex + 1);
    return {static_cast<uint32_t>(nextIndex), &(*vec)[nextIndex]};
}

// Add an element to the end of the vector, set it to the specified value, and
// return a pair consisting of the index of the new element and a pointer to the
// new element.
template <class T>
std::pair<uint32_t, T*> extend(hidl_vec<T>* vec, const T& val) {
    auto extended = extend(vec);
    *extended.second = val;
    return extended;
}

template <typename T>
bool operator<(const hidl_vec<T>& a, const hidl_vec<T>& b) {
    return std::lexicographical_compare(a.begin(), a.end(), b.begin(), b.end());
}

// Compile-time mapping from a particular Model type to a name for that type.
template <class T_Model>
struct ModelVersion;
template <>
struct ModelVersion<V1_0::Model> {
    static constexpr char name[] = "V1_0";
};
template <>
struct ModelVersion<V1_1::Model> {
    static constexpr char name[] = "V1_1";
};
template <>
struct ModelVersion<V1_2::Model> {
    static constexpr char name[] = "V1_2";
};

// Dispatcher mechanism for calling an appropriate uncheckedConvertToV1_*
// given the desired return type.
template <typename T_ReturnType>
T_ReturnType uncheckedConvertTo(OperationType type);
template <>
V1_0::OperationType uncheckedConvertTo<V1_0::OperationType>(OperationType type) {
    return uncheckedConvertToV1_0(type);
}
template <>
V1_1::OperationType uncheckedConvertTo<V1_1::OperationType>(OperationType type) {
    return uncheckedConvertToV1_1(type);
}

// Dispatcher mechanism for calling an appropriate convertToV1_* given the
// desired return type. Note that there is no V1_1::Operand type.
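// Overload resolution cannot dispatch on return type alone, which is why these
// conversions are written as explicit specializations selected via the template
// argument, e.g. convertTo<V1_0::Operand>(operand).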
template <typename T_ReturnType>
T_ReturnType convertTo(Operand operand);
template <>
V1_0::Operand convertTo<V1_0::Operand>(Operand operand) {
    return convertToV1_0(operand);
}

// Dispatcher mechanism for calling an appropriate compliantWithV1_* given the
// desired target model type.
template <typename T_SlicedModel>
void getNoncompliantOperations(const V1_2::Model& model,
                               std::set<uint32_t>* noncompliantOperations);
template <>
void getNoncompliantOperations<V1_0::Model>(const V1_2::Model& model,
                                            std::set<uint32_t>* noncompliantOperations) {
    compliantWithV1_0(model, noncompliantOperations);
}
template <>
void getNoncompliantOperations<V1_1::Model>(const V1_2::Model& model,
                                            std::set<uint32_t>* noncompliantOperations) {
    compliantWithV1_1(model, noncompliantOperations);
}

class PlanModelSlicer : public IModelSlicer {
   public:
    PlanModelSlicer(const ModelBuilder* model);

    std::optional<std::pair<V1_0::Model, std::function<uint32_t(uint32_t)>>> getSliceV1_0()
            override {
        return getSlice(&mSliceV1_0);
    }
    std::optional<std::pair<V1_1::Model, std::function<uint32_t(uint32_t)>>> getSliceV1_1()
            override {
        return getSlice(&mSliceV1_1);
    }

    const Model& getModel() const { return mHidlModel; }

   private:
    template <class T_SlicedModel>
    static bool invalid(const T_SlicedModel& model);

    enum class SliceState { UNINITIALIZED, INVALID, NORMAL };
    template <class T_SlicedModel>
    struct Slice {
        SliceState mState = SliceState::UNINITIALIZED;
        T_SlicedModel mHidlModel;
        std::vector<uint32_t> mSlicedOperationIndexToOrigIndex;
    };
    Slice<V1_0::Model> mSliceV1_0;
    Slice<V1_1::Model> mSliceV1_1;

    template <class T_SlicedModel>
    void initializeSlice(Slice<T_SlicedModel>* slice);

    template <class T_SlicedModel>
    std::optional<std::pair<T_SlicedModel, std::function<uint32_t(uint32_t)>>> getSlice(
            Slice<T_SlicedModel>* slice) {
        CHECK(slice != nullptr);
        if (slice->mState == SliceState::UNINITIALIZED) {
            initializeSlice(slice);
        }
        if (slice->mState == SliceState::INVALID) {
            return {};
        }
        return std::pair<T_SlicedModel, std::function<uint32_t(uint32_t)>>(
                slice->mHidlModel, [slice](uint32_t slicedOperationIndex) {
                    return slice->mSlicedOperationIndexToOrigIndex.at(slicedOperationIndex);
                });
    }

    Model mHidlModel;
};

template <class T_SlicedModel>
bool PlanModelSlicer::invalid(const T_SlicedModel& model) {
    // A model must have at least one operation. However, it's possible that a
    // slice has no operations (because no operations from the original model
    // are compliant with the sliced model type). In this case, the sliced
    // model would be invalid.
    const bool looksEmpty = (model.operations.size() == 0);
    if (DeviceManager::get()->strictSlicing()) {
        CHECK_EQ(looksEmpty, (model.operands.size() == 0));
    }
    if (looksEmpty) return true;

    // A model must have at least one output. However, it's possible for a
    // model to contain dead operations (i.e., outputs on which no model outputs
    // are data dependent). A slice might contain only dead operations, and
    // hence have no model outputs. In this case, the sliced model would be
    // invalid.
    if (model.outputIndexes.size() == 0) return true;

    // We shouldn't have to check whether the model is valid.
    // However, it could be invalid if:
    // - there is an error in the slicing algorithm; or
    // - there is an error in compliantWith (see http://b/131845106)
    if (!validateModel(model)) {
        LOG(WARNING) << "Sliced model fails validateModel()";
        CHECK(!DeviceManager::get()->strictSlicing());
        return true;
    }

    return false;
}

PlanModelSlicer::PlanModelSlicer(const ModelBuilder* model) {
    model->setHidlModel(&mHidlModel);
}

template <class T_SlicedModel>
void PlanModelSlicer::initializeSlice(Slice<T_SlicedModel>* slice) {
    using SlicedOperand = std::remove_pointer_t<decltype(slice->mHidlModel.operands.data())>;
    using SlicedOperation = std::remove_pointer_t<decltype(slice->mHidlModel.operations.data())>;
    using SlicedOperationType = decltype(SlicedOperation::type);

    CHECK(slice->mState == SliceState::UNINITIALIZED);

    const auto& origOperands = mHidlModel.operands;
    const auto& origOperations = mHidlModel.operations;
    auto& slicedOperands = slice->mHidlModel.operands;
    auto& slicedOperations = slice->mHidlModel.operations;

    // Indexes of elements of noncompliant origOperations
    std::set<uint32_t> noncompliantOperations;
    getNoncompliantOperations<T_SlicedModel>(mHidlModel, &noncompliantOperations);

    // Map from an operand index in origOperands to the corresponding operand index in
    // slicedOperands
    std::map<uint32_t, uint32_t> origOperandIndexToSlicedIndex;

    // Collect the operand indexes of every operand that is an input to a
    // compliant operation. If the operand is a CONSTANT_* or a NO_VALUE, copy
    // it to the sliced model and update origOperandIndexToSlicedIndex
    // accordingly. Otherwise, we'll deal with the operand in the subsequent
    // "Main loop", where we process operation outputs (intermediates and model
    // outputs).
    std::set<uint32_t> inputOperandIndexesOfCompliantOperations;
    for (uint32_t origOperationIndex = 0; origOperationIndex < origOperations.size();
         ++origOperationIndex) {
        if (noncompliantOperations.count(origOperationIndex)) {
            continue;
        }
        for (uint32_t input : origOperations[origOperationIndex].inputs) {
            if (inputOperandIndexesOfCompliantOperations.insert(input).second) {
                const Operand& origOperand = origOperands[input];
                switch (origOperand.lifetime) {
                    case OperandLifeTime::CONSTANT_COPY:
                    case OperandLifeTime::CONSTANT_REFERENCE:
                    case OperandLifeTime::NO_VALUE: {
                        const uint32_t slicedOperandIndex =
                                extend(&slicedOperands, convertTo<SlicedOperand>(origOperand))
                                        .first;
                        slicedOperands[slicedOperandIndex].numberOfConsumers = 0;
                        origOperandIndexToSlicedIndex[input] = slicedOperandIndex;
                        VLOG(COMPILATION)
                                << "origOperandIndexToSlicedIndex initialization created " << input
                                << " -> " << slicedOperandIndex << ": "
                                << toString(slicedOperands[slicedOperandIndex]);
                        break;
                    }
                    default:
                        break;
                }
            }
        }
    }

    // For each output operand of a noncompliant operation that is the input
    // operand of at least one compliant operation, we will ensure that there is
    // a sliced model input whose "type" is that of the output operand. This is
    // a map from output operand "type" (in the original model) to model input
    // operand index (in the sliced model). Unfortunately, there is no
    // representation of operand "type" defined in the HAL that we can use
    // naively here -- we want (OperandType, dimensions, scale, zeroPoint,
    // extraParams), but these fields exist in Operand along with other fields
    // that need to be excluded from the map key (numberOfConsumers, lifetime,
    // location). There are several choices:
    // - Don't have a map -- each output identified above gets its own sliced
    //   model input (no sharing of sliced model inputs).
    // - Create an operand "type" representation solely for use as a map key.
    // - Write a tailored comparison function that ignores the excluded fields.
    // We choose to write a tailored comparison function. If Treble were to
    // generate a comparison function for us (http://b/130567619) then it might
    // be better to instead reset the excluded fields to canonical values --
    // then we could use the Treble provided comparison function, and the
    // solution would be robust (in a correctness sense, not a sharing sense) if
    // more fields are added and we neglect to canonicalize them.
    //
    // We also use this map for model input operands of the original model that
    // become input operands of the sliced model. This means that an original
    // model input operand might be coalesced with other original model input
    // operands and/or with original model temporary operands.
    class OrigOperandToSlicedInputOperandIndex {
       public:
        OrigOperandToSlicedInputOperandIndex(hidl_vec<SlicedOperand>* slicedOperands,
                                             hidl_vec<uint32_t>* slicedInputIndexes)
            : mSlicedOperands(*slicedOperands), mSlicedInputIndexes(*slicedInputIndexes) {}

        // Given an operand from the original model, return the index of the
        // corresponding model input operand from the sliced model. Creates a
        // new operand in the sliced model if necessary.
        uint32_t getIndex(Operand operand) {
            // Lookup
            auto it = mMap.find(operand);
            if (it != mMap.end()) {
                VLOG(COMPILATION) << "OrigOperandToSlicedInputOperandIndex::getIndex looked for "
                                  << toString(operand) << " and found " << it->second << ": "
                                  << toString(it->first);
                return it->second;
            }

            // Create
            operand.numberOfConsumers = 0;
            operand.lifetime = OperandLifeTime::MODEL_INPUT;
            operand.location = {};
            uint32_t slicedOperandIndex =
                    extend(&mSlicedOperands, convertTo<SlicedOperand>(operand)).first;
            mMap[operand] = slicedOperandIndex;
            extend(&mSlicedInputIndexes, slicedOperandIndex);
            VLOG(COMPILATION) << "OrigOperandToSlicedInputOperandIndex::getIndex created "
                              << slicedOperandIndex << ": " << toString(operand);
            return slicedOperandIndex;
        }

       private:
        class Compare {
           public:
            bool operator()(const Operand& a, const Operand& b) const {
                if (a.type != b.type) {
                    return a.type < b.type;
                }
                if (a.dimensions != b.dimensions) {
                    return a.dimensions < b.dimensions;
                }
                if (a.scale != b.scale) {
                    return a.scale < b.scale;
                }
                if (a.zeroPoint != b.zeroPoint) {
                    return a.zeroPoint < b.zeroPoint;
                }
                return compare(a.extraParams, b.extraParams);
            }

           private:
            static bool compare(const SymmPerChannelQuantParams& a,
                                const SymmPerChannelQuantParams& b) {
                if (a.scales != b.scales) {
                    return a.scales < b.scales;
                }
                return a.channelDim < b.channelDim;
            }
            static bool compare(const Operand::ExtraParams& a, const Operand::ExtraParams& b) {
                if (a.getDiscriminator() != b.getDiscriminator()) {
                    return a.getDiscriminator() < b.getDiscriminator();
                }
                switch (a.getDiscriminator()) {
                    default:
                        CHECK(false) << "Unexpected";
                        FALLTHROUGH_INTENDED;
                    case Operand::ExtraParams::hidl_discriminator::none:
                        return false;
                    case Operand::ExtraParams::hidl_discriminator::channelQuant:
                        return compare(a.channelQuant(), b.channelQuant());
                    case Operand::ExtraParams::hidl_discriminator::extension:
                        return a.extension() < b.extension();
                }
            }
        };
        std::map<Operand, uint32_t, Compare> mMap;
        hidl_vec<SlicedOperand>& mSlicedOperands;
        hidl_vec<uint32_t>& mSlicedInputIndexes;
    } origOperandToSlicedInputOperandIndex(&slicedOperands, &slice->mHidlModel.inputIndexes);

    // An input of the original model is an input of the sliced model if and
    // only if it is consumed by at least one compliant operation.
    // Note that in the sliced model we share all model inputs of the same
    // "type"; and that we may later add model inputs to the sliced model.
    for (uint32_t origInputIndex : mHidlModel.inputIndexes) {
        if (inputOperandIndexesOfCompliantOperations.count(origInputIndex)) {
            const uint32_t slicedIndex =
                    origOperandToSlicedInputOperandIndex.getIndex(origOperands[origInputIndex]);
            origOperandIndexToSlicedIndex[origInputIndex] = slicedIndex;
            VLOG(COMPILATION) << "origOperandIndexToSlicedIndex inputIndexes processing created "
                              << origInputIndex << " -> " << slicedIndex << ": "
                              << toString(slicedOperands[slicedIndex]);
        }
    }

    // Main loop: Process each operation of the original model.
    for (uint32_t origOperationIndex = 0; origOperationIndex < origOperations.size();
         ++origOperationIndex) {
        const Operation& origOperation = origOperations[origOperationIndex];

        if (noncompliantOperations.count(origOperationIndex)) {
            for (uint32_t output : origOperation.outputs) {
                if (!inputOperandIndexesOfCompliantOperations.count(output)) {
                    continue;
                }
                const uint32_t slicedIndex =
                        origOperandToSlicedInputOperandIndex.getIndex(origOperands[output]);
                origOperandIndexToSlicedIndex[output] = slicedIndex;
                VLOG(COMPILATION)
                        << "origOperandIndexToSlicedIndex noncompliant output processing created "
                        << output << " -> " << slicedIndex << ": "
                        << toString(slicedOperands[slicedIndex]);
            }
        } else {
            slice->mSlicedOperationIndexToOrigIndex.push_back(origOperationIndex);
            SlicedOperation& slicedOperation = *extend(&slicedOperations).second;
            CHECK(slice->mSlicedOperationIndexToOrigIndex.size() == slicedOperations.size());
            slicedOperation.type = uncheckedConvertTo<SlicedOperationType>(origOperation.type);

            // Model is topologically sorted, so all inputs must be present in
            // origOperandIndexToSlicedIndex, and no outputs may be.
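            // (All inputs were mapped either above -- constants, model inputs,
            // noncompliant-operation outputs -- or as the outputs of an earlier
            // compliant operation in this loop.)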
            // Operation inputs
            // - Fill in slicedOperation.inputs
            // - Update number of consumers for each input operand
            slicedOperation.inputs.resize(origOperation.inputs.size());
            std::transform(
                    origOperation.inputs.begin(), origOperation.inputs.end(),
                    slicedOperation.inputs.begin(),
                    [&origOperandIndexToSlicedIndex, &slicedOperands](uint32_t origOperandIndex) {
                        uint32_t slicedOperandIndex =
                                origOperandIndexToSlicedIndex.at(origOperandIndex);
                        slicedOperands[slicedOperandIndex].numberOfConsumers++;
                        VLOG(COMPILATION) << "origOperandIndexToSlicedIndex compliant input "
                                             "processing created "
                                          << origOperandIndex << " -> " << slicedOperandIndex
                                          << ": " << toString(slicedOperands[slicedOperandIndex]);
                        return slicedOperandIndex;
                    });

            // Operation outputs
            // - Add new operands to slicedOperands
            // - Update origOperandIndexToSlicedIndex
            // - Fill in slicedOperation.outputs
            // - Record as a model output, if necessary
            const uint32_t firstOutputSlicedOperandIndex = slicedOperands.size();
            slicedOperands.resize(firstOutputSlicedOperandIndex + origOperation.outputs.size());
            slicedOperation.outputs.resize(origOperation.outputs.size());
            for (uint32_t outputNum = 0; outputNum < slicedOperation.outputs.size(); ++outputNum) {
                uint32_t origOperandIndex = origOperation.outputs[outputNum];
                uint32_t slicedOperandIndex = firstOutputSlicedOperandIndex + outputNum;
                auto& slicedOperand = slicedOperands[slicedOperandIndex];
                const auto& origOperand = origOperands[origOperandIndex];
                slicedOperand = convertTo<SlicedOperand>(origOperand);
                slicedOperand.numberOfConsumers = 0;

                CHECK(origOperandIndexToSlicedIndex.count(origOperandIndex) == 0);
                origOperandIndexToSlicedIndex[origOperandIndex] = slicedOperandIndex;
                slicedOperation.outputs[outputNum] = slicedOperandIndex;

                if (!inputOperandIndexesOfCompliantOperations.count(origOperandIndex) &&
                    origOperand.numberOfConsumers) {
                    // Was consumed only by noncompliant operations; convert to
                    // an output of the sliced model.
                    slicedOperand.lifetime = OperandLifeTime::MODEL_OUTPUT;
                }

                VLOG(COMPILATION) << "origOperandIndexToSlicedIndex compliant output created "
                                  << origOperandIndex << " -> " << slicedOperandIndex << ": "
                                  << toString(slicedOperand);

                if (slicedOperand.lifetime == OperandLifeTime::MODEL_OUTPUT) {
                    extend(&slice->mHidlModel.outputIndexes, slicedOperandIndex);
                }
            }
        }
    }

    // To keep things simple, we copy over these fields as-is. We could instead
    // opt to regenerate them based on the operands present in the sliced model:
    // This would be more complex and probably take more computation time, but
    // it would reduce the size of the sliced model, and hence the time spent
    // copying it around and passing it across the HAL interface.
    slice->mHidlModel.operandValues = mHidlModel.operandValues;
    slice->mHidlModel.pools = mHidlModel.pools;

    if (VLOG_IS_ON(COMPILATION)) {
        {
            std::ostrstream fromName;
            fromName << "Slice: From " << ModelVersion<decltype(mHidlModel)>::name << std::ends;
            graphDump(fromName.str(), mHidlModel);
            fromName.freeze(false);
        }
        {
            std::ostrstream toName;
            toName << "Slice: To " << ModelVersion<decltype(slice->mHidlModel)>::name << std::ends;
            graphDump(toName.str(), convertToV1_2(slice->mHidlModel));
            toName.freeze(false);
        }
    }

    slice->mState = invalid(slice->mHidlModel) ? SliceState::INVALID : SliceState::NORMAL;
}

// This class determines whether a given device can execute a given operation
class CanDo {
   public:
    CanDo() {}

    void initialize(PlanModelSlicer* slicer, std::shared_ptr<Device> device) {
        device->getSupportedOperations(slicer->getModel(), slicer, &mSupportsOperationByIndex);
    }

    bool check(size_t operationIndex) const { return mSupportsOperationByIndex[operationIndex]; }

   private:
    hidl_vec<bool> mSupportsOperationByIndex;
};

}  // anonymous namespace

int ModelBuilder::findBestDeviceForEachOperation(
        uint32_t preference, const std::vector<std::shared_ptr<Device>>& devices,
        std::vector<int>* bestDeviceForOperation) const {
    PlanModelSlicer slicer(this);
    const size_t deviceCount = devices.size();
    std::vector<CanDo> canDo(deviceCount);
    for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
        canDo[deviceIndex].initialize(&slicer, devices[deviceIndex]);
    }

    // Figure out the best driver for each operation.
    const size_t operationCount = mOperations.size();
    for (size_t operationIndex = 0; operationIndex < operationCount; operationIndex++) {
        // Find which device, including CPU fallback, gives the best performance for this
        // operation.
        int bestChoice = -1;
        float bestPerfVal = 0.0;  // Do not check bestPerfVal if bestChoice < 0.
        for (size_t deviceIndex = 0; deviceIndex < deviceCount; deviceIndex++) {
            const auto& device = devices[deviceIndex];
            if (canDo[deviceIndex].check(operationIndex)) {
                const PerformanceInfo perf = getPerformanceInfo(device, operationIndex);
                const float perfVal = (preference == ANEURALNETWORKS_PREFER_LOW_POWER
                                               ? perf.powerUsage
                                               : perf.execTime);
                if (bestChoice < 0 || perfVal < bestPerfVal ||
                    (perfVal == bestPerfVal && device == DeviceManager::getCpuDevice())) {
                    bestChoice = deviceIndex;
                    bestPerfVal = perfVal;
                }
            } else {
                // Somewhat noisy logging, but only place where the user of
                // NNAPI can get feedback on why an operation was not run on a
                // specific device.
                //
                // Logs O(operationCount * deviceCount) times, but
                // typically deviceCount is very small.
                VLOG(COMPILATION) << "Device " << device->getName() << " can't do operation "
                                  << toString(getOperation(operationIndex).type);
            }
        }
        if (bestChoice < 0) {
            LOG(ERROR) << "No driver can do the op";
            return ANEURALNETWORKS_BAD_DATA;
        }

        (*bestDeviceForOperation)[operationIndex] = bestChoice;
        VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
                          << toString(getOperation(operationIndex).type) << ") = " << bestChoice
                          << " (" << devices[bestChoice]->getName() << ")";
    }
    return ANEURALNETWORKS_NO_ERROR;
}

}  // namespace nn
}  // namespace android