/* * Copyright (C) 2017 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #define LOG_TAG "ExecutionBuilder" #include "ExecutionBuilder.h" #include "CompilationBuilder.h" #include "CpuExecutor.h" #include "ExecutionBurstController.h" #include "HalInterfaces.h" #include "Manager.h" #include "ModelBuilder.h" #include "Tracing.h" #include "TypeManager.h" #include "Utils.h" #include <mutex> #include <optional> #include <thread> #include <vector> namespace android { namespace nn { using HidlToken = hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>; const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX}; static MeasureTiming measureTiming(const ExecutionBuilder* execution) { return execution->measureTiming() ? MeasureTiming::YES : MeasureTiming::NO; } static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType, const char* tag, bool allowUnspecified) { if (newType != nullptr) { const Extension::OperandTypeInformation* info = nullptr; if (isExtensionOperandType(operand.type)) { NN_RET_CHECK(TypeManager::get()->getExtensionOperandTypeInfo(operand.type, &info)); } if (validateOperandType(*newType, info, tag, allowUnspecified) != ANEURALNETWORKS_NO_ERROR) { LOG(ERROR) << tag << ": Invalid newType"; return false; } if (operand.dimensions.size() == 0) { return true; } if (operand.dimensions.size() != newType->dimensionCount) { LOG(ERROR) << tag << ": Setting with incompatible dimension count"; return false; } for (uint32_t i = 0; i < newType->dimensionCount; i++) { if (operand.dimensions[i] != newType->dimensions[i] && operand.dimensions[i] != 0) { LOG(ERROR) << tag << ": Overriding a fully specified dimension is disallowed"; return false; } } } else { if (!allowUnspecified && TypeManager::get()->isTensorType(operand.type) && tensorHasUnspecifiedDimensions(operand)) { LOG(ERROR) << tag << ": Setting with operand type that is not fully specified"; return false; } } return true; } int ModelArgumentInfo::setFromPointer(const Operand& operand, const ANeuralNetworksOperandType* type, void* data, uint32_t length) { if ((data == nullptr) != (length == 0)) { const char* dataPtrMsg = data ? "NOT_NULLPTR" : "NULLPTR"; LOG(ERROR) << "Data pointer must be nullptr if and only if length is zero (data = " << dataPtrMsg << ", length = " << length << ")"; return ANEURALNETWORKS_BAD_DATA; } if (data == nullptr) { state = ModelArgumentInfo::HAS_NO_VALUE; } else { NN_RETURN_IF_ERROR(updateDimensionInfo(operand, type)); if (operand.type != OperandType::OEM) { uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions); if (neededLength != length && neededLength != 0) { LOG(ERROR) << "Setting argument with invalid length: " << length << ", expected length: " << neededLength; return ANEURALNETWORKS_BAD_DATA; } } state = ModelArgumentInfo::POINTER; } buffer = data; locationAndLength = {.poolIndex = 0, .offset = 0, .length = length}; return ANEURALNETWORKS_NO_ERROR; } int ModelArgumentInfo::setFromMemory(const Operand& operand, const ANeuralNetworksOperandType* type, uint32_t poolIndex, uint32_t offset, uint32_t length) { NN_RETURN_IF_ERROR(updateDimensionInfo(operand, type)); if (operand.type != OperandType::OEM) { uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions); if (neededLength != length && neededLength != 0) { LOG(ERROR) << "Setting argument with invalid length: " << length << ", expected length: " << neededLength; return ANEURALNETWORKS_BAD_DATA; } } state = ModelArgumentInfo::MEMORY; locationAndLength = {.poolIndex = poolIndex, .offset = offset, .length = length}; buffer = nullptr; return ANEURALNETWORKS_NO_ERROR; } int ModelArgumentInfo::setFromTemporaryMemory(const Operand& operand, uint32_t poolIndex, uint32_t offset, uint32_t length) { NN_RETURN_IF_ERROR(updateDimensionInfo(operand, nullptr)); if (operand.type != OperandType::OEM) { uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions); if (neededLength != length) { LOG(ERROR) << "Setting argument with invalid length: " << length << ", expected length: " << neededLength; return ANEURALNETWORKS_BAD_DATA; } } state = ModelArgumentInfo::MEMORY; locationAndLength = { .poolIndex = poolIndex, .offset = offset, .length = length, }; buffer = nullptr; return ANEURALNETWORKS_NO_ERROR; } int ModelArgumentInfo::updateDimensionInfo(const Operand& operand, const ANeuralNetworksOperandType* newType) { if (newType == nullptr) { dimensions = operand.dimensions; } else { const uint32_t count = newType->dimensionCount; dimensions = hidl_vec<uint32_t>(count); std::copy(&newType->dimensions[0], &newType->dimensions[count], dimensions.begin()); } return ANEURALNETWORKS_NO_ERROR; } ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation) : mCompilation(compilation), mModel(compilation->mModel), mPlan(&compilation->mPlan), mPartitioning(compilation->mPartitioning), mInputs(mModel->inputCount()), mOutputs(mModel->outputCount()) { VLOG(EXECUTION) << "ExecutionBuilder::ExecutionBuilder"; } int ExecutionBuilder::setInput(uint32_t index, const ANeuralNetworksOperandType* type, const void* buffer, size_t length) { if (mStarted) { LOG(ERROR) << "ANeuralNetworksExecution_setInput called after the " "execution has started."; return ANEURALNETWORKS_BAD_STATE; } uint32_t count = static_cast<uint32_t>(mInputs.size()); if (index >= count) { LOG(ERROR) << "ANeuralNetworksExecution_setInput bad index " << index << " " << count; return ANEURALNETWORKS_BAD_DATA; } if (!checkDimensionInfo(mModel->getInputOperand(index), type, "ANeuralNetworksExecution_setInput", buffer == nullptr)) { return ANEURALNETWORKS_BAD_DATA; } if (length > 0xFFFFFFFF) { LOG(ERROR) << "ANeuralNetworksExecution_setInput input exceeds max length " << length; return ANEURALNETWORKS_BAD_DATA; } uint32_t l = static_cast<uint32_t>(length); return mInputs[index].setFromPointer(mModel->getInputOperand(index), type, const_cast<void*>(buffer), l); } int ExecutionBuilder::setInputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, const Memory* memory, size_t offset, size_t length) { // Should be similar to StepExecutor::setInputOrOutputFromTemporaryMemory() if (mStarted) { LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory called after the " "execution has started."; return ANEURALNETWORKS_BAD_STATE; } uint32_t count = static_cast<uint32_t>(mInputs.size()); if (index >= count) { LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory bad index " << index << " " << count; return ANEURALNETWORKS_BAD_DATA; } if (!checkDimensionInfo(mModel->getInputOperand(index), type, "ANeuralNetworksExecution_setInputFromMemory", false)) { return ANEURALNETWORKS_BAD_DATA; } // Both offset & length must be zero for Non-BLOB format AHardwareBuffer. if (memory->getHidlMemory().name() == "hardware_buffer" && (offset != 0 || length != 0)) { LOG(ERROR) << "ANeuralNetworksExecution_setInputFromMemory has non-zero offset and length" << " for Non-BLOB format AHardwareBuffer."; return ANEURALNETWORKS_BAD_DATA; } else if (!memory->validateSize(offset, length)) { return ANEURALNETWORKS_BAD_DATA; } // TODO validate the rest uint32_t poolIndex = mMemories.add(memory); return mInputs[index].setFromMemory(mModel->getInputOperand(index), type, poolIndex, offset, length); } int ExecutionBuilder::setOutput(uint32_t index, const ANeuralNetworksOperandType* type, void* buffer, size_t length) { if (mStarted) { LOG(ERROR) << "ANeuralNetworksExecution_setOutput called after the " "execution has started."; return ANEURALNETWORKS_BAD_STATE; } uint32_t count = static_cast<uint32_t>(mOutputs.size()); if (index >= count) { LOG(ERROR) << "ANeuralNetworksExecution_setOutput bad index " << index << " " << count; return ANEURALNETWORKS_BAD_DATA; } if (!checkDimensionInfo(mModel->getOutputOperand(index), type, "ANeuralNetworksExecution_setOutput", true)) { return ANEURALNETWORKS_BAD_DATA; } if (length > 0xFFFFFFFF) { LOG(ERROR) << "ANeuralNetworksExecution_setOutput input exceeds max length " << length; return ANEURALNETWORKS_BAD_DATA; } uint32_t l = static_cast<uint32_t>(length); return mOutputs[index].setFromPointer(mModel->getOutputOperand(index), type, buffer, l); } int ExecutionBuilder::setOutputFromMemory(uint32_t index, const ANeuralNetworksOperandType* type, const Memory* memory, size_t offset, size_t length) { // Should be similar to StepExecutor::setInputOrOutputFromTemporaryMemory() if (mStarted) { LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory called after the " "execution has started."; return ANEURALNETWORKS_BAD_STATE; } uint32_t count = static_cast<uint32_t>(mOutputs.size()); if (index >= count) { LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory bad index " << index << " " << count; return ANEURALNETWORKS_BAD_DATA; } if (!checkDimensionInfo(mModel->getOutputOperand(index), type, "ANeuralNetworksExecution_setOutputFromMemory", true)) { return ANEURALNETWORKS_BAD_DATA; } // Both offset & length must be zero for Non-BLOB format AHardwareBuffer. if (memory->getHidlMemory().name() == "hardware_buffer" && (offset != 0 || length != 0)) { LOG(ERROR) << "ANeuralNetworksExecution_setOutputFromMemory has non-zero offset and length" << " for Non-BLOB format AHardwareBuffer."; return ANEURALNETWORKS_BAD_DATA; } else if (!memory->validateSize(offset, length)) { return ANEURALNETWORKS_BAD_DATA; } // TODO validate the rest uint32_t poolIndex = mMemories.add(memory); return mOutputs[index].setFromMemory(mModel->getOutputOperand(index), type, poolIndex, offset, length); } int ExecutionBuilder::setMeasureTiming(bool measure) { if (!mCompilation->mExplicitDeviceList || (mCompilation->mDevices.size() != 1)) { LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called on " << "an ANeuralNetworksExecution created from an ANeuralNetworksCompilation " << "that was not created by ANeuralNetworksCompilation_createForDevices " << "with numDevices = 1"; return ANEURALNETWORKS_BAD_DATA; } if (mStarted) { LOG(ERROR) << "ANeuralNetworksExecution_setMeasureTiming called after the " "execution has started."; return ANEURALNETWORKS_BAD_STATE; } mMeasureTiming = measure; return ANEURALNETWORKS_NO_ERROR; } int ExecutionBuilder::getDuration(int32_t durationCode, uint64_t* duration) const { if (!mFinished) { LOG(ERROR) << "ANeuralNetworksExecution_getDuration called before the " "execution has finished."; return ANEURALNETWORKS_BAD_STATE; } // NOTE: At the HAL level, timing is in microseconds. At the NDK level, nanoseconds. const uint64_t kNanoPerMicro = 1000; if (!mMeasureTiming) { *duration = UINT64_MAX; return ANEURALNETWORKS_BAD_STATE; } uint64_t microDuration = UINT64_MAX; switch (durationCode) { case ANEURALNETWORKS_DURATION_ON_HARDWARE: microDuration = mTiming.timeOnDevice; break; case ANEURALNETWORKS_DURATION_IN_DRIVER: microDuration = mTiming.timeInDriver; break; default: CHECK(!"unexpected"); } *duration = (microDuration == UINT64_MAX) ? UINT64_MAX : kNanoPerMicro * microDuration; VLOG(EXECUTION) << "getDuration(" << durationCode << "): " << *duration; return ANEURALNETWORKS_NO_ERROR; } int ExecutionBuilder::getOutputOperandDimensions(uint32_t index, uint32_t* dimensions) { if (!mFinished) { LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions called before the " "execution has finished."; return ANEURALNETWORKS_BAD_STATE; } uint32_t count = static_cast<uint32_t>(mOutputs.size()); if (index >= count) { LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions bad index " << index << " " << count; return ANEURALNETWORKS_BAD_DATA; } const auto& dims = mOutputs[index].dimensions; if (dims.empty()) { LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandDimensions can not query " "dimensions of a scalar"; return ANEURALNETWORKS_BAD_DATA; } std::copy(dims.begin(), dims.end(), dimensions); return mOutputs[index].isSufficient ? ANEURALNETWORKS_NO_ERROR : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE; } int ExecutionBuilder::getOutputOperandRank(uint32_t index, uint32_t* rank) { if (!mFinished) { LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank called before the " "execution has finished."; return ANEURALNETWORKS_BAD_STATE; } uint32_t count = static_cast<uint32_t>(mOutputs.size()); if (index >= count) { LOG(ERROR) << "ANeuralNetworksExecution_getOutputOperandRank bad index " << index << " " << count; return ANEURALNETWORKS_BAD_DATA; } *rank = static_cast<uint32_t>(mOutputs[index].dimensions.size()); return mOutputs[index].isSufficient ? ANEURALNETWORKS_NO_ERROR : ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE; } // Attempt synchronous execution of full model on CPU. // Ensure that executionCallback->notify() is called. // TODO: How should we handle timing in this case? // For Q this is irrelevant: We only support timing in conjunction // with an explicit device list; and we do not support CPU fallback // with an explicit device list. See CompilationBuilder::mExplicitDeviceList. static void cpuFallbackFull(ExecutionBuilder* executionBuilder, const sp<ExecutionCallback>& executionCallback) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackFull"); VLOG(EXECUTION) << "cpuFallbackFull"; StepExecutor executor(executionBuilder, executionBuilder->getModel(), DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr); executor.mapInputsAndOutputsTrivially(); sp<ExecutionCallback> fallbackCallback; int n = executor.startCompute(&fallbackCallback); if (n != ANEURALNETWORKS_NO_ERROR) { executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming); return; } fallbackCallback->wait(); executionCallback->notify(fallbackCallback->getStatus(), fallbackCallback->getOutputShapes(), fallbackCallback->getTiming()); } // Attempt synchronous execution on CPU. // (1) First, attempt to execute this step on CPU. If successful, // return true. (Do not call executionCallback->notify().) // (2) If unsuccessful, attempt to execute the full model on CPU, // ensure that executionCallback->notify() is called, and return // false. // TODO: How should we handle timing in this case? // For Q this is irrelevant: We only support timing in conjunction // with an explicit device list; and we do not support CPU fallback // with an explicit device list. See CompilationBuilder::mExplicitDeviceList. static bool cpuFallbackPartial(ExecutionBuilder* executionBuilder, const ExecutionPlan* plan, std::shared_ptr<ExecutionPlan::Controller> controller, const sp<ExecutionCallback>& executionCallback, std::vector<OutputShape>* outputShapes) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "cpuFallbackPartial"); VLOG(EXECUTION) << "cpuFallbackPartial"; std::shared_ptr<StepExecutor> executor; int n = plan->fallback(controller, &executor); if (n != ANEURALNETWORKS_NO_ERROR || executor->isCpu()) { cpuFallbackFull(executionBuilder, executionCallback); return false; } sp<ExecutionCallback> fallbackCallback; if (executor->startComputeOnCpu(&fallbackCallback) != ANEURALNETWORKS_NO_ERROR) { cpuFallbackFull(executionBuilder, executionCallback); return false; } fallbackCallback->wait(); ErrorStatus status = fallbackCallback->getStatus(); const auto& stepOutputShapes = fallbackCallback->getOutputShapes(); if (!executor->updateOutputShapes(stepOutputShapes, outputShapes)) { status = ErrorStatus::GENERAL_FAILURE; } if (status != ErrorStatus::NONE) { // OUTPUT_INSUFFICIENT_SIZE is not recoverable if (status == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) { executionCallback->notify(status, *outputShapes, kNoTiming); } else { cpuFallbackFull(executionBuilder, executionCallback); } return false; } return true; } static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder, const ExecutionPlan* plan, std::shared_ptr<ExecutionPlan::Controller> controller, bool allowFallback, const sp<ExecutionCallback>& executionCallback) { VLOG(EXECUTION) << "ExecutionBuilder::compute (from plan, iteratively)"; std::vector<OutputShape> outputShapes; Timing timing = kNoTiming; executionBuilder->initializeOutputShapes(&outputShapes); while (true) { std::shared_ptr<StepExecutor> executor; VLOG(EXECUTION) << "looking for next StepExecutor"; std::shared_ptr<ExecutionBurstController> burstController = nullptr; int n = plan->next(controller, &executor, &burstController); if (n != ANEURALNETWORKS_NO_ERROR) { if (allowFallback) { cpuFallbackFull(executionBuilder, executionCallback); } else { executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming); } return; } if (executor == nullptr) { executionCallback->notify(ErrorStatus::NONE, outputShapes, timing); return; } sp<ExecutionCallback> stepCallback; n = executor->startCompute(&stepCallback, burstController); if (n != ANEURALNETWORKS_NO_ERROR) { if (allowFallback) { if (cpuFallbackPartial(executionBuilder, plan, controller, executionCallback, &outputShapes)) { // Successfully executed one step on CPU. continue; } else { // Either successfully executed entire plan on // CPU, or tried and failed to do so. return; } } else { executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming); return; } } stepCallback->wait(); ErrorStatus status = stepCallback->getStatus(); const auto& stepOutputShapes = stepCallback->getOutputShapes(); if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) { status = ErrorStatus::GENERAL_FAILURE; } if (status == ErrorStatus::NONE) { // We only support collection of timing information in the case of a // single step, so it's safe to just keep track of the last step's // timing information. timing = stepCallback->getTiming(); } else { // OUTPUT_INSUFFICIENT_SIZE is not recoverable if (allowFallback && status != ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) { if (cpuFallbackPartial(executionBuilder, plan, controller, executionCallback, &outputShapes)) { // Successfully executed one step on CPU. continue; } else { // Either successfully executed entire plan on // CPU, or tried and failed to do so. return; } } else if (status == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) { executionCallback->notify(status, outputShapes, kNoTiming); return; } else { executionCallback->notify(status, {}, kNoTiming); return; } } } } int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback, BurstBuilder* burstBuilder) { CHECK(synchronizationCallback == nullptr || burstBuilder == nullptr) << "synchronizationCallback and burstBuilder cannot simultaneously be used"; const bool synchronous = (synchronizationCallback == nullptr); if (!synchronous) { *synchronizationCallback = nullptr; } // TODO validate that we have full types for all inputs and outputs, // that the graph is not cyclic, auto name = [synchronous, burstBuilder] { return burstBuilder ? "burstCompute" : synchronous ? "compute" : "startCompute"; }; if (mStarted) { LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " called on an execution that has already started"; return ANEURALNETWORKS_BAD_STATE; } for (auto& p : mInputs) { if (p.state == ModelArgumentInfo::UNSPECIFIED) { LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all inputs specified"; return ANEURALNETWORKS_BAD_DATA; } } for (auto& p : mOutputs) { if (p.state == ModelArgumentInfo::UNSPECIFIED) { LOG(ERROR) << "ANeuralNetworksExecution_" << name() << " not all outputs specified"; return ANEURALNETWORKS_BAD_DATA; } } auto wrappedFinish = [this](ErrorStatus error, const std::vector<OutputShape>& outputShapes) { return finish(error, outputShapes); }; // TODO: For asynchronous execution, entire plan-based-path should run in an // asynchronous thread -- take the asynchronous thread logic out of // startComputeOnCpu() and use it to wrap the plan-based-path. mStarted = true; const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning); std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, burstBuilder); if (synchronous) { VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)"; sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback(); localSynchronizationCallback->setOnFinish(wrappedFinish); asyncStartComputePartitioned(this, mPlan, controller, allowFallback, localSynchronizationCallback); localSynchronizationCallback->wait(); if (mMeasureTiming) { mTiming = localSynchronizationCallback->getTiming(); } return convertErrorStatusToResultCode(localSynchronizationCallback->getStatus()); } else /* asynchronous */ { // TODO: use a thread pool // Prepare the callback for asynchronous execution. // sp<ExecutionCallback> object is returned when the // execution has been successfully launched, otherwise a // nullptr is returned. The executionCallback is // abstracted in the NN API as an "event". sp<ExecutionCallback> executionCallback = new ExecutionCallback(); executionCallback->setOnFinish(wrappedFinish); if (DeviceManager::get()->syncExecRuntime()) { VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)"; asyncStartComputePartitioned(this, mPlan, controller, allowFallback, executionCallback); } else { VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)"; std::thread thread(asyncStartComputePartitioned, this, mPlan, controller, allowFallback, executionCallback); executionCallback->bindThread(std::move(thread)); } *synchronizationCallback = executionCallback; return ANEURALNETWORKS_NO_ERROR; } } void ExecutionBuilder::initializeOutputShapes(std::vector<OutputShape>* outputShapes) const { outputShapes->resize(mOutputs.size()); for (uint32_t i = 0; i < mOutputs.size(); i++) { (*outputShapes)[i].dimensions = mOutputs[i].dimensions; (*outputShapes)[i].isSufficient = true; } } // Check if the dimensions "to" is updatable by dimensions "from", where "from" must // have a higher specification level. static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) { if (to.size() == 0) return true; NN_RET_CHECK_EQ(to.size(), from.size()); for (uint32_t i = 0; i < to.size(); i++) { NN_RET_CHECK(to[i] == from[i] || to[i] == 0); } return true; } bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) { if (outputShapes.size() == 0) { return true; } NN_RET_CHECK_EQ(outputShapes.size(), mOutputs.size()); for (uint32_t i = 0; i < outputShapes.size(); i++) { // Check if only unspecified dimensions or rank are overwritten. NN_RET_CHECK(isUpdatable(mOutputs[i].dimensions, outputShapes[i].dimensions)); } for (uint32_t i = 0; i < outputShapes.size(); i++) { mOutputs[i].dimensions = outputShapes[i].dimensions; mOutputs[i].isSufficient = outputShapes[i].isSufficient; } return true; } ErrorStatus ExecutionBuilder::finish(ErrorStatus, const std::vector<OutputShape>& outputShapes) { CHECK(!mFinished) << "ExecutionBuilder::finish is called twice"; mFinished = true; if (!updateOutputShapes(outputShapes)) { return ErrorStatus::GENERAL_FAILURE; } return ErrorStatus::NONE; } bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from, std::vector<OutputShape>* to) { if (from.size() == 0) { return true; } if (mExecutionStep != nullptr) { const auto& indexMapping = mExecutionStep->getOutputIndexSubModelToFromModel(); NN_RET_CHECK_LE(indexMapping.size(), from.size()); for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) { uint32_t toIndex = indexMapping[i]; NN_RET_CHECK_GT(to->size(), toIndex); NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions)); (*to)[toIndex] = from[i]; } } else { NN_RET_CHECK_EQ(from.size(), to->size()); for (uint32_t i = 0, e = from.size(); i < e; i++) { NN_RET_CHECK(isUpdatable(to->at(i).dimensions, from[i].dimensions)); (*to)[i] = from[i]; } } return true; } // Figures out how to place each of the input or outputs in a buffer. This just does the layout, // it does not copy data. Aligns each input a bit. int StepExecutor::allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args, Memory* memory) { uint32_t nextPoolIndex = mMemories.size(); int64_t total = 0; for (auto& info : *args) { if (info.state == ModelArgumentInfo::POINTER) { DataLocation& loc = info.locationAndLength; // TODO Good enough alignment? total += alignBytesNeeded(static_cast<uint32_t>(total), loc.length); loc.poolIndex = nextPoolIndex; loc.offset = static_cast<uint32_t>(total); total += loc.length; } }; if (total > 0xFFFFFFFF) { LOG(ERROR) << "StepExecutor::allocatePointerArgumentsToPool: ANeuralNetworksExecution: " "Size of all inputs or outputs exceeds 2^32."; return ANEURALNETWORKS_BAD_DATA; } hidl_memory hidlMemory; if (total > 0) { memory->create(total); // TODO check error mMemories.add(memory); } return ANEURALNETWORKS_NO_ERROR; } static void setRequestArgumentArray(const std::vector<ModelArgumentInfo>& argumentInfos, hidl_vec<RequestArgument>* ioInfos) { size_t count = argumentInfos.size(); ioInfos->resize(count); for (size_t i = 0; i < count; i++) { const auto& info = argumentInfos[i]; (*ioInfos)[i] = { .hasNoValue = info.state == ModelArgumentInfo::HAS_NO_VALUE, .location = info.locationAndLength, .dimensions = info.dimensions, }; } } StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model, std::shared_ptr<Device> device, std::shared_ptr<VersionedIPreparedModel> preparedModel) : mExecutionBuilder(executionBuilder), mModel(model), mDevice(device), mPreparedModel(preparedModel), mInputs(model->inputCount()), mOutputs(model->outputCount()) { CHECK(mDevice != nullptr); } void StepExecutor::mapInputsAndOutputsTrivially() { mInputs = mExecutionBuilder->mInputs; mOutputs = mExecutionBuilder->mOutputs; mMemories = mExecutionBuilder->mMemories; } void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput, ModelArgumentInfo* executorInputOrOutput) { *executorInputOrOutput = builderInputOrOutput; switch (executorInputOrOutput->state) { default: nnAssert(!"unexpected ModelArgumentInfo::state"); break; case ModelArgumentInfo::HAS_NO_VALUE: case ModelArgumentInfo::POINTER: case ModelArgumentInfo::UNSPECIFIED: break; case ModelArgumentInfo::MEMORY: { const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength.poolIndex; const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex]; const uint32_t executorPoolIndex = mMemories.add(memory); executorInputOrOutput->locationAndLength.poolIndex = executorPoolIndex; break; } } } int StepExecutor::setInputOrOutputFromTemporaryMemory(const Operand& inputOrOutputOperand, const Memory* memory, uint32_t offset, ModelArgumentInfo* inputOrOutputInfo) { // Should be similar to // ExecutionBuilder::setInputFromMemory() // ExecutionBuilder::setOutputFromMemory() uint32_t poolIndex = mMemories.add(memory); uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand); return inputOrOutputInfo->setFromTemporaryMemory(inputOrOutputOperand, poolIndex, offset, length); } static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) { for (unsigned i = 0; i < args.size(); i++) { const auto& arg = args[i]; std::string prefix = kind + std::string("[") + std::to_string(i) + "] = "; switch (arg.state) { case ModelArgumentInfo::POINTER: VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer) << ")"; break; case ModelArgumentInfo::MEMORY: VLOG(EXECUTION) << prefix << "MEMORY(" << "pool=" << arg.locationAndLength.poolIndex << ", " << "off=" << arg.locationAndLength.offset << ")"; break; case ModelArgumentInfo::HAS_NO_VALUE: VLOG(EXECUTION) << prefix << "HAS_NO_VALUE"; break; case ModelArgumentInfo::UNSPECIFIED: VLOG(EXECUTION) << prefix << "UNSPECIFIED"; break; default: VLOG(EXECUTION) << prefix << "state(" << arg.state << ")"; break; } } } bool StepExecutor::isCpu() const { return mDevice->getInterface() == nullptr; } int StepExecutor::startCompute(sp<ExecutionCallback>* synchronizationCallback, const std::shared_ptr<ExecutionBurstController>& burstController) { if (VLOG_IS_ON(EXECUTION)) { logArguments("input", mInputs); logArguments("output", mOutputs); } if (isCpu()) { return startComputeOnCpu(synchronizationCallback); } else { return startComputeOnDevice(synchronizationCallback, burstController); } } int StepExecutor::startComputeOnDevice( sp<ExecutionCallback>* synchronizationCallback, const std::shared_ptr<ExecutionBurstController>& burstController) { CHECK(!isCpu()); // Initialize timing information in case we take an error path to exit. mExecutionBuilder->reportTiming(kNoTiming); *synchronizationCallback = nullptr; // TODO: Remove the mPreparedModel == nullptr case once we've fully integrated // ExecutionPlan with the compilation and execution phases of the NN API if (mPreparedModel == nullptr) { Model model; mModel->setHidlModel(&model); // TODO(butlermichael): Propagate user preference to this point instead of // using default value of ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER, or // remove this entire block of code since it is a stale path that is only // encountered on an #if-removed code. ExecutionPreference preference = static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER); ErrorStatus status = ErrorStatus::GENERAL_FAILURE; std::tie(status, mPreparedModel) = mDevice->getInterface()->prepareModel(model, preference, {}, {}, {}); if (status != ErrorStatus::NONE) { return convertErrorStatusToResultCode(status); } if (mPreparedModel == nullptr) { return ANEURALNETWORKS_OP_FAILED; } } NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "StepExecutor::startComputeOnDevice"); // We separate the input & output pools so that we reduce the copying done if we // do an eventual remoting (hidl_memory->update()). We could also use it to set // protection on read only memory but that's not currently done. Memory inputPointerArguments; Memory outputPointerArguments; // Layout the input and output data int n = allocatePointerArgumentsToPool(&mInputs, &inputPointerArguments); if (n != ANEURALNETWORKS_NO_ERROR) { return n; } n = allocatePointerArgumentsToPool(&mOutputs, &outputPointerArguments); if (n != ANEURALNETWORKS_NO_ERROR) { return n; } // Copy the input data that was specified via a pointer. // inputPointerArguments.update(); for (auto& info : mInputs) { if (info.state == ModelArgumentInfo::POINTER) { DataLocation& loc = info.locationAndLength; uint8_t* data = nullptr; int n = inputPointerArguments.getPointer(&data); if (n != ANEURALNETWORKS_NO_ERROR) { return n; } memcpy(data + loc.offset, info.buffer, loc.length); } } // TODO: Add inputPointerArguments.commit() and .update() at all the right places Request request; setRequestArgumentArray(mInputs, &request.inputs); setRequestArgumentArray(mOutputs, &request.outputs); uint32_t count = mMemories.size(); request.pools.resize(count); for (uint32_t i = 0; i < count; i++) { request.pools[i] = mMemories[i]->getHidlMemory(); } NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION, "StepExecutor::startComputeOnDevice::execute"); // Prepare the callback for asynchronous execution. sp<ExecutionCallback> // object is returned when the execution has been successfully launched, // otherwise a nullptr is returned. The executionCallback is abstracted in // the NN API as an "event". // // The sp is used for ref-counting purposes. Without it, the HIDL service // could attempt to communicate with a dead callback object. // // TODO: Explain the "dead callback" problem further, either here or // in the design document. sp<ExecutionCallback> executionCallback = new ExecutionCallback(); // compute using burst if present const bool burstCompute = (burstController != nullptr); bool burstFallback = false; if (burstCompute) { std::vector<intptr_t> memoryIds; memoryIds.reserve(mMemories.size()); for (const Memory* memory : mMemories) { memory->usedBy(burstController); memoryIds.push_back(memory->getKey()); } VLOG(EXECUTION) << "Before ExecutionBurstController->tryCompute() " << SHOW_IF_DEBUG(toString(request)); auto [status, outputShapes, timing, fallback] = burstController->tryCompute(request, measureTiming(mExecutionBuilder), memoryIds); burstFallback = fallback; if (!fallback) { executionCallback->notify(status, outputShapes, timing); } } // compute from IPreparedModel if either: // (1) burst was not supplied, or // (2) the burst execution failed and requested a fallback execution if (!burstCompute || burstFallback) { if (DeviceManager::get()->syncExecHal()) { VLOG(EXECUTION) << "Before mPreparedModel->executeSynchronously() " << SHOW_IF_DEBUG(toString(request)); auto syncExecuteResult = mPreparedModel->executeSynchronously(request, measureTiming(mExecutionBuilder)); executionCallback->notify(std::get<0>(syncExecuteResult), std::get<1>(syncExecuteResult), std::get<2>(syncExecuteResult)); } else { VLOG(EXECUTION) << "Before mPreparedModel->execute() " << SHOW_IF_DEBUG(toString(request)); // Execute. // TODO: What happens to the Callback if the service dies abnormally // -- won't that keep the Callback live forever, because the service // never has the opportunity to bump the reference count down? Or // maybe the HIDL infrastructure handles this magically? At worst, // it seems like this is a small memory leak, if the Callback stays // alive forever. Return<ErrorStatus> executeStatus = mPreparedModel->execute( request, measureTiming(mExecutionBuilder), executionCallback); if (!executeStatus.isOk() || executeStatus != ErrorStatus::NONE) { VLOG(EXECUTION) << "**Execute launch failed**"; return executeStatus.isOk() ? convertErrorStatusToResultCode(executeStatus) : ANEURALNETWORKS_OP_FAILED; } } } // TODO: Remove this synchronization point when the block of code below is // removed. executionCallback->wait(); NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION, "StepExecutor::startComputeOnDevice::waited"); Return<ErrorStatus> callbackStatus = executionCallback->getStatus(); if (!callbackStatus.isOk() || callbackStatus != ErrorStatus::NONE) { VLOG(EXECUTION) << "**Execution failed**"; if (callbackStatus == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) { *synchronizationCallback = executionCallback; return ANEURALNETWORKS_NO_ERROR; } return callbackStatus.isOk() ? convertErrorStatusToResultCode(callbackStatus) : ANEURALNETWORKS_OP_FAILED; } mExecutionBuilder->reportTiming(executionCallback->getTiming()); // Copy the output data from shared memory to the output buffers. // TODO: Move this block of code somewhere else. It should not be in the // startCompute function. // TODO: outputMemory->update(); outputMemory->commit() NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "StepExecutor::startComputeOnDevice"); for (auto& info : mOutputs) { if (info.state == ModelArgumentInfo::POINTER) { DataLocation& loc = info.locationAndLength; uint8_t* data = nullptr; int n = outputPointerArguments.getPointer(&data); if (n != ANEURALNETWORKS_NO_ERROR) { return n; } memcpy(info.buffer, data + loc.offset, loc.length); } } VLOG(EXECUTION) << "StepExecutor::startComputeOnDevice completed"; *synchronizationCallback = executionCallback; return ANEURALNETWORKS_NO_ERROR; } static void computeOnCpu(const Model& model, const Request& request, const std::vector<RunTimePoolInfo>& modelPoolInfos, const std::vector<RunTimePoolInfo>& requestPoolInfos, const sp<IExecutionCallback>& executionCallback) { NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu"); CpuExecutor executor; int err = executor.run(model, request, modelPoolInfos, requestPoolInfos); const auto& outputShapes = executor.getOutputShapes(); executionCallback->notify_1_2(convertResultCodeToErrorStatus(err), outputShapes, kNoTiming); } int StepExecutor::startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback) { // TODO: use a thread pool // TODO(mikie): this could have NNTRACE so we could measure the overhead of // spinning up a new thread. Model model; mModel->setHidlModel(&model); // Prepare the callback for asynchronous execution. sp<ExecutionCallback> // object is returned when the execution has been successfully launched, // otherwise a nullptr is returned. The executionCallback is abstracted in // the NN API as an "event". sp<ExecutionCallback> executionCallback = new ExecutionCallback(); *synchronizationCallback = nullptr; std::vector<RunTimePoolInfo> modelPoolInfos; if (!setRunTimePoolInfosFromHidlMemories(&modelPoolInfos, model.pools)) { return ANEURALNETWORKS_UNMAPPABLE; } std::vector<RunTimePoolInfo> requestPoolInfos; requestPoolInfos.reserve(mMemories.size()); for (const Memory* mem : mMemories) { if (std::optional<RunTimePoolInfo> poolInfo = RunTimePoolInfo::createFromHidlMemory(mem->getHidlMemory())) { requestPoolInfos.emplace_back(*poolInfo); } else { return ANEURALNETWORKS_UNMAPPABLE; } } // Create as many pools as there are input / output. auto fixPointerArguments = [&requestPoolInfos](std::vector<ModelArgumentInfo>& argumentInfos) { for (ModelArgumentInfo& argumentInfo : argumentInfos) { if (argumentInfo.state == ModelArgumentInfo::POINTER) { argumentInfo.locationAndLength.poolIndex = static_cast<uint32_t>(requestPoolInfos.size()); argumentInfo.locationAndLength.offset = 0; requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer( static_cast<uint8_t*>(argumentInfo.buffer))); } } }; fixPointerArguments(mInputs); fixPointerArguments(mOutputs); Request request; setRequestArgumentArray(mInputs, &request.inputs); setRequestArgumentArray(mOutputs, &request.outputs); if (DeviceManager::get()->syncExecCpu()) { computeOnCpu(model, request, modelPoolInfos, requestPoolInfos, executionCallback); } else { // TODO: should model be moved with a std::cref? std::thread thread(computeOnCpu, model, std::move(request), std::move(modelPoolInfos), std::move(requestPoolInfos), executionCallback); executionCallback->bindThread(std::move(thread)); } *synchronizationCallback = executionCallback; return ANEURALNETWORKS_NO_ERROR; } } // namespace nn } // namespace android