// Copyright 2015 Google Inc. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // multi_thread_common.h: Multithreading code shared by different meta gemm // versions. #ifndef GEMMLOWP_META_MULTI_THREAD_COMMON_H_ #define GEMMLOWP_META_MULTI_THREAD_COMMON_H_ #include "../internal/multi_thread_gemm.h" namespace gemmlowp { namespace meta { namespace internal { const std::int32_t kMinTaskSize = 10000; const std::int32_t kMinTaskDimension = 6; struct TaskRect { std::int32_t m_offset; std::int32_t m; std::int32_t n_offset; std::int32_t n; TaskRect(std::int32_t m_offset, std::int32_t m, std::int32_t n_offset, std::int32_t n) : m_offset(m_offset), m(m), n_offset(n_offset), n(n) {} }; template <typename IN_TYPE, typename OUT_TYPE, typename F> struct MetaTask : gemmlowp::Task { std::uint8_t* scratch; const IN_TYPE* lhs; const IN_TYPE* rhs; TaskRect task_rect; std::int32_t k; OUT_TYPE* result; std::int32_t result_stride; const F& operation; MetaTask(std::uint8_t* scratch, const IN_TYPE* lhs, const IN_TYPE* rhs, const TaskRect& task_rect, std::int32_t k, OUT_TYPE* result, std::int32_t result_stride, const F& operation) : scratch(scratch), lhs(lhs), rhs(rhs), task_rect(task_rect), k(k), result(result), result_stride(result_stride), operation(operation) {} void Run() const override { const IN_TYPE* task_lhs = lhs + task_rect.m_offset * k; const IN_TYPE* task_rhs = rhs + task_rect.n_offset * k; OUT_TYPE* task_result = result + task_rect.m_offset * result_stride + task_rect.n_offset; operation.ExecuteMatrixMatrix(scratch, task_lhs, task_rhs, task_rect.m, task_rect.n, k, task_result, result_stride); } }; std::int32_t ResolveMaxThreads(std::int32_t max_threads) { if (max_threads == 0) { static const int hardware_threads_count = static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); return hardware_threads_count; } return max_threads; } void PrepareTasks(std::int32_t max_tasks, std::int32_t m, std::int32_t n, std::int32_t k, std::vector<internal::TaskRect>* tasks) { const std::int32_t max_tasks_by_size = (m * n * k) / kMinTaskSize; const std::int32_t max_tasks_m = m / kMinTaskDimension; const std::int32_t max_tasks_n = n / kMinTaskDimension; const std::int32_t max_tasks_dimension = std::max(max_tasks_m, max_tasks_n); std::int32_t real_tasks = std::max( 1, std::min(max_tasks, std::min(max_tasks_by_size, max_tasks_dimension))); if (real_tasks == 1) { tasks->push_back(TaskRect(0, m, 0, n)); return; } if (max_tasks_m > max_tasks_n) { const std::int32_t m_chunk = m / real_tasks; for (int i = 0; i < real_tasks - 1; ++i) { tasks->push_back(TaskRect(i * m_chunk, m_chunk, 0, n)); } const std::int32_t last_m_offset = (real_tasks - 1) * m_chunk; tasks->push_back(TaskRect(last_m_offset, m - last_m_offset, 0, n)); } else { const std::int32_t n_chunk = n / real_tasks; for (int i = 0; i < real_tasks - 1; ++i) { tasks->push_back(TaskRect(0, m, i * n_chunk, n_chunk)); } const std::int32_t last_n_offset = (real_tasks - 1) * n_chunk; tasks->push_back(TaskRect(0, m, last_n_offset, n - last_n_offset)); } } template <typename IN_TYPE, typename OUT_TYPE, typename F> void MultiThreadedMatrixMatrix(gemmlowp::WorkersPool* pool, std::int32_t max_threads, std::uint8_t* scratch, const IN_TYPE* lhs, const IN_TYPE* rhs, std::int32_t m, std::int32_t n, std::int32_t k, OUT_TYPE* result, std::int32_t result_stride, const F& operation) { max_threads = internal::ResolveMaxThreads(max_threads); if (max_threads > 1) { pool->CreateWorkers(max_threads - 1); } std::vector<internal::TaskRect> task_rects; internal::PrepareTasks(max_threads, m, n, k, &task_rects); if (task_rects.size() == 1) { operation.ExecuteMatrixMatrix(scratch, lhs, rhs, m, n, k, result, result_stride); return; } std::uint8_t* task_scratch = scratch; std::int32_t scratch_per_thread = operation.ScratchPerThread(m, n, k); std::int32_t worker_tasks = task_rects.size() - 1; pool->counter_to_decrement_when_ready().Reset(worker_tasks); for (std::int32_t i = 0; i < worker_tasks; ++i) { auto task = new internal::MetaTask<IN_TYPE, OUT_TYPE, F>( task_scratch, lhs, rhs, task_rects[i], k, result, result_stride, operation); pool->StartWorker(i, task); task_scratch += scratch_per_thread; } { internal::MetaTask<IN_TYPE, OUT_TYPE, F> master_task( task_scratch, lhs, rhs, task_rects.back(), k, result, result_stride, operation); master_task.Run(); } pool->counter_to_decrement_when_ready().Wait(); } } // namespace internal } // namespace meta } // namespace gemmlowp #endif // GEMMLOWP_META_MULTI_THREAD_COMMON_H_