// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// multi_thread_common.h: Multithreading code shared by different meta gemm
// versions.

#ifndef GEMMLOWP_META_MULTI_THREAD_COMMON_H_
#define GEMMLOWP_META_MULTI_THREAD_COMMON_H_

#include "../internal/multi_thread_gemm.h"

namespace gemmlowp {
namespace meta {
namespace internal {

const std::int32_t kMinTaskSize = 10000;
const std::int32_t kMinTaskDimension = 6;

struct TaskRect {
  std::int32_t m_offset;
  std::int32_t m;
  std::int32_t n_offset;
  std::int32_t n;

  TaskRect(std::int32_t m_offset, std::int32_t m, std::int32_t n_offset,
           std::int32_t n)
      : m_offset(m_offset), m(m), n_offset(n_offset), n(n) {}
};

template <typename IN_TYPE, typename OUT_TYPE, typename F>
struct MetaTask : gemmlowp::Task {
  std::uint8_t* scratch;
  const IN_TYPE* lhs;
  const IN_TYPE* rhs;
  TaskRect task_rect;
  std::int32_t k;
  OUT_TYPE* result;
  std::int32_t result_stride;
  const F& operation;

  MetaTask(std::uint8_t* scratch, const IN_TYPE* lhs, const IN_TYPE* rhs,
           const TaskRect& task_rect, std::int32_t k, OUT_TYPE* result,
           std::int32_t result_stride, const F& operation)
      : scratch(scratch),
        lhs(lhs),
        rhs(rhs),
        task_rect(task_rect),
        k(k),
        result(result),
        result_stride(result_stride),
        operation(operation) {}

  void Run() const override {
    const IN_TYPE* task_lhs = lhs + task_rect.m_offset * k;
    const IN_TYPE* task_rhs = rhs + task_rect.n_offset * k;
    OUT_TYPE* task_result =
        result + task_rect.m_offset * result_stride + task_rect.n_offset;
    operation.ExecuteMatrixMatrix(scratch, task_lhs, task_rhs, task_rect.m,
                                  task_rect.n, k, task_result, result_stride);
  }
};

std::int32_t ResolveMaxThreads(std::int32_t max_threads) {
  if (max_threads == 0) {
    static const int hardware_threads_count =
        static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));
    return hardware_threads_count;
  }
  return max_threads;
}

void PrepareTasks(std::int32_t max_tasks, std::int32_t m, std::int32_t n,
                  std::int32_t k, std::vector<internal::TaskRect>* tasks) {
  const std::int32_t max_tasks_by_size = (m * n * k) / kMinTaskSize;
  const std::int32_t max_tasks_m = m / kMinTaskDimension;
  const std::int32_t max_tasks_n = n / kMinTaskDimension;
  const std::int32_t max_tasks_dimension = std::max(max_tasks_m, max_tasks_n);

  std::int32_t real_tasks = std::max(
      1, std::min(max_tasks, std::min(max_tasks_by_size, max_tasks_dimension)));

  if (real_tasks == 1) {
    tasks->push_back(TaskRect(0, m, 0, n));
    return;
  }

  if (max_tasks_m > max_tasks_n) {
    const std::int32_t m_chunk = m / real_tasks;
    for (int i = 0; i < real_tasks - 1; ++i) {
      tasks->push_back(TaskRect(i * m_chunk, m_chunk, 0, n));
    }
    const std::int32_t last_m_offset = (real_tasks - 1) * m_chunk;
    tasks->push_back(TaskRect(last_m_offset, m - last_m_offset, 0, n));
  } else {
    const std::int32_t n_chunk = n / real_tasks;
    for (int i = 0; i < real_tasks - 1; ++i) {
      tasks->push_back(TaskRect(0, m, i * n_chunk, n_chunk));
    }
    const std::int32_t last_n_offset = (real_tasks - 1) * n_chunk;
    tasks->push_back(TaskRect(0, m, last_n_offset, n - last_n_offset));
  }
}

template <typename IN_TYPE, typename OUT_TYPE, typename F>
void MultiThreadedMatrixMatrix(gemmlowp::WorkersPool* pool,
                               std::int32_t max_threads, std::uint8_t* scratch,
                               const IN_TYPE* lhs, const IN_TYPE* rhs,
                               std::int32_t m, std::int32_t n, std::int32_t k,
                               OUT_TYPE* result, std::int32_t result_stride,
                               const F& operation) {
  max_threads = internal::ResolveMaxThreads(max_threads);
  if (max_threads > 1) {
    pool->CreateWorkers(max_threads - 1);
  }

  std::vector<internal::TaskRect> task_rects;
  internal::PrepareTasks(max_threads, m, n, k, &task_rects);

  if (task_rects.size() == 1) {
    operation.ExecuteMatrixMatrix(scratch, lhs, rhs, m, n, k, result,
                                  result_stride);
    return;
  }

  std::uint8_t* task_scratch = scratch;
  std::int32_t scratch_per_thread = operation.ScratchPerThread(m, n, k);
  std::int32_t worker_tasks = task_rects.size() - 1;
  pool->counter_to_decrement_when_ready().Reset(worker_tasks);

  for (std::int32_t i = 0; i < worker_tasks; ++i) {
    auto task = new internal::MetaTask<IN_TYPE, OUT_TYPE, F>(
        task_scratch, lhs, rhs, task_rects[i], k, result, result_stride,
        operation);
    pool->StartWorker(i, task);
    task_scratch += scratch_per_thread;
  }

  {
    internal::MetaTask<IN_TYPE, OUT_TYPE, F> master_task(
        task_scratch, lhs, rhs, task_rects.back(), k, result, result_stride,
        operation);
    master_task.Run();
  }

  pool->counter_to_decrement_when_ready().Wait();
}

}  // namespace internal
}  // namespace meta
}  // namespace gemmlowp

#endif  // GEMMLOWP_META_MULTI_THREAD_COMMON_H_