/*
 * Copyright (C) 2013 Christoph Bumiller
 * Copyright (C) 2015 Samuel Pitoiset
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * Performance monitoring counters interface to gallium.
 */

#include "st_debug.h"
#include "st_context.h"
#include "st_cb_bitmap.h"
#include "st_cb_perfmon.h"

#include "util/bitset.h"

#include "pipe/p_context.h"
#include "pipe/p_screen.h"
#include "util/u_memory.h"

static bool
init_perf_monitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
   struct st_context *st = st_context(ctx);
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st->pipe;
   unsigned *batch = NULL;
   unsigned num_active_counters = 0;
   unsigned max_batch_counters = 0;
   unsigned num_batch_counters = 0;
   int gid, cid;

   st_flush_bitmap_cache(st);

   /* Determine the number of active counters. */
   for (gid = 0; gid < ctx->PerfMonitor.NumGroups; gid++) {
      const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[gid];
      const struct st_perf_monitor_group *stg = &st->perfmon[gid];

      if (m->ActiveGroups[gid] > g->MaxActiveCounters) {
         /* Maximum number of counters reached. Cannot start the session. */
         if (ST_DEBUG & DEBUG_MESA) {
            debug_printf("Maximum number of counters reached. "
                         "Cannot start the session!\n");
         }
         return false;
      }

      num_active_counters += m->ActiveGroups[gid];
      if (stg->has_batch)
         max_batch_counters += m->ActiveGroups[gid];
   }

   if (!num_active_counters)
      return true;

   stm->active_counters = CALLOC(num_active_counters,
                                 sizeof(*stm->active_counters));
   if (!stm->active_counters)
      return false;

   if (max_batch_counters) {
      batch = CALLOC(max_batch_counters, sizeof(*batch));
      if (!batch)
         return false;
   }

   /* Create a query for each active counter. */
   for (gid = 0; gid < ctx->PerfMonitor.NumGroups; gid++) {
      const struct gl_perf_monitor_group *g = &ctx->PerfMonitor.Groups[gid];
      const struct st_perf_monitor_group *stg = &st->perfmon[gid];
      BITSET_WORD tmp;

      BITSET_FOREACH_SET(cid, tmp, m->ActiveCounters[gid], g->NumCounters) {
         const struct st_perf_monitor_counter *stc = &stg->counters[cid];
         struct st_perf_counter_object *cntr =
            &stm->active_counters[stm->num_active_counters];

         cntr->id       = cid;
         cntr->group_id = gid;
         if (stc->flags & PIPE_DRIVER_QUERY_FLAG_BATCH) {
            cntr->batch_index = num_batch_counters;
            batch[num_batch_counters++] = stc->query_type;
         } else {
            cntr->query = pipe->create_query(pipe, stc->query_type, 0);
            if (!cntr->query)
               goto fail;
         }
         ++stm->num_active_counters;
      }
   }

   /* Create the batch query. */
   if (num_batch_counters) {
      stm->batch_query = pipe->create_batch_query(pipe, num_batch_counters,
                                                  batch);
      stm->batch_result = CALLOC(num_batch_counters, sizeof(stm->batch_result->batch[0]));
      if (!stm->batch_query || !stm->batch_result)
         goto fail;
   }

   FREE(batch);
   return true;

fail:
   FREE(batch);
   return false;
}

static void
reset_perf_monitor(struct st_perf_monitor_object *stm,
                   struct pipe_context *pipe)
{
   unsigned i;

   for (i = 0; i < stm->num_active_counters; ++i) {
      struct pipe_query *query = stm->active_counters[i].query;
      if (query)
         pipe->destroy_query(pipe, query);
   }
   FREE(stm->active_counters);
   stm->active_counters = NULL;
   stm->num_active_counters = 0;

   if (stm->batch_query) {
      pipe->destroy_query(pipe, stm->batch_query);
      stm->batch_query = NULL;
   }
   FREE(stm->batch_result);
   stm->batch_result = NULL;
}

static struct gl_perf_monitor_object *
st_NewPerfMonitor(struct gl_context *ctx)
{
   struct st_perf_monitor_object *stq = ST_CALLOC_STRUCT(st_perf_monitor_object);
   if (stq)
      return &stq->base;
   return NULL;
}

static void
st_DeletePerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st_context(ctx)->pipe;

   reset_perf_monitor(stm, pipe);
   FREE(stm);
}

static GLboolean
st_BeginPerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st_context(ctx)->pipe;
   unsigned i;

   if (!stm->num_active_counters) {
      /* Create a query for each active counter before starting
       * a new monitoring session. */
      if (!init_perf_monitor(ctx, m))
         goto fail;
   }

   /* Start the query for each active counter. */
   for (i = 0; i < stm->num_active_counters; ++i) {
      struct pipe_query *query = stm->active_counters[i].query;
      if (query && !pipe->begin_query(pipe, query))
          goto fail;
   }

   if (stm->batch_query && !pipe->begin_query(pipe, stm->batch_query))
      goto fail;

   return true;

fail:
   /* Failed to start the monitoring session. */
   reset_perf_monitor(stm, pipe);
   return false;
}

static void
st_EndPerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st_context(ctx)->pipe;
   unsigned i;

   /* Stop the query for each active counter. */
   for (i = 0; i < stm->num_active_counters; ++i) {
      struct pipe_query *query = stm->active_counters[i].query;
      if (query)
         pipe->end_query(pipe, query);
   }

   if (stm->batch_query)
      pipe->end_query(pipe, stm->batch_query);
}

static void
st_ResetPerfMonitor(struct gl_context *ctx, struct gl_perf_monitor_object *m)
{
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st_context(ctx)->pipe;

   if (!m->Ended)
      st_EndPerfMonitor(ctx, m);

   reset_perf_monitor(stm, pipe);

   if (m->Active)
      st_BeginPerfMonitor(ctx, m);
}

static GLboolean
st_IsPerfMonitorResultAvailable(struct gl_context *ctx,
                                struct gl_perf_monitor_object *m)
{
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st_context(ctx)->pipe;
   unsigned i;

   if (!stm->num_active_counters)
      return false;

   /* The result of a monitoring session is only available if the query of
    * each active counter is idle. */
   for (i = 0; i < stm->num_active_counters; ++i) {
      struct pipe_query *query = stm->active_counters[i].query;
      union pipe_query_result result;
      if (query && !pipe->get_query_result(pipe, query, FALSE, &result)) {
         /* The query is busy. */
         return false;
      }
   }

   if (stm->batch_query &&
       !pipe->get_query_result(pipe, stm->batch_query, FALSE, stm->batch_result))
      return false;

   return true;
}

static void
st_GetPerfMonitorResult(struct gl_context *ctx,
                        struct gl_perf_monitor_object *m,
                        GLsizei dataSize,
                        GLuint *data,
                        GLint *bytesWritten)
{
   struct st_perf_monitor_object *stm = st_perf_monitor_object(m);
   struct pipe_context *pipe = st_context(ctx)->pipe;
   unsigned i;

   /* Copy data to the supplied array (data).
    *
    * The output data format is: <group ID, counter ID, value> for each
    * active counter. The API allows counters to appear in any order.
    */
   GLsizei offset = 0;
   bool have_batch_query = false;

   if (stm->batch_query)
      have_batch_query = pipe->get_query_result(pipe, stm->batch_query, TRUE,
                                                stm->batch_result);

   /* Read query results for each active counter. */
   for (i = 0; i < stm->num_active_counters; ++i) {
      struct st_perf_counter_object *cntr = &stm->active_counters[i];
      union pipe_query_result result = { 0 };
      int gid, cid;
      GLenum type;

      cid  = cntr->id;
      gid  = cntr->group_id;
      type = ctx->PerfMonitor.Groups[gid].Counters[cid].Type;

      if (cntr->query) {
         if (!pipe->get_query_result(pipe, cntr->query, TRUE, &result))
            continue;
      } else {
         if (!have_batch_query)
            continue;
         result.batch[0] = stm->batch_result->batch[cntr->batch_index];
      }

      data[offset++] = gid;
      data[offset++] = cid;
      switch (type) {
      case GL_UNSIGNED_INT64_AMD:
         memcpy(&data[offset], &result.u64, sizeof(uint64_t));
         offset += sizeof(uint64_t) / sizeof(GLuint);
         break;
      case GL_UNSIGNED_INT:
         memcpy(&data[offset], &result.u32, sizeof(uint32_t));
         offset += sizeof(uint32_t) / sizeof(GLuint);
         break;
      case GL_FLOAT:
      case GL_PERCENTAGE_AMD:
         memcpy(&data[offset], &result.f, sizeof(GLfloat));
         offset += sizeof(GLfloat) / sizeof(GLuint);
         break;
      }
   }

   if (bytesWritten)
      *bytesWritten = offset * sizeof(GLuint);
}


bool
st_have_perfmon(struct st_context *st)
{
   struct pipe_screen *screen = st->pipe->screen;

   if (!screen->get_driver_query_info || !screen->get_driver_query_group_info)
      return false;

   return screen->get_driver_query_group_info(screen, 0, NULL) != 0;
}

static void
st_InitPerfMonitorGroups(struct gl_context *ctx)
{
   struct st_context *st = st_context(ctx);
   struct gl_perf_monitor_state *perfmon = &st->ctx->PerfMonitor;
   struct pipe_screen *screen = st->pipe->screen;
   struct gl_perf_monitor_group *groups = NULL;
   struct st_perf_monitor_group *stgroups = NULL;
   int num_counters, num_groups;
   int gid, cid;

   /* Get the number of available queries. */
   num_counters = screen->get_driver_query_info(screen, 0, NULL);

   /* Get the number of available groups. */
   num_groups = screen->get_driver_query_group_info(screen, 0, NULL);
   groups = CALLOC(num_groups, sizeof(*groups));
   if (!groups)
      return;

   stgroups = CALLOC(num_groups, sizeof(*stgroups));
   if (!stgroups)
      goto fail_only_groups;

   for (gid = 0; gid < num_groups; gid++) {
      struct gl_perf_monitor_group *g = &groups[perfmon->NumGroups];
      struct st_perf_monitor_group *stg = &stgroups[perfmon->NumGroups];
      struct pipe_driver_query_group_info group_info;
      struct gl_perf_monitor_counter *counters = NULL;
      struct st_perf_monitor_counter *stcounters = NULL;

      if (!screen->get_driver_query_group_info(screen, gid, &group_info))
         continue;

      g->Name = group_info.name;
      g->MaxActiveCounters = group_info.max_active_queries;

      if (group_info.num_queries)
         counters = CALLOC(group_info.num_queries, sizeof(*counters));
      if (!counters)
         goto fail;
      g->Counters = counters;

      stcounters = CALLOC(group_info.num_queries, sizeof(*stcounters));
      if (!stcounters)
         goto fail;
      stg->counters = stcounters;

      for (cid = 0; cid < num_counters; cid++) {
         struct gl_perf_monitor_counter *c = &counters[g->NumCounters];
         struct st_perf_monitor_counter *stc = &stcounters[g->NumCounters];
         struct pipe_driver_query_info info;

         if (!screen->get_driver_query_info(screen, cid, &info))
            continue;
         if (info.group_id != gid)
            continue;

         c->Name = info.name;
         switch (info.type) {
            case PIPE_DRIVER_QUERY_TYPE_UINT64:
            case PIPE_DRIVER_QUERY_TYPE_BYTES:
            case PIPE_DRIVER_QUERY_TYPE_MICROSECONDS:
            case PIPE_DRIVER_QUERY_TYPE_HZ:
               c->Minimum.u64 = 0;
               c->Maximum.u64 = info.max_value.u64 ? info.max_value.u64 : -1;
               c->Type = GL_UNSIGNED_INT64_AMD;
               break;
            case PIPE_DRIVER_QUERY_TYPE_UINT:
               c->Minimum.u32 = 0;
               c->Maximum.u32 = info.max_value.u32 ? info.max_value.u32 : -1;
               c->Type = GL_UNSIGNED_INT;
               break;
            case PIPE_DRIVER_QUERY_TYPE_FLOAT:
               c->Minimum.f = 0.0;
               c->Maximum.f = info.max_value.f ? info.max_value.f : -1;
               c->Type = GL_FLOAT;
               break;
            case PIPE_DRIVER_QUERY_TYPE_PERCENTAGE:
               c->Minimum.f = 0.0f;
               c->Maximum.f = 100.0f;
               c->Type = GL_PERCENTAGE_AMD;
               break;
            default:
               unreachable("Invalid driver query type!");
         }

         stc->query_type = info.query_type;
         stc->flags = info.flags;
         if (stc->flags & PIPE_DRIVER_QUERY_FLAG_BATCH)
            stg->has_batch = true;

         g->NumCounters++;
      }
      perfmon->NumGroups++;
   }
   perfmon->Groups = groups;
   st->perfmon = stgroups;

   return;

fail:
   for (gid = 0; gid < num_groups; gid++) {
      FREE(stgroups[gid].counters);
      FREE((void *)groups[gid].Counters);
   }
   FREE(stgroups);
fail_only_groups:
   FREE(groups);
}

void
st_destroy_perfmon(struct st_context *st)
{
   struct gl_perf_monitor_state *perfmon = &st->ctx->PerfMonitor;
   int gid;

   for (gid = 0; gid < perfmon->NumGroups; gid++) {
      FREE(st->perfmon[gid].counters);
      FREE((void *)perfmon->Groups[gid].Counters);
   }
   FREE(st->perfmon);
   FREE((void *)perfmon->Groups);
}

void st_init_perfmon_functions(struct dd_function_table *functions)
{
   functions->InitPerfMonitorGroups = st_InitPerfMonitorGroups;
   functions->NewPerfMonitor = st_NewPerfMonitor;
   functions->DeletePerfMonitor = st_DeletePerfMonitor;
   functions->BeginPerfMonitor = st_BeginPerfMonitor;
   functions->EndPerfMonitor = st_EndPerfMonitor;
   functions->ResetPerfMonitor = st_ResetPerfMonitor;
   functions->IsPerfMonitorResultAvailable = st_IsPerfMonitorResultAvailable;
   functions->GetPerfMonitorResult = st_GetPerfMonitorResult;
}