/**************************************************************************
 * 
 * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
 * All Rights Reserved.
 * 
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 * 
 **************************************************************************/

 /*
  * Authors:
  *   Keith Whitwell <keith@tungstengraphics.com>
  */

#include "util/u_memory.h"
#include "util/u_math.h"
#include "draw/draw_context.h"
#include "draw/draw_private.h"
#include "draw/draw_vbuf.h"
#include "draw/draw_vertex.h"
#include "draw/draw_vs.h"
#include "translate/translate.h"

/* A first pass at incorporating vertex fetch/emit functionality into
 * the vertex shader: a "generic" variant that brackets the shader's
 * run_linear() with a fetch translate before it and an emit translate
 * after it.
 */
struct draw_vs_variant_generic {
   /* Must remain the first member: the callbacks in this file cast the
    * draw_vs_variant pointer they are given back to this struct.
    */
   struct draw_vs_variant base;

   /* Never assigned or read in the code visible in this file. */
   struct draw_vertex_shader *shader;
   /* Copied from vs->draw at creation; the run functions read
    * instance_id and rasterizer->point_size through it.
    */
   struct draw_context *draw;
   
   /* Basic plan is to run these two translate functions before/after
    * the vertex shader's existing run_linear() routine to simulate
    * the inclusion of this functionality into the shader...
    *
    * Next will look at actually including it.
    */
   struct translate *fetch;
   struct translate *emit;

   /* Size in bytes of one vertex in the temporary buffer shared by the
    * fetch, shade and emit steps: MAX2(nr_inputs, num_outputs) slots of
    * four floats each.
    */
   unsigned temp_vertex_stride;
};





static void vsvg_set_buffer( struct draw_vs_variant *variant,
                             unsigned buffer,
                             const void *ptr,
                             unsigned stride,
                             unsigned max_index )
{
   struct draw_vs_variant_generic *vsvg = (struct draw_vs_variant_generic *)variant;

   vsvg->fetch->set_buffer(vsvg->fetch, 
                           buffer, 
                           ptr, 
                           stride,
                           max_index );
}


/* Mainly for debug at this stage.
 *
 * For each of the 'count' vertices in output_buffer, divide the
 * position's x/y/z by w, apply the viewport scale and translate, and
 * store 1/w back into the w slot.  Vertices are temp_vertex_stride
 * bytes apart and the position lives in slot position_output.
 */
static void do_rhw_viewport( struct draw_vs_variant_generic *vsvg,
                             unsigned count,
                             void *output_buffer )
{
   const float *vp_scale = vsvg->base.vs->draw->viewport.scale;
   const float *vp_trans = vsvg->base.vs->draw->viewport.translate;
   const unsigned vertex_size = vsvg->temp_vertex_stride;
   char *cursor = (char *)output_buffer;
   unsigned remaining;

   /* Skip to the position attribute inside the first vertex. */
   cursor += vsvg->base.vs->position_output * 4 * sizeof(float);

   for (remaining = count; remaining != 0; remaining--) {
      float *pos = (float *)cursor;
      const float rhw = 1.0f / pos[3];
      unsigned c;

      for (c = 0; c < 3; c++)
         pos[c] = pos[c] * rhw * vp_scale[c] + vp_trans[c];

      pos[3] = rhw;
      cursor += vertex_size;
   }
}

/* Apply only the viewport scale and translate to the x/y/z of each
 * vertex position in output_buffer; w is left untouched.  Vertices are
 * temp_vertex_stride bytes apart and the position lives in slot
 * position_output.
 */
static void do_viewport( struct draw_vs_variant_generic *vsvg,
                         unsigned count,
                         void *output_buffer )
{
   const float *vp_scale = vsvg->base.vs->draw->viewport.scale;
   const float *vp_trans = vsvg->base.vs->draw->viewport.translate;
   const unsigned vertex_size = vsvg->temp_vertex_stride;
   char *cursor = (char *)output_buffer;
   unsigned remaining;

   /* Skip to the position attribute inside the first vertex. */
   cursor += vsvg->base.vs->position_output * 4 * sizeof(float);

   for (remaining = count; remaining != 0; remaining--) {
      float *pos = (float *)cursor;
      unsigned c;

      for (c = 0; c < 3; c++)
         pos[c] = pos[c] * vp_scale[c] + vp_trans[c];

      cursor += vertex_size;
   }
}
                         

/* run_elts callback of the generic variant.
 *
 * Steps, all operating on one temporary buffer of
 * align(count, 4) * temp_vertex_stride bytes:
 *   1. fetch->run_elts() fills the buffer from 'elts'/'count',
 *   2. the shader's run_linear() runs in place (input == output),
 *   3. an optional rhw/viewport or viewport-only pass is applied,
 *   4. emit->run() writes 'count' vertices to output_buffer.
 *
 * \param variant        the variant; must be a draw_vs_variant_generic
 * \param elts           element list forwarded to fetch->run_elts()
 * \param count          number of vertices to process
 * \param output_buffer  destination handed to emit->run()
 *
 * If the temporary buffer cannot be allocated the function returns
 * without touching output_buffer.
 */
static void PIPE_CDECL vsvg_run_elts( struct draw_vs_variant *variant,
                                      const unsigned *elts,
                                      unsigned count,
                                      void *output_buffer)
{
   struct draw_vs_variant_generic *vsvg = (struct draw_vs_variant_generic *)variant;
   unsigned temp_vertex_stride = vsvg->temp_vertex_stride;
   void *temp_buffer = MALLOC( align(count,4) * temp_vertex_stride );

   if (0) debug_printf("%s %d \n", __FUNCTION__,  count);

   /* Every stage below writes to or reads from temp_buffer, so bail out
    * rather than dereference a failed allocation.
    */
   if (temp_buffer == NULL)
      return;

   /* Want to do this in small batches for cache locality?
    */

   vsvg->fetch->run_elts( vsvg->fetch,
                          elts,
                          count,
                          vsvg->draw->instance_id,
                          temp_buffer );

   /* Shade in place: the same buffer and stride serve as input and output. */
   vsvg->base.vs->run_linear( vsvg->base.vs,
                              temp_buffer,
                              temp_buffer,
                              vsvg->base.vs->draw->pt.user.vs_constants,
                              vsvg->base.vs->draw->pt.user.vs_constants_size,
                              count,
                              temp_vertex_stride,
                              temp_vertex_stride);

   /* FIXME: geometry shading? */

   if (vsvg->base.key.clip) {
      /* not really handling clipping, just do the rhw so we can
       * see the results...
       */
      do_rhw_viewport( vsvg,
                       count,
                       temp_buffer );
   }
   else if (vsvg->base.key.viewport) {
      do_viewport( vsvg,
                   count,
                   temp_buffer );
   }

   /* Emit buffer 0: the shaded vertices. */
   vsvg->emit->set_buffer( vsvg->emit,
                           0,
                           temp_buffer,
                           temp_vertex_stride,
                           ~0 );

   /* Emit buffer 1: the rasterizer's point size, bound with stride 0
    * so that every vertex reads the same value.
    */
   vsvg->emit->set_buffer( vsvg->emit,
                           1,
                           &vsvg->draw->rasterizer->point_size,
                           0,
                           ~0 );

   vsvg->emit->run( vsvg->emit,
                    0, count,
                    vsvg->draw->instance_id,
                    output_buffer );

   FREE(temp_buffer);
}


/* run_linear callback of the generic variant.
 *
 * Steps, all operating on one temporary buffer of
 * align(count, 4) * temp_vertex_stride bytes:
 *   1. fetch->run() fills the buffer from 'start'/'count',
 *   2. the shader's run_linear() runs in place (input == output),
 *   3. an optional rhw/viewport or viewport-only pass is applied,
 *   4. emit->run() writes 'count' vertices to output_buffer.
 *
 * \param variant        the variant; must be a draw_vs_variant_generic
 * \param start          first vertex, forwarded to fetch->run()
 * \param count          number of vertices to process
 * \param output_buffer  destination handed to emit->run()
 *
 * If the temporary buffer cannot be allocated the function returns
 * without touching output_buffer.
 */
static void PIPE_CDECL vsvg_run_linear( struct draw_vs_variant *variant,
                                        unsigned start,
                                        unsigned count,
                                        void *output_buffer )
{
   struct draw_vs_variant_generic *vsvg = (struct draw_vs_variant_generic *)variant;
   unsigned temp_vertex_stride = vsvg->temp_vertex_stride;
   void *temp_buffer = MALLOC( align(count,4) * temp_vertex_stride );

   if (0) debug_printf("%s %d %d (sz %d, %d)\n", __FUNCTION__, start, count,
                       vsvg->base.key.output_stride,
                       temp_vertex_stride);

   /* Every stage below writes to or reads from temp_buffer, so bail out
    * rather than dereference a failed allocation.
    */
   if (temp_buffer == NULL)
      return;

   vsvg->fetch->run( vsvg->fetch,
                     start,
                     count,
                     vsvg->draw->instance_id,
                     temp_buffer );

   /* Shade in place: the same buffer and stride serve as input and output. */
   vsvg->base.vs->run_linear( vsvg->base.vs,
                              temp_buffer,
                              temp_buffer,
                              vsvg->base.vs->draw->pt.user.vs_constants,
                              vsvg->base.vs->draw->pt.user.vs_constants_size,
                              count,
                              temp_vertex_stride,
                              temp_vertex_stride);

   if (vsvg->base.key.clip) {
      /* not really handling clipping, just do the rhw so we can
       * see the results...
       */
      do_rhw_viewport( vsvg,
                       count,
                       temp_buffer );
   }
   else if (vsvg->base.key.viewport) {
      do_viewport( vsvg,
                   count,
                   temp_buffer );
   }

   /* Emit buffer 0: the shaded vertices. */
   vsvg->emit->set_buffer( vsvg->emit,
                           0,
                           temp_buffer,
                           temp_vertex_stride,
                           ~0 );

   /* Emit buffer 1: the rasterizer's point size, bound with stride 0
    * so that every vertex reads the same value.
    */
   vsvg->emit->set_buffer( vsvg->emit,
                           1,
                           &vsvg->draw->rasterizer->point_size,
                           0,
                           ~0 );

   vsvg->emit->run( vsvg->emit,
                    0, count,
                    vsvg->draw->instance_id,
                    output_buffer );

   FREE(temp_buffer);
}





/* destroy callback of the generic variant: releases the variant
 * structure itself.  The fetch and emit translate objects are not
 * released here.
 * NOTE(review): presumably they are owned by whatever
 * draw_vs_get_fetch()/draw_vs_get_emit() look them up in -- confirm.
 */
static void vsvg_destroy( struct draw_vs_variant *variant )
{
   struct draw_vs_variant_generic *generic =
      (struct draw_vs_variant_generic *)variant;

   FREE(generic);
}


/**
 * Create a generic shader variant for the given vertex shader and key.
 *
 * The variant owns a temporary vertex layout of
 * MAX2(key->nr_inputs, vs->info.num_outputs) four-float slots.  Two
 * translate keys are built from 'key':
 *   - fetch: converts every input element to R32G32B32A32_FLOAT in slot i
 *     of the temporary vertex;
 *   - emit: converts shader output slots to the formats/offsets requested
 *     in the key, with EMIT_1F_PSIZE elements sourced from emit buffer 1
 *     instead of the shaded vertex.
 *
 * \param vs   vertex shader the variant wraps
 * \param key  variant key describing inputs, outputs and flags
 * \return the new variant's base, or NULL if the variant could not be
 *         allocated or either translate object could not be obtained.
 */
struct draw_vs_variant *
draw_vs_create_variant_generic( struct draw_vertex_shader *vs,
                                const struct draw_vs_variant_key *key )
{
   unsigned i;
   struct translate_key fetch, emit;

   struct draw_vs_variant_generic *vsvg = CALLOC_STRUCT( draw_vs_variant_generic );
   if (vsvg == NULL)
      return NULL;

   vsvg->base.key = *key;
   vsvg->base.vs = vs;
   vsvg->base.set_buffer    = vsvg_set_buffer;
   vsvg->base.run_elts      = vsvg_run_elts;
   vsvg->base.run_linear    = vsvg_run_linear;
   vsvg->base.destroy       = vsvg_destroy;

   vsvg->draw = vs->draw;

   /* The shader runs in place, so one vertex must be able to hold
    * whichever is larger: its inputs or its outputs.
    */
   vsvg->temp_vertex_stride = MAX2(key->nr_inputs,
                                   vsvg->base.vs->info.num_outputs) * 4 * sizeof(float);

   /* Build free-standing fetch and emit functions:
    */
   fetch.nr_elements = key->nr_inputs;
   fetch.output_stride = vsvg->temp_vertex_stride;
   for (i = 0; i < key->nr_inputs; i++) {
      fetch.element[i].type = TRANSLATE_ELEMENT_NORMAL;
      fetch.element[i].input_format = key->element[i].in.format;
      fetch.element[i].input_buffer = key->element[i].in.buffer;
      fetch.element[i].input_offset = key->element[i].in.offset;
      fetch.element[i].instance_divisor = 0;
      fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
      fetch.element[i].output_offset = i * 4 * sizeof(float);
      assert(fetch.element[i].output_offset < fetch.output_stride);
   }


   emit.nr_elements = key->nr_outputs;
   emit.output_stride = key->output_stride;
   for (i = 0; i < key->nr_outputs; i++) {
      if (key->element[i].out.format != EMIT_1F_PSIZE)
      {
         /* Regular attribute: read from the shaded vertex (emit buffer 0). */
         emit.element[i].type = TRANSLATE_ELEMENT_NORMAL;
         emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
         emit.element[i].input_buffer = 0;
         emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float);
         emit.element[i].instance_divisor = 0;
         emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format);
         emit.element[i].output_offset = key->element[i].out.offset;
         /* NOTE(review): '<=' also accepts input_offset == stride, which
          * would start reading at the end of the vertex; '<' looks like
          * the intended bound -- confirm before tightening.
          */
         assert(emit.element[i].input_offset <= fetch.output_stride);
      }
      else {
         /* Point size: read a single float from emit buffer 1. */
         emit.element[i].type = TRANSLATE_ELEMENT_NORMAL;
         emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT;
         emit.element[i].input_buffer = 1;
         emit.element[i].input_offset = 0;
         emit.element[i].instance_divisor = 0;
         emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT;
         emit.element[i].output_offset = key->element[i].out.offset;
      }
   }

   vsvg->fetch = draw_vs_get_fetch( vs->draw, &fetch );
   vsvg->emit = draw_vs_get_emit( vs->draw, &emit );

   /* All callbacks dereference fetch and emit unconditionally, so a
    * variant without both is unusable: fail creation instead of
    * handing back something that would crash on first use.
    */
   if (vsvg->fetch == NULL || vsvg->emit == NULL) {
      FREE(vsvg);
      return NULL;
   }

   return &vsvg->base;
}