/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
//    Zhang Ying, zhangying913@gmail.com
//	  Niko Li, newlife20080214@gmail.com
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

// Enable double-precision support, but only when the build defines
// DOUBLE_SUPPORT. The AMD extension is used if its macro is defined;
// otherwise the Khronos extension is used if its macro is defined.
// If neither macro is defined, no extension pragma is emitted.
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif

// Fixed-point scale applied to interpolation weights: 2^INTER_RESIZE_COEF_BITS.
// INTER_RESIZE_COEF_BITS is not defined in this file; it has to come from the
// build options.
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
// Shift that removes the scale of a product of two such weights.
#define CAST_BITS (INTER_RESIZE_COEF_BITS << 1)
// Index following x, clamped to the last valid index (l - 1).
// Arguments are parenthesized so that expression arguments keep their meaning.
#define INC(x,l) min((x)+1,(l)-1)

// Expands to nothing.
// NOTE(review): presumably the build passes `noconvert` as the value of a
// convertTo* macro when no conversion is needed -- confirm against the host code.
#define noconvert

// Pixel access helpers. `addr` is a byte address into a __global buffer.
//   loadpix(addr)       - reads one pixel
//   storepix(val, addr) - writes one pixel
//   TSIZE               - size of one pixel in bytes
// For cn != 3 a pixel is a single value of type T. For cn == 3 it is accessed
// as three T1 elements through vload3/vstore3 rather than as a T.
#if cn != 3
#define loadpix(addr)  (*(__global const T *)(addr))
#define storepix(val, addr)  *(__global T *)(addr) = (val)
#define TSIZE ((int)sizeof(T))
#else
#define loadpix(addr)  vload3(0, (__global const T1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1 *)(addr))
// Parenthesized so the product stays one operand inside larger expressions.
#define TSIZE ((int)sizeof(T1)*cn)
#endif

#if defined USE_SAMPLER

// READ_IMAGE(image, sampler, coord) wraps read_imagef and keeps only the first
// `cn` components of the result; INTERMEDIATE_TYPE is the matching float
// type. Neither macro is defined when cn is outside 1..4.
#if cn == 1
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).x
#define INTERMEDIATE_TYPE  float
#elif cn == 2
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xy
#define INTERMEDIATE_TYPE  float2
#elif cn == 3
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z).xyz
#define INTERMEDIATE_TYPE  float3
#elif cn == 4
#define READ_IMAGE(X,Y,Z)  read_imagef(X,Y,Z)
#define INTERMEDIATE_TYPE  float4
#endif

// Token-pasting helpers; the extra level lets macro arguments expand before pasting.
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
// Disabled generic alternative to the per-cn definitions above:
//#define INTERMEDIATE_TYPE CAT(float, cn)
// Lets a pasted "float1" resolve to the scalar float type.
#define float1 float

// Factor applied to the sampled float value before conversion to the
// destination type, selected by the build-time `depth` code.
// NOTE(review): the values look like the maximum of 8U / 8S / 16U / 16S for
// depth 0..3 (OpenCV depth codes) with no scaling otherwise -- confirm against
// the host code that creates the image.
#if depth == 0
#define RESULT_SCALE    255.0f
#elif depth == 1
#define RESULT_SCALE    127.0f
#elif depth == 2
#define RESULT_SCALE    65535.0f
#elif depth == 3
#define RESULT_SCALE    32767.0f
#else
#define RESULT_SCALE    1.0f
#endif

// Resize using the image sampler: one work-item produces one destination pixel.
//
//   srcImage          - source image, read through a linear-filtering sampler
//                       with unnormalized coordinates and clamp-to-edge addressing
//   dstptr            - destination buffer (byte pointer)
//   dststep           - byte stride between destination rows
//   dstoffset         - byte offset of the first destination pixel
//   dstrows, dstcols  - destination size in pixels
//   ifx, ify          - factors mapping a destination pixel centre to source
//                       coordinates (sx = (dx + 0.5) * ifx, likewise for y)
__kernel void resizeSampler(__read_only image2d_t srcImage,
                            __global uchar* dstptr, int dststep, int dstoffset,
                            int dstrows, int dstcols,
                            float ifx, float ify)
{
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE |
                              CLK_ADDRESS_CLAMP_TO_EDGE |
                              CLK_FILTER_LINEAR;

    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items outside the destination have nothing to write.
    if (dx >= dstcols || dy >= dstrows)
        return;

    // Centre of the destination pixel expressed in source coordinates.
    const float2 srcPos = (float2)((dx + 0.5f) * ifx, (dy + 0.5f) * ify);

    INTERMEDIATE_TYPE sampled = READ_IMAGE(srcImage, sampler, srcPos);

    // Depth codes up to 4 are rounded before conversion; others are converted directly.
#if depth <= 4
    T result = convertToDT(round(sampled * RESULT_SCALE));
#else
    T result = convertToDT(sampled * RESULT_SCALE);
#endif

    storepix(result, dstptr + mad24(dy, dststep, dstoffset + dx*TSIZE));
}

#elif defined INTER_LINEAR_INTEGER

// Bilinear resize driven by precomputed integer tables: one work-item produces
// one destination pixel.
//
//   srcptr, src_step, src_offset, src_rows, src_cols - source buffer (byte
//       pointer), row stride in bytes, byte offset of the first pixel, size
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   buffer - packed tables, laid out in this order:
//       int   xofs[dst_cols]       left source column for each dx
//       int   yofs[dst_rows]       top source row for each dy
//       short ialpha[2 * dst_cols] pair of horizontal weights for each dx
//       short ibeta[2 * dst_rows]  pair of vertical weights for each dy
//
// Both source rows and both source columns are clamped to the valid range, so
// a tap on the last row or column re-reads the edge pixel instead of reading
// past the end of the row or buffer.
__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                       __global const uchar * buffer)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        __global const int * xofs = (__global const int *)(buffer), * yofs = xofs + dst_cols;
        __global const short * ialpha = (__global const short *)(yofs + dst_rows);
        __global const short * ibeta = ialpha + ((dst_cols + dy) << 1);
        ialpha += dx << 1;

        // Columns are clamped exactly like rows: the "+1" neighbour must never
        // leave the image, even when the table entry is the last column.
        int sx0 = clamp(xofs[dx], 0, src_cols - 1),
            sx1 = clamp(xofs[dx] + 1, 0, src_cols - 1);
        int sy0 = clamp(yofs[dy], 0, src_rows - 1),
            sy1 = clamp(yofs[dy] + 1, 0, src_rows - 1);
        short a0 = ialpha[0], a1 = ialpha[1];
        short b0 = ibeta[0], b1 = ibeta[1];

        // Byte offsets of the two columns inside a row.
        int col0 = mad24(sx0, TSIZE, src_offset),
            col1 = mad24(sx1, TSIZE, src_offset);

        WT data0 = convertToWT(loadpix(srcptr + mad24(sy0, src_step, col0)));
        WT data1 = convertToWT(loadpix(srcptr + mad24(sy0, src_step, col1)));
        WT data2 = convertToWT(loadpix(srcptr + mad24(sy1, src_step, col0)));
        WT data3 = convertToWT(loadpix(srcptr + mad24(sy1, src_step, col1)));

        // Horizontal blend per row, then vertical blend. The shifts (4 + 16 + 2)
        // remove 22 bits in total.
        // NOTE(review): that matches weights scaled by 2^11 each -- confirm
        // against the host code that fills `buffer`.
        WT val = ( (((data0 * a0 + data1 * a1) >> 4) * b0) >> 16) +
                 ( (((data2 * a0 + data3 * a1) >> 4) * b1) >> 16);

        storepix(convertToDT((val + 2) >> 2),
                 dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
    }
}

#elif defined INTER_LINEAR

// Bilinear resize computed on the fly: one work-item produces one destination
// pixel from the 2x2 source neighbourhood around its mapped position.
//
//   srcptr, src_step, src_offset, src_rows, src_cols - source buffer (byte
//       pointer), row stride in bytes, byte offset of the first pixel, size
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   ifx, ify - factors mapping destination pixel centres to source coordinates
//
// For depth codes up to 4 the blend uses integer weights scaled by
// INTER_RESIZE_COEF_SCALE; otherwise it uses float weights directly.
__kernel void resizeLN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                       float ifx, float ify)
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items outside the destination have nothing to write.
    if (dx >= dst_cols || dy >= dst_rows)
        return;

    // Source position of the destination pixel centre.
    float sx = ((dx+0.5f) * ifx - 0.5f), sy = ((dy+0.5f) * ify - 0.5f);
    int x0 = floor(sx), y0 = floor(sy);

    // Fractional parts: weights of the right / bottom neighbours.
    float u = sx - x0, v = sy - y0;

    // Positions outside the image snap to the edge with zero fractional weight.
    if (x0 < 0)
    {
        x0 = 0;
        u = 0;
    }
    if (x0 >= src_cols)
    {
        x0 = src_cols - 1;
        u = 0;
    }
    if (y0 < 0)
    {
        y0 = 0;
        v = 0;
    }
    if (y0 >= src_rows)
    {
        y0 = src_rows - 1;
        v = 0;
    }

    // Right / bottom neighbours, clamped to the last column / row.
    const int x1 = min(x0 + 1, src_cols - 1);
    const int y1 = min(y0 + 1, src_rows - 1);

    // Byte offsets of the two columns inside a row.
    const int col0 = mad24(x0, TSIZE, src_offset);
    const int col1 = mad24(x1, TSIZE, src_offset);

    // The four taps: top-left, top-right, bottom-left, bottom-right.
    WT data0 = convertToWT(loadpix(srcptr + mad24(y0, src_step, col0)));
    WT data1 = convertToWT(loadpix(srcptr + mad24(y0, src_step, col1)));
    WT data2 = convertToWT(loadpix(srcptr + mad24(y1, src_step, col0)));
    WT data3 = convertToWT(loadpix(srcptr + mad24(y1, src_step, col1)));

#if depth <= 4
    // Fixed-point weights.
    const float fu = u * INTER_RESIZE_COEF_SCALE;
    const float fv = v * INTER_RESIZE_COEF_SCALE;

    const int U = rint(fu);
    const int V = rint(fv);
    const int U1 = rint(INTER_RESIZE_COEF_SCALE - fu);
    const int V1 = rint(INTER_RESIZE_COEF_SCALE - fv);

    WT val = mul24((WT)mul24(U1, V1), data0) + mul24((WT)mul24(U, V1), data1) +
               mul24((WT)mul24(U1, V), data2) + mul24((WT)mul24(U, V), data3);

    // Add half of the divisor before shifting so the result is rounded.
    T uval = convertToDT((val + (1<<(CAST_BITS-1)))>>CAST_BITS);
#else
    const float u1 = 1.f - u;
    const float v1 = 1.f - v;

    T uval = u1 * v1 * data0 + u * v1 * data1 + u1 * v *data2 + u * v *data3;
#endif

    storepix(uval, dstptr + mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset)));
}

#elif defined INTER_NEAREST

// Nearest-neighbour resize: one work-item copies one source pixel to one
// destination pixel.
//
//   srcptr, src_step, src_offset, src_rows, src_cols - source buffer (byte
//       pointer), row stride in bytes, byte offset of the first pixel, size
//   dstptr, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   ifx, ify - factors mapping destination indices to source indices
__kernel void resizeNN(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                       __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                       float ifx, float ify)
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items outside the destination have nothing to write.
    if (dx >= dst_cols || dy >= dst_rows)
        return;

    // Scale, truncate toward zero, and cap at the last column / row.
    const int sx = min(convert_int_rtz(dx * ifx), src_cols - 1);
    const int sy = min(convert_int_rtz(dy * ify), src_rows - 1);

    const int src_index = mad24(sy, src_step, mad24(sx, TSIZE, src_offset));
    const int dst_index = mad24(dy, dst_step, mad24(dx, TSIZE, dst_offset));

    storepix(loadpix(srcptr + src_index), dstptr + dst_index);
}

#elif defined INTER_AREA

#ifdef INTER_AREA_FAST

// Area resize for integer scale factors: one work-item sums an
// XSCALE x YSCALE block of source pixels and writes the sum multiplied by SCALE.
// XSCALE, YSCALE and SCALE are not defined in this file; they have to come from
// the build options.
// NOTE(review): SCALE is presumably 1 / (XSCALE * YSCALE), which would make the
// result the block average -- confirm against the host code.
//
//   src, src_step, src_offset, src_rows, src_cols - source buffer (byte
//       pointer), row stride in bytes, byte offset of the first pixel, size
//   dst, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
__kernel void resizeAREA_FAST(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                              __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items outside the destination have nothing to write.
    if (dx >= dst_cols || dy >= dst_rows)
        return;

    // Top-left corner of the source block for this destination pixel.
    const int block_x = XSCALE * dx;
    const int block_y = YSCALE * dy;

    WTV acc = (WTV)(0);

    #pragma unroll
    for (int py = 0; py < YSCALE; ++py)
    {
        // Rows past the bottom edge reuse the last row.
        const int row = min(block_y + py, src_rows - 1);
        const int row_base = mad24(row, src_step, src_offset);
        #pragma unroll
        for (int px = 0; px < XSCALE; ++px)
        {
            // Columns past the right edge reuse the last column.
            const int col = min(block_x + px, src_cols - 1);
            acc += convertToWTV(loadpix(src + row_base + col*TSIZE));
        }
    }

    const int dst_row_base = mad24(dy, dst_step, dst_offset);
    storepix(convertToT(convertToWT2V(acc) * (WT2V)(SCALE)), dst + mad24(dx, TSIZE, dst_row_base));
}

#else

// General area resize driven by precomputed tables: one work-item accumulates a
// weighted sum over a rectangle of source pixels and writes one destination pixel.
//
//   src, src_step, src_offset, src_rows, src_cols - source buffer (byte
//       pointer), row stride in bytes, byte offset of the first pixel, size
//   dst, dst_step, dst_offset, dst_rows, dst_cols - same for the destination
//   ifx, ify  - not used by this kernel
//   ofs_tab   - x ranges (dst_cols + 1 ints) followed by y ranges; entries
//               [d] and [d + 1] bound the table slots belonging to index d
//   map_tab   - source index per slot; x part first, y part starts at 2 * src_cols
//   alpha_tab - weight per slot, same layout as map_tab
//
// NOTE(review): the loops advance the source index and the slot index together,
// so they rely on the slots of one destination index mapping to consecutive
// source indices -- confirm against the host code that builds the tables.
__kernel void resizeAREA(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         float ifx, float ify, __global const int * ofs_tab,
                         __global const int * map_tab, __global const float * alpha_tab)
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    // Work-items outside the destination have nothing to write.
    if (dx >= dst_cols || dy >= dst_rows)
        return;

    // Split the packed tables into their x and y parts.
    __global const int * xmap = map_tab;
    __global const int * ymap = (__global const int *)(map_tab + (src_cols << 1));
    __global const float * xalpha = alpha_tab;
    __global const float * yalpha = (__global const float *)(alpha_tab + (src_cols << 1));
    __global const int * xofs = ofs_tab;
    __global const int * yofs = (__global const int *)(ofs_tab + dst_cols + 1);

    // Slot ranges [begin, end) for this destination column and row.
    const int xk_begin = xofs[dx], xk_end = xofs[dx + 1];
    const int yk_begin = yofs[dy], yk_end = yofs[dy + 1];

    // First and last source column / row covered (both inclusive).
    const int col_first = xmap[xk_begin], col_last = xmap[xk_end - 1];
    const int row_first = ymap[yk_begin], row_last = ymap[yk_end - 1];

    WTV sum = (WTV)(0);
    int row_base = mad24(row_first, src_step, src_offset);
    int yk = yk_begin;

    for (int row = row_first; row <= row_last; ++row)
    {
        WTV beta = (WTV)(yalpha[yk]);
        WTV row_acc = (WTV)(0);
        int xk = xk_begin;

        // Horizontal weighted sum of this source row.
        for (int col = col_first; col <= col_last; ++col)
        {
            WTV alpha = (WTV)(xalpha[xk]);
            row_acc += convertToWTV(loadpix(src + mad24(col, TSIZE, row_base))) * alpha;
            ++xk;
        }

        // Fold the row into the total with its vertical weight.
        sum += row_acc * beta;

        row_base += src_step;
        ++yk;
    }

    const int dst_row_base = mad24(dy, dst_step, dst_offset);
    storepix(convertToT(sum), dst + mad24(dx, TSIZE, dst_row_base));
}

#endif

#endif