// OpenCL port of the ORB feature detector and descriptor extractor
// Copyright (C) 2014, Itseez Inc. See the license at http://opencv.org
//
// The original code has been contributed by Peter Andreas Entschev, peter@entschev.com

#define LAYERINFO_SIZE 1
#define LAYERINFO_OFS 0
#define KEYPOINT_SIZE 3
#define ORIENTED_KEYPOINT_SIZE 4
#define KEYPOINT_X 0
#define KEYPOINT_Y 1
#define KEYPOINT_Z 2
#define KEYPOINT_ANGLE 3

/////////////////////////////////////////////////////////////

#ifdef ORB_RESPONSES

__kernel void
ORB_HarrisResponses(__global const uchar* imgbuf, int imgstep, int imgoffset0,
                    __global const int* layerinfo, __global const int* keypoints,
                    __global float* responses, int nkeypoints )
{
    int idx = get_global_id(0);
    if( idx < nkeypoints )
    {
        __global const int* kpt = keypoints + idx*KEYPOINT_SIZE;
        __global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
        __global const uchar* img = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
            (kpt[KEYPOINT_Y] - blockSize/2)*imgstep + (kpt[KEYPOINT_X] - blockSize/2);

        int i, j;
        int a = 0, b = 0, c = 0;
        for( i = 0; i < blockSize; i++, img += imgstep-blockSize )
        {
            for( j = 0; j < blockSize; j++, img++ )
            {
                int Ix = (img[1] - img[-1])*2 + img[-imgstep+1] - img[-imgstep-1] + img[imgstep+1] - img[imgstep-1];
                int Iy = (img[imgstep] - img[-imgstep])*2 + img[imgstep-1] - img[-imgstep-1] + img[imgstep+1] - img[-imgstep+1];
                a += Ix*Ix;
                b += Iy*Iy;
                c += Ix*Iy;
            }
        }
        responses[idx] = ((float)a * b - (float)c * c - HARRIS_K * (float)(a + b) * (a + b))*scale_sq_sq;
    }
}

#endif

/////////////////////////////////////////////////////////////

#ifdef ORB_ANGLES

#define _DBL_EPSILON 2.2204460492503131e-16f
#define atan2_p1 (0.9997878412794807f*57.29577951308232f)
#define atan2_p3 (-0.3258083974640975f*57.29577951308232f)
#define atan2_p5 (0.1555786518463281f*57.29577951308232f)
#define atan2_p7 (-0.04432655554792128f*57.29577951308232f)

inline float fastAtan2( float y, float x )
{
    float ax = fabs(x), ay = fabs(y);
    float a, c, c2;
    if( ax >= ay )
    {
        c = ay/(ax + _DBL_EPSILON);
        c2 = c*c;
        a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
    }
    else
    {
        c = ax/(ay + _DBL_EPSILON);
        c2 = c*c;
        a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;
    }
    if( x < 0 )
        a = 180.f - a;
    if( y < 0 )
        a = 360.f - a;
    return a;
}


__kernel void
ORB_ICAngle(__global const uchar* imgbuf, int imgstep, int imgoffset0,
            __global const int* layerinfo, __global const int* keypoints,
            __global float* responses, const __global int* u_max,
            int nkeypoints, int half_k )
{
    int idx = get_global_id(0);
    if( idx < nkeypoints )
    {
        __global const int* kpt = keypoints + idx*KEYPOINT_SIZE;

        __global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
        __global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
            kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];

        int u, v, m_01 = 0, m_10 = 0;

        // Treat the center line differently, v=0
        for( u = -half_k; u <= half_k; u++ )
            m_10 += u * center[u];

        // Go line by line in the circular patch
        for( v = 1; v <= half_k; v++ )
        {
            // Proceed over the two lines
            int v_sum = 0;
            int d = u_max[v];
            for( u = -d; u <= d; u++ )
            {
                int val_plus = center[u + v*imgstep], val_minus = center[u - v*imgstep];
                v_sum += (val_plus - val_minus);
                m_10 += u * (val_plus + val_minus);
            }
            m_01 += v * v_sum;
        }

        // we do not use OpenCL's atan2 intrinsic,
        // because we want to get _exactly_ the same results as the CPU version
        responses[idx] = fastAtan2((float)m_01, (float)m_10);
    }
}

#endif

/////////////////////////////////////////////////////////////

#ifdef ORB_DESCRIPTORS

__kernel void
ORB_computeDescriptor(__global const uchar* imgbuf, int imgstep, int imgoffset0,
                      __global const int* layerinfo, __global const int* keypoints,
                      __global uchar* _desc, const __global int* pattern,
                      int nkeypoints, int dsize )
{
    int idx = get_global_id(0);
    if( idx < nkeypoints )
    {
        int i;
        __global const int* kpt = keypoints + idx*ORIENTED_KEYPOINT_SIZE;

        __global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;
        __global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +
                                kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];
        float angle = as_float(kpt[KEYPOINT_ANGLE]);
        angle *= 0.01745329251994329547f;

        float cosa;
        float sina = sincos(angle, &cosa);

        __global uchar* desc = _desc + idx*dsize;

        #define GET_VALUE(idx) \
            center[mad24(convert_int_rte(pattern[(idx)*2] * sina + pattern[(idx)*2+1] * cosa), imgstep, \
                        convert_int_rte(pattern[(idx)*2] * cosa - pattern[(idx)*2+1] * sina))]

        for( i = 0; i < dsize; i++ )
        {
            int val;
        #if WTA_K == 2
            int t0, t1;

            t0 = GET_VALUE(0); t1 = GET_VALUE(1);
            val = t0 < t1;

            t0 = GET_VALUE(2); t1 = GET_VALUE(3);
            val |= (t0 < t1) << 1;

            t0 = GET_VALUE(4); t1 = GET_VALUE(5);
            val |= (t0 < t1) << 2;

            t0 = GET_VALUE(6); t1 = GET_VALUE(7);
            val |= (t0 < t1) << 3;

            t0 = GET_VALUE(8); t1 = GET_VALUE(9);
            val |= (t0 < t1) << 4;

            t0 = GET_VALUE(10); t1 = GET_VALUE(11);
            val |= (t0 < t1) << 5;

            t0 = GET_VALUE(12); t1 = GET_VALUE(13);
            val |= (t0 < t1) << 6;

            t0 = GET_VALUE(14); t1 = GET_VALUE(15);
            val |= (t0 < t1) << 7;

            pattern += 16*2;

        #elif WTA_K == 3
            int t0, t1, t2;

            t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
            val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);

            t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
            val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;

            t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
            val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;

            t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
            val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;

            pattern += 12*2;

        #elif WTA_K == 4
            int t0, t1, t2, t3, k;
            int a, b;

            t0 = GET_VALUE(0); t1 = GET_VALUE(1);
            t2 = GET_VALUE(2); t3 = GET_VALUE(3);
            a = 0, b = 2;
            if( t1 > t0 ) t0 = t1, a = 1;
            if( t3 > t2 ) t2 = t3, b = 3;
            k = t0 > t2 ? a : b;
            val = k;

            t0 = GET_VALUE(4); t1 = GET_VALUE(5);
            t2 = GET_VALUE(6); t3 = GET_VALUE(7);
            a = 0, b = 2;
            if( t1 > t0 ) t0 = t1, a = 1;
            if( t3 > t2 ) t2 = t3, b = 3;
            k = t0 > t2 ? a : b;
            val |= k << 2;

            t0 = GET_VALUE(8); t1 = GET_VALUE(9);
            t2 = GET_VALUE(10); t3 = GET_VALUE(11);
            a = 0, b = 2;
            if( t1 > t0 ) t0 = t1, a = 1;
            if( t3 > t2 ) t2 = t3, b = 3;
            k = t0 > t2 ? a : b;
            val |= k << 4;

            t0 = GET_VALUE(12); t1 = GET_VALUE(13);
            t2 = GET_VALUE(14); t3 = GET_VALUE(15);
            a = 0, b = 2;
            if( t1 > t0 ) t0 = t1, a = 1;
            if( t3 > t2 ) t2 = t3, b = 3;
            k = t0 > t2 ? a : b;
            val |= k << 6;

            pattern += 16*2;
        #else
            #error "unknown/undefined WTA_K value; should be 2, 3 or 4"
        #endif
            desc[i] = (uchar)val;
        }
    }
}

#endif