/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_sao.c
*
* @brief
*  Contains leaf level function definitions for sample adaptive offset process
*
* @author
*  Srinivas T
*
* @par List of Functions:
*   - ihevc_sao_band_offset_luma()
*   - ihevc_sao_band_offset_chroma()
*   - ihevc_sao_edge_offset_class0()
*   - ihevc_sao_edge_offset_class0_chroma()
*   - ihevc_sao_edge_offset_class1()
*   - ihevc_sao_edge_offset_class1_chroma()
*   - ihevc_sao_edge_offset_class2()
*   - ihevc_sao_edge_offset_class2_chroma()
*   - ihevc_sao_edge_offset_class3()
*   - ihevc_sao_edge_offset_class3_chroma()
* @remarks
*  None
*
*******************************************************************************
*/
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_structs.h"
#include "ihevc_sao.h"

#define NUM_BAND_TABLE  32

const WORD32 gi4_ihevc_table_edge_idx[5] = { 1, 2, 0, 3, 4 };
/**
 * au4_avail is an array of flags - one for each neighboring block specifying if the block is available
 * au4_avail[0] - left
 * au4_avail[1] - right
 * au4_avail[2] - top
 * au4_avail[3] - bottom
 * au4_avail[4] - top-left
 * au4_avail[5] - top-right
 * au4_avail[6] - bottom-left
 * au4_avail[7] - bottom-right
 */


void ihevc_sao_band_offset_luma(UWORD8 *pu1_src,
                                WORD32 src_strd,
                                UWORD8 *pu1_src_left,
                                UWORD8 *pu1_src_top,
                                UWORD8 *pu1_src_top_left,
                                WORD32 sao_band_pos,
                                WORD8 *pi1_sao_offset,
                                WORD32 wd,
                                WORD32 ht)
{
    WORD32 band_shift;
    WORD32 band_table[NUM_BAND_TABLE];
    WORD32 i;
    WORD32 row, col;

    /* Updating left and top and top-left */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
    }

    band_shift = BIT_DEPTH_LUMA - 5;
    for(i = 0; i < NUM_BAND_TABLE; i++)
    {
        band_table[i] = 0;
    }
    for(i = 0; i < 4; i++)
    {
        band_table[(i + sao_band_pos) & 31] = i + 1;
    }

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 band_idx;

            band_idx = band_table[pu1_src[col] >> band_shift];
            pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[band_idx], 0, (1 << (band_shift + 5)) - 1);
        }
        pu1_src += src_strd;
    }
}



/* input 'wd' has to be for the interleaved block and not for each color component */
void ihevc_sao_band_offset_chroma(UWORD8 *pu1_src,
                                  WORD32 src_strd,
                                  UWORD8 *pu1_src_left,
                                  UWORD8 *pu1_src_top,
                                  UWORD8 *pu1_src_top_left,
                                  WORD32 sao_band_pos_u,
                                  WORD32 sao_band_pos_v,
                                  WORD8 *pi1_sao_offset_u,
                                  WORD8 *pi1_sao_offset_v,
                                  WORD32 wd,
                                  WORD32 ht)
{
    WORD32 band_shift;
    WORD32 band_table_u[NUM_BAND_TABLE];
    WORD32 band_table_v[NUM_BAND_TABLE];
    WORD32 i;
    WORD32 row, col;

    /* Updating left and top and top-left */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
    }


    band_shift = BIT_DEPTH_CHROMA - 5;
    for(i = 0; i < NUM_BAND_TABLE; i++)
    {
        band_table_u[i] = 0;
        band_table_v[i] = 0;
    }
    for(i = 0; i < 4; i++)
    {
        band_table_u[(i + sao_band_pos_u) & 31] = i + 1;
        band_table_v[(i + sao_band_pos_v) & 31] = i + 1;
    }

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 band_idx;
            WORD8 *pi1_sao_offset;

            pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;
            band_idx = (0 == col % 2) ? band_table_u[pu1_src[col] >> band_shift] : band_table_v[pu1_src[col] >> band_shift];
            pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[band_idx], 0, (1 << (band_shift + 5)) - 1);
        }
        pu1_src += src_strd;
    }
}



/* Horizontal filtering */
void ihevc_sao_edge_offset_class0(UWORD8 *pu1_src,
                                  WORD32 src_strd,
                                  UWORD8 *pu1_src_left,
                                  UWORD8 *pu1_src_top,
                                  UWORD8 *pu1_src_top_left,
                                  UWORD8 *pu1_src_top_right,
                                  UWORD8 *pu1_src_bot_left,
                                  UWORD8 *pu1_avail,
                                  WORD8 *pi1_sao_offset,
                                  WORD32 wd,
                                  WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE];
    WORD8 u1_sign_left, u1_sign_right;
    WORD32 bit_depth;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);
    bit_depth = BIT_DEPTH_LUMA;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update top and top-left arrays */
    *pu1_src_top_left = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
    }

    /* Update masks based on the availability flags */
    if(0 == pu1_avail[0])
    {
        au1_mask[0] = 0;
    }
    if(0 == pu1_avail[1])
    {
        au1_mask[wd - 1] = 0;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            u1_sign_left = SIGN(pu1_src[0] - pu1_src_left[row]);
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;

                u1_sign_right = SIGN(pu1_src[col] - pu1_src[col + 1]);
                edge_idx = 2 + u1_sign_left + u1_sign_right;
                u1_sign_left = -u1_sign_right;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            pu1_src += src_strd;
        }
    }

    /* Update left array */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = au1_src_left_tmp[row];
    }

}




/* input 'wd' has to be for the interleaved block and not for each color component */
void ihevc_sao_edge_offset_class0_chroma(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_src_left,
                                         UWORD8 *pu1_src_top,
                                         UWORD8 *pu1_src_top_left,
                                         UWORD8 *pu1_src_top_right,
                                         UWORD8 *pu1_src_bot_left,
                                         UWORD8 *pu1_avail,
                                         WORD8 *pi1_sao_offset_u,
                                         WORD8 *pi1_sao_offset_v,
                                         WORD32 wd,
                                         WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE];
    WORD8 u1_sign_left_u, u1_sign_right_u;
    WORD8 u1_sign_left_v, u1_sign_right_v;
    WORD32 bit_depth;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);
    bit_depth = BIT_DEPTH_CHROMA;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
        au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = pu1_src[(ht - 1) * src_strd + col];
    }

    /* Update masks based on the availability flags */
    if(0 == pu1_avail[0])
    {
        au1_mask[0] = 0;
    }
    if(0 == pu1_avail[1])
    {
        au1_mask[(wd - 1) >> 1] = 0;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            u1_sign_left_u = SIGN(pu1_src[0] - pu1_src_left[2 * row]);
            u1_sign_left_v = SIGN(pu1_src[1] - pu1_src_left[2 * row + 1]);
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;
                WORD8 *pi1_sao_offset;

                if(0 == col % 2)
                {
                    pi1_sao_offset = pi1_sao_offset_u;
                    u1_sign_right_u = SIGN(pu1_src[col] - pu1_src[col + 2]);
                    edge_idx = 2 + u1_sign_left_u + u1_sign_right_u;
                    u1_sign_left_u = -u1_sign_right_u;
                }
                else
                {
                    pi1_sao_offset = pi1_sao_offset_v;
                    u1_sign_right_v = SIGN(pu1_src[col] - pu1_src[col + 2]);
                    edge_idx = 2 + u1_sign_left_v + u1_sign_right_v;
                    u1_sign_left_v = -u1_sign_right_v;
                }

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            pu1_src += src_strd;
        }
    }

    for(row = 0; row < 2 * ht; row++)
    {
        pu1_src_left[row] = au1_src_left_tmp[row];
    }

}



/* Vertical filtering */
void ihevc_sao_edge_offset_class1(UWORD8 *pu1_src,
                                  WORD32 src_strd,
                                  UWORD8 *pu1_src_left,
                                  UWORD8 *pu1_src_top,
                                  UWORD8 *pu1_src_top_left,
                                  UWORD8 *pu1_src_top_right,
                                  UWORD8 *pu1_src_bot_left,
                                  UWORD8 *pu1_avail,
                                  WORD8 *pi1_sao_offset,
                                  WORD32 wd,
                                  WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
    WORD8 au1_sign_up[MAX_CTB_SIZE];
    WORD8 u1_sign_down;
    WORD32 bit_depth;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    bit_depth = BIT_DEPTH_LUMA;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    *pu1_src_top_left = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
    }

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src += src_strd;
        ht--;
        for(col = 0; col < wd; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
        }
    }
    else
    {
        for(col = 0; col < wd; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col]);
        }
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;

                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
                au1_sign_up[col] = -u1_sign_down;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            pu1_src += src_strd;
        }
    }

    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = au1_src_top_tmp[col];
    }

}



/* input 'wd' has to be for the interleaved block and not for each color component */
void ihevc_sao_edge_offset_class1_chroma(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_src_left,
                                         UWORD8 *pu1_src_top,
                                         UWORD8 *pu1_src_top_left,
                                         UWORD8 *pu1_src_top_right,
                                         UWORD8 *pu1_src_bot_left,
                                         UWORD8 *pu1_avail,
                                         WORD8 *pi1_sao_offset_u,
                                         WORD8 *pi1_sao_offset_v,
                                         WORD32 wd,
                                         WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
    WORD8 au1_sign_up[MAX_CTB_SIZE];
    WORD8 u1_sign_down;
    WORD32 bit_depth;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    bit_depth = BIT_DEPTH_CHROMA;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + wd - 2];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
    }

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src += src_strd;
        ht--;
        for(col = 0; col < wd; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - src_strd]);
        }
    }
    else
    {
        for(col = 0; col < wd; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col]);
        }
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;
                WORD8 *pi1_sao_offset;

                pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;

                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + src_strd]);
                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
                au1_sign_up[col] = -u1_sign_down;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            pu1_src += src_strd;
        }
    }

    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = au1_src_top_tmp[col];
    }

}



/* 135 degree filtering */
void ihevc_sao_edge_offset_class2(UWORD8 *pu1_src,
                                  WORD32 src_strd,
                                  UWORD8 *pu1_src_left,
                                  UWORD8 *pu1_src_top,
                                  UWORD8 *pu1_src_top_left,
                                  UWORD8 *pu1_src_top_right,
                                  UWORD8 *pu1_src_bot_left,
                                  UWORD8 *pu1_avail,
                                  WORD8 *pi1_sao_offset,
                                  WORD32 wd,
                                  WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
    UWORD8 u1_src_top_left_tmp;
    WORD8 au1_sign_up[MAX_CTB_SIZE + 1], au1_sign_up_tmp[MAX_CTB_SIZE + 1];
    WORD8 u1_sign_down;
    WORD8 *pu1_sign_up;
    WORD8 *pu1_sign_up_tmp;
    UWORD8 *pu1_src_left_cpy;

    WORD32 bit_depth;
    UWORD8 u1_pos_0_0_tmp;
    UWORD8 u1_pos_wd_ht_tmp;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    bit_depth = BIT_DEPTH_LUMA;
    pu1_sign_up = au1_sign_up;
    pu1_sign_up_tmp = au1_sign_up_tmp;
    pu1_src_left_cpy = pu1_src_left;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    u1_src_top_left_tmp = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
    }


    /* If top-left is available, process separately */
    if(0 != pu1_avail[4])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
                        SIGN(pu1_src[0] - pu1_src[1 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp = pu1_src[0];
        }
    }
    else
    {
        u1_pos_0_0_tmp = pu1_src[0];
    }

    /* If bottom-right is available, process separately */
    if(0 != pu1_avail[7])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
    }

    /* If Left is not available */
    if(0 == pu1_avail[0])
    {
        au1_mask[0] = 0;
    }

    /* If Top is not available */
    if(0 == pu1_avail[2])
    {
        pu1_src += src_strd;
        ht--;
        pu1_src_left_cpy += 1;
        for(col = 1; col < wd; col++)
        {
            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - 1 - src_strd]);
        }
    }
    else
    {
        for(col = 1; col < wd; col++)
        {
            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col - 1]);
        }
    }

    /* If Right is not available */
    if(0 == pu1_avail[1])
    {
        au1_mask[wd - 1] = 0;
    }

    /* If Bottom is not available */
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            pu1_sign_up[0] = SIGN(pu1_src[0] - pu1_src_left_cpy[row - 1]);
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;

                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + 1 + src_strd]);
                edge_idx = 2 + pu1_sign_up[col] + u1_sign_down;
                pu1_sign_up_tmp[col + 1] = -u1_sign_down;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            /* Swapping pu1_sign_up_tmp and pu1_sign_up */
            {
                WORD8 *pu1_swap_tmp = pu1_sign_up;
                pu1_sign_up = pu1_sign_up_tmp;
                pu1_sign_up_tmp = pu1_swap_tmp;
            }

            pu1_src += src_strd;
        }

        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd] = u1_pos_0_0_tmp;
        pu1_src[(pu1_avail[3] ? wd - 1 - src_strd : wd - 1)] = u1_pos_wd_ht_tmp;
    }

    if(0 == pu1_avail[2])
        ht++;
    if(0 == pu1_avail[3])
        ht++;
    *pu1_src_top_left = u1_src_top_left_tmp;
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = au1_src_left_tmp[row];
    }
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = au1_src_top_tmp[col];
    }

}




/* 135 degree filtering */
void ihevc_sao_edge_offset_class2_chroma(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_src_left,
                                         UWORD8 *pu1_src_top,
                                         UWORD8 *pu1_src_top_left,
                                         UWORD8 *pu1_src_top_right,
                                         UWORD8 *pu1_src_bot_left,
                                         UWORD8 *pu1_avail,
                                         WORD8 *pi1_sao_offset_u,
                                         WORD8 *pi1_sao_offset_v,
                                         WORD32 wd,
                                         WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
    UWORD8 au1_src_top_left_tmp[2];
    WORD8 au1_sign_up[MAX_CTB_SIZE + 2], au1_sign_up_tmp[MAX_CTB_SIZE + 2];
    WORD8 u1_sign_down;
    WORD8 *pu1_sign_up;
    WORD8 *pu1_sign_up_tmp;
    UWORD8 *pu1_src_left_cpy;

    WORD32 bit_depth;

    UWORD8 u1_pos_0_0_tmp_u;
    UWORD8 u1_pos_0_0_tmp_v;
    UWORD8 u1_pos_wd_ht_tmp_u;
    UWORD8 u1_pos_wd_ht_tmp_v;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);


    bit_depth = BIT_DEPTH_CHROMA;
    pu1_sign_up = au1_sign_up;
    pu1_sign_up_tmp = au1_sign_up_tmp;
    pu1_src_left_cpy = pu1_src_left;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    au1_src_top_left_tmp[0] = pu1_src_top[wd - 2];
    au1_src_top_left_tmp[1] = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
        au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
    }


    /* If top-left is available, process separately */
    if(0 != pu1_avail[4])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
                        SIGN(pu1_src[0] - pu1_src[2 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp_u = pu1_src[0];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
                        SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp_v = pu1_src[1];
        }
    }
    else
    {
        u1_pos_0_0_tmp_u = pu1_src[0];
        u1_pos_0_0_tmp_v = pu1_src[1];
    }

    /* If bottom-right is available, process separately */
    if(0 != pu1_avail[7])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
                        SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
                        SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
    }

    /* If Left is not available */
    if(0 == pu1_avail[0])
    {
        au1_mask[0] = 0;
    }

    /* If Top is not available */
    if(0 == pu1_avail[2])
    {
        pu1_src += src_strd;
        pu1_src_left_cpy += 2;
        ht--;
        for(col = 2; col < wd; col++)
        {
            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col - 2 - src_strd]);
        }
    }
    else
    {
        for(col = 2; col < wd; col++)
        {
            pu1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col - 2]);
        }
    }

    /* If Right is not available */
    if(0 == pu1_avail[1])
    {
        au1_mask[(wd - 1) >> 1] = 0;
    }

    /* If Bottom is not available */
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            pu1_sign_up[0] = SIGN(pu1_src[0] - pu1_src_left_cpy[2 * (row - 1)]);
            pu1_sign_up[1] = SIGN(pu1_src[1] - pu1_src_left_cpy[2 * (row - 1) + 1]);
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;
                WORD8 *pi1_sao_offset;

                pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;

                u1_sign_down = SIGN(pu1_src[col] - pu1_src[col + 2 + src_strd]);
                edge_idx = 2 + pu1_sign_up[col] + u1_sign_down;
                pu1_sign_up_tmp[col + 2] = -u1_sign_down;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            /* Swapping pu1_sign_up_tmp and pu1_sign_up */
            {
                WORD8 *pu1_swap_tmp = pu1_sign_up;
                pu1_sign_up = pu1_sign_up_tmp;
                pu1_sign_up_tmp = pu1_swap_tmp;
            }

            pu1_src += src_strd;
        }

        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd] = u1_pos_0_0_tmp_u;
        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + 1] = u1_pos_0_0_tmp_v;
        pu1_src[(pu1_avail[3] ? wd - 2 - src_strd : wd - 2)] = u1_pos_wd_ht_tmp_u;
        pu1_src[(pu1_avail[3] ? wd - 1 - src_strd : wd - 1)] = u1_pos_wd_ht_tmp_v;
    }

    if(0 == pu1_avail[2])
        ht++;
    if(0 == pu1_avail[3])
        ht++;
    pu1_src_top_left[0] = au1_src_top_left_tmp[0];
    pu1_src_top_left[1] = au1_src_top_left_tmp[1];
    for(row = 0; row < 2 * ht; row++)
    {
        pu1_src_left[row] = au1_src_left_tmp[row];
    }
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = au1_src_top_tmp[col];
    }

}




/* 45 degree filtering */
void ihevc_sao_edge_offset_class3(UWORD8 *pu1_src,
                                  WORD32 src_strd,
                                  UWORD8 *pu1_src_left,
                                  UWORD8 *pu1_src_top,
                                  UWORD8 *pu1_src_top_left,
                                  UWORD8 *pu1_src_top_right,
                                  UWORD8 *pu1_src_bot_left,
                                  UWORD8 *pu1_avail,
                                  WORD8 *pi1_sao_offset,
                                  WORD32 wd,
                                  WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_top_tmp[MAX_CTB_SIZE];
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE];
    UWORD8 u1_src_top_left_tmp;
    WORD8 au1_sign_up[MAX_CTB_SIZE];
    UWORD8 *pu1_src_left_cpy;
    WORD8 u1_sign_down;
    WORD32 bit_depth;

    UWORD8 u1_pos_0_ht_tmp;
    UWORD8 u1_pos_wd_0_tmp;

    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_left_cpy = pu1_src_left;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    u1_src_top_left_tmp = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
    }

    /* If top-right is available, process separately */
    if(0 != pu1_avail[5])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
        }
    }
    else
    {
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
    }

    /* If bottom-left is available, process separately */
    if(0 != pu1_avail[6])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
    }

    /* If Left is not available */
    if(0 == pu1_avail[0])
    {
        au1_mask[0] = 0;
    }

    /* If Top is not available */
    if(0 == pu1_avail[2])
    {
        pu1_src += src_strd;
        ht--;
        pu1_src_left_cpy += 1;
        for(col = 0; col < wd - 1; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col + 1 - src_strd]);
        }
    }
    else
    {
        for(col = 0; col < wd - 1; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col + 1]);
        }
    }

    /* If Right is not available */
    if(0 == pu1_avail[1])
    {
        au1_mask[wd - 1] = 0;
    }

    /* If Bottom is not available */
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            au1_sign_up[wd - 1] = SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 + 1 - src_strd]);
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;

                u1_sign_down = SIGN(pu1_src[col] - ((col == 0) ? pu1_src_left_cpy[row + 1] :
                                                                 pu1_src[col - 1 + src_strd]));
                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
                if(col > 0)
                    au1_sign_up[col - 1] = -u1_sign_down;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            pu1_src += src_strd;
        }

        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 1] = u1_pos_wd_0_tmp;
        pu1_src[(pu1_avail[3] ?  (-src_strd) : 0)] = u1_pos_0_ht_tmp;
    }

    if(0 == pu1_avail[2])
        ht++;
    if(0 == pu1_avail[3])
        ht++;
    *pu1_src_top_left = u1_src_top_left_tmp;
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = au1_src_left_tmp[row];
    }
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = au1_src_top_tmp[col];
    }

}




void ihevc_sao_edge_offset_class3_chroma(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_src_left,
                                         UWORD8 *pu1_src_top,
                                         UWORD8 *pu1_src_top_left,
                                         UWORD8 *pu1_src_top_right,
                                         UWORD8 *pu1_src_bot_left,
                                         UWORD8 *pu1_avail,
                                         WORD8 *pi1_sao_offset_u,
                                         WORD8 *pi1_sao_offset_v,
                                         WORD32 wd,
                                         WORD32 ht)
{
    WORD32 row, col;
    UWORD8 au1_mask[MAX_CTB_SIZE];
    UWORD8 au1_src_left_tmp[2 * MAX_CTB_SIZE], au1_src_top_tmp[MAX_CTB_SIZE];
    UWORD8 au1_src_top_left_tmp[2];
    WORD8 au1_sign_up[MAX_CTB_SIZE];
    UWORD8 *pu1_src_left_cpy;
    WORD8 u1_sign_down;
    WORD32 bit_depth;

    UWORD8 u1_pos_wd_0_tmp_u;
    UWORD8 u1_pos_wd_0_tmp_v;
    UWORD8 u1_pos_0_ht_tmp_u;
    UWORD8 u1_pos_0_ht_tmp_v;

    bit_depth = BIT_DEPTH_CHROMA;
    pu1_src_left_cpy = pu1_src_left;

    /* Initialize the mask values */
    memset(au1_mask, 0xFF, MAX_CTB_SIZE);

    /* Update left, top and top-left arrays */
    au1_src_top_left_tmp[0] = pu1_src_top[wd - 2];
    au1_src_top_left_tmp[1] = pu1_src_top[wd - 1];
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[2 * row] = pu1_src[row * src_strd + wd - 2];
        au1_src_left_tmp[2 * row + 1] = pu1_src[row * src_strd + wd - 1];
    }
    for(col = 0; col < wd; col++)
    {
        au1_src_top_tmp[col] = pu1_src[(ht - 1) * src_strd + col];
    }


    /* If top-right is available, process separately */
    if(0 != pu1_avail[5])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
                        SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
                        SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
        }
    }
    else
    {
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
    }

    /* If bottom-left is available, process separately */
    if(0 != pu1_avail[6])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
                        SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
                        SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);

        edge_idx = gi4_ihevc_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
        }
    }
    else
    {
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
    }

    /* If Left is not available */
    if(0 == pu1_avail[0])
    {
        au1_mask[0] = 0;
    }

    /* If Top is not available */
    if(0 == pu1_avail[2])
    {
        pu1_src += src_strd;
        ht--;
        pu1_src_left_cpy += 2;
        for(col = 0; col < wd - 2; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src[col + 2 - src_strd]);
        }
    }
    else
    {
        for(col = 0; col < wd - 2; col++)
        {
            au1_sign_up[col] = SIGN(pu1_src[col] - pu1_src_top[col + 2]);
        }
    }

    /* If Right is not available */
    if(0 == pu1_avail[1])
    {
        au1_mask[(wd - 1) >> 1] = 0;
    }

    /* If Bottom is not available */
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    /* Processing is done on the intermediate buffer and the output is written to the source buffer */
    {
        for(row = 0; row < ht; row++)
        {
            au1_sign_up[wd - 2] = SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 + 2 - src_strd]);
            au1_sign_up[wd - 1] = SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 + 2 - src_strd]);
            for(col = 0; col < wd; col++)
            {
                WORD32 edge_idx;
                WORD8 *pi1_sao_offset;

                pi1_sao_offset = (0 == col % 2) ? pi1_sao_offset_u : pi1_sao_offset_v;

                u1_sign_down = SIGN(pu1_src[col] - ((col < 2) ? pu1_src_left_cpy[2 * (row + 1) + col] :
                                                                pu1_src[col - 2 + src_strd]));
                edge_idx = 2 + au1_sign_up[col] + u1_sign_down;
                if(col > 1)
                    au1_sign_up[col - 2] = -u1_sign_down;

                edge_idx = gi4_ihevc_table_edge_idx[edge_idx] & au1_mask[col >> 1];

                if(0 != edge_idx)
                {
                    pu1_src[col] = CLIP3(pu1_src[col] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
                }
            }

            pu1_src += src_strd;
        }

        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 2] = u1_pos_wd_0_tmp_u;
        pu1_src[-(pu1_avail[2] ? ht : ht + 1) * src_strd + wd - 1] = u1_pos_wd_0_tmp_v;
        pu1_src[(pu1_avail[3] ?  (-src_strd) : 0)] = u1_pos_0_ht_tmp_u;
        pu1_src[(pu1_avail[3] ?  (-src_strd) : 0) + 1] = u1_pos_0_ht_tmp_v;
    }

    if(0 == pu1_avail[2])
        ht++;
    if(0 == pu1_avail[3])
        ht++;
    pu1_src_top_left[0] = au1_src_top_left_tmp[0];
    pu1_src_top_left[1] = au1_src_top_left_tmp[1];
    for(row = 0; row < 2 * ht; row++)
    {
        pu1_src_left[row] = au1_src_left_tmp[row];
    }
    for(col = 0; col < wd; col++)
    {
        pu1_src_top[col] = au1_src_top_tmp[col];
    }

}