C++程序  |  5708行  |  202.74 KB

/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <stdarg.h>
#include <math.h>
#include <limits.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevce_api.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevc_defs.h"
#include "ihevc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_cabac_tables.h"

#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_multi_thrd_funcs.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_had_satd.h"
#include "ihevce_error_codes.h"
#include "ihevce_bitstream.h"
#include "ihevce_cabac.h"
#include "ihevce_rdoq_macros.h"
#include "ihevce_function_selector.h"
#include "ihevce_enc_structs.h"
#include "ihevce_entropy_structs.h"
#include "ihevce_cmn_utils_instr_set_router.h"
#include "ihevce_enc_loop_structs.h"
#include "ihevce_inter_pred.h"
#include "ihevce_global_tables.h"
#include "ihevce_dep_mngr_interface.h"
#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "hme_defs.h"
#include "ihevce_me_instr_set_router.h"
#include "hme_globals.h"
#include "hme_utils.h"
#include "hme_coarse.h"
#include "hme_fullpel.h"
#include "hme_subpel.h"
#include "hme_refine.h"
#include "hme_err_compute.h"
#include "hme_common_utils.h"
#include "hme_search_algo.h"
#include "ihevce_stasino_helpers.h"
#include "ihevce_common_utils.h"

/*****************************************************************************/
/* Macros                                                                    */
/*****************************************************************************/
#define UNI_SATD_SCALE 1

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
void ihevce_open_loop_pred_data(
    me_frm_ctxt_t *ps_ctxt,
    inter_pu_results_t *ps_pu_results,
    U08 *pu1_src,
    U08 *pu1_temp_pred,
    S32 stride,
    S32 src_strd,
    UWORD8 e_part_id)
{
    S32 best_sad_l0 = -1, best_sad_l1 = -1;
    S32 sad_diff, status;
    inter_pred_me_ctxt_t *ps_inter_pred_me_ctxt;
    U08 enable_bi = 0;
    pu_t s_pu;

    ps_inter_pred_me_ctxt = &ps_ctxt->s_mc_ctxt;
    ps_ctxt->i4_count++;
    /* L0*/
    if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
    {
        pu_result_t *ps_best_l0_pu;
        ps_best_l0_pu = ps_pu_results->aps_pu_results[0][PRT_2Nx2N];
        best_sad_l0 = ps_best_l0_pu->i4_tot_cost - ps_best_l0_pu->i4_mv_cost;
        s_pu.b2_pred_mode = PRED_L0;
        s_pu.b4_ht = ps_best_l0_pu->pu.b4_ht;
        s_pu.b4_wd = ps_best_l0_pu->pu.b4_wd;
        s_pu.b4_pos_x = ps_best_l0_pu->pu.b4_pos_x;
        s_pu.b4_pos_y = ps_best_l0_pu->pu.b4_pos_y;
        s_pu.b1_intra_flag = 0;
        s_pu.mv.s_l0_mv.i2_mvx = ps_best_l0_pu->pu.mv.s_l0_mv.i2_mvx;
        s_pu.mv.s_l0_mv.i2_mvy = ps_best_l0_pu->pu.mv.s_l0_mv.i2_mvy;
        s_pu.mv.i1_l0_ref_idx = ps_best_l0_pu->pu.mv.i1_l0_ref_idx;
    }
    /*L1*/
    if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
    {
        pu_result_t *ps_best_l1_pu;
        ps_best_l1_pu = ps_pu_results->aps_pu_results[1][PRT_2Nx2N];
        best_sad_l1 = ps_best_l1_pu->i4_tot_cost - ps_best_l1_pu->i4_mv_cost;
        s_pu.b2_pred_mode = PRED_L1;
        s_pu.b4_ht = ps_best_l1_pu->pu.b4_ht;
        s_pu.b4_wd = ps_best_l1_pu->pu.b4_wd;
        s_pu.b4_pos_x = ps_best_l1_pu->pu.b4_pos_x;
        s_pu.b4_pos_y = ps_best_l1_pu->pu.b4_pos_y;
        s_pu.b1_intra_flag = 0;
        s_pu.mv.s_l1_mv.i2_mvx = ps_best_l1_pu->pu.mv.s_l1_mv.i2_mvx;
        s_pu.mv.s_l1_mv.i2_mvy = ps_best_l1_pu->pu.mv.s_l1_mv.i2_mvy;
        s_pu.mv.i1_l1_ref_idx = ps_best_l1_pu->pu.mv.i1_l1_ref_idx;
    }
    ASSERT((best_sad_l0 != -1) || (best_sad_l1 != -1));
    /*bi selection*/
    if((best_sad_l0 != -1) && (best_sad_l1 != -1))
    {
        sad_diff = abs(best_sad_l0 - best_sad_l1);
        if((sad_diff < (best_sad_l0 * 0.15)) && (sad_diff < (best_sad_l1 * 0.15)))
        {
            enable_bi = 1;
            s_pu.b2_pred_mode = PRED_BI;
        }
        if(!enable_bi)
        {
            if(best_sad_l0 < best_sad_l1)
            {
                s_pu.b2_pred_mode = PRED_L0;
            }
            else
            {
                s_pu.b2_pred_mode = PRED_L1;
            }
        }
    }
    status = ihevce_luma_inter_pred_pu(ps_inter_pred_me_ctxt, &s_pu, pu1_temp_pred, stride, 1);
    if(status == -1)
    {
        ASSERT(0);
    }
}

/**
********************************************************************************
*  @fn     void *hme_get_wkg_mem(buf_mgr_t *ps_buf_mgr, S32 i4_size)
*
*  @brief  Allocates a block of size = i4_size from working memory and returns
*
*  @param[in,out] ps_buf_mgr: Buffer manager for wkg memory
*
*  @param[in]  i4_size : size required
*
*  @return void pointer to allocated memory, NULL if failure
********************************************************************************
*/
void *hme_get_wkg_mem(buf_mgr_t *ps_buf_mgr, S32 i4_size)
{
    U08 *pu1_mem;

    if(ps_buf_mgr->i4_used + i4_size > ps_buf_mgr->i4_total)
        return NULL;

    pu1_mem = ps_buf_mgr->pu1_wkg_mem + ps_buf_mgr->i4_used;
    ps_buf_mgr->i4_used += i4_size;

    return ((void *)pu1_mem);
}

/**
********************************************************************************
*  @fn     hme_init_histogram(
*
*  @brief  Top level entry point for Coarse ME. Runs across blocks and does the
*          needful by calling other low level routines.
*
*  @param[in,out]  ps_hist : the histogram structure
*
*  @param[in]  i4_max_mv_x : Maximum mv allowed in x direction (fpel units)
*
*  @param[in]  i4_max_mv_y : Maximum mv allowed in y direction (fpel units)
*
*  @return None
********************************************************************************
*/

void hme_init_histogram(mv_hist_t *ps_hist, S32 i4_max_mv_x, S32 i4_max_mv_y)
{
    S32 i4_num_bins, i4_num_cols, i4_num_rows;
    S32 i4_shift_x, i4_shift_y, i, i4_range, i4_val;

    /*************************************************************************/
    /* Evaluate the shift_x and shift_y. For this, we use the following logic*/
    /* Assuming that we use up all MAX_NUM_BINS. Then the number of bins is  */
    /* given by formula ((max_mv_x * 2) >> shift_x)*((max_mv_y * 2)>>shift_y)*/
    /* or shift_x + shift_y is log ((max_mv_x * max_mv_y * 4) / MAX_NUM_BINS)*/
    /* if above quantity is negative, then we make it zero.                  */
    /* If result is odd, then shift_y is result >> 1, shift_x is shift_y + 1 */
    /*************************************************************************/
    i4_val = i4_max_mv_x * i4_max_mv_y * 4;
    i4_range = (hme_get_range(i4_val - 1)) + 1;
    if(i4_range > LOG_MAX_NUM_BINS)
    {
        i4_shift_y = (i4_range - LOG_MAX_NUM_BINS);
        i4_shift_x = (i4_shift_y + 1) >> 1;
        i4_shift_y >>= 1;
    }
    else
    {
        i4_shift_y = 0;
        i4_shift_x = 0;
    }

    /* we assume the mv range is -max_mv_x to +max_mv_x, ditto for y */
    /* So number of columns is 2*max_mv_x >> i4_shift_x. Ditto for rows */
    /* this helps us compute num bins that are active for this histo session */
    i4_num_cols = (i4_max_mv_x << 1) >> i4_shift_x;
    i4_num_rows = (i4_max_mv_y << 1) >> i4_shift_y;
    i4_num_bins = i4_num_rows * i4_num_cols;

    ASSERT(i4_num_bins <= MAX_NUM_BINS);

    ps_hist->i4_num_rows = i4_num_rows;
    ps_hist->i4_num_cols = i4_num_cols;
    ps_hist->i4_min_x = -i4_max_mv_x;
    ps_hist->i4_min_y = -i4_max_mv_y;
    ps_hist->i4_shift_x = i4_shift_x;
    ps_hist->i4_shift_y = i4_shift_y;
    ps_hist->i4_lobe1_size = 5;
    ps_hist->i4_lobe2_size = 3;

    ps_hist->i4_num_bins = i4_num_bins;

    for(i = 0; i < i4_num_bins; i++)
    {
        ps_hist->ai4_bin_count[i] = 0;
    }
}

/**
********************************************************************************
*  @fn     hme_update_histogram(
*
*  @brief  Updates the histogram given an mv entry
*
*  @param[in,out]  ps_hist : the histogram structure
*
*  @param[in]  i4_mv_x : x component of the mv (fpel units)
*
*  @param[in]  i4_mv_y : y component of the mv (fpel units)
*
*  @return None
********************************************************************************
*/
void hme_update_histogram(mv_hist_t *ps_hist, S32 i4_mv_x, S32 i4_mv_y)
{
    S32 i4_bin_index, i4_col, i4_row;

    i4_col = (i4_mv_x - ps_hist->i4_min_x) >> ps_hist->i4_shift_x;
    i4_row = (i4_mv_y - ps_hist->i4_min_y) >> ps_hist->i4_shift_y;

    i4_bin_index = i4_col + (i4_row * ps_hist->i4_num_cols);
    /* Sanity Check */
    ASSERT(i4_bin_index < MAX_NUM_BINS);

    ps_hist->ai4_bin_count[i4_bin_index]++;
}

/**
********************************************************************************
*  @fn     hme_get_global_mv(
*
*  @brief  returns the global mv of a previous picture. Accounts for the fact
*          that the delta poc of the previous picture may have been different
*          from delta poc of current picture. Delta poc is POC difference
*          between a picture and its reference.
*
*  @param[out]  ps_mv: mv_t structure where the motion vector is returned
*
*  @param[in]  i4_delta_poc: the delta poc for the current pic w.r.t. reference
*
*  @return None
********************************************************************************
*/
void hme_get_global_mv(layer_ctxt_t *ps_prev_layer, hme_mv_t *ps_mv, S32 i4_delta_poc)
{
    S16 i2_mv_x, i2_mv_y;
    S32 i4_delta_poc_prev;
    S32 i4_poc_prev = ps_prev_layer->i4_poc;
    S32 i4_poc_prev_ref = ps_prev_layer->ai4_ref_id_to_poc_lc[0];

    i4_delta_poc_prev = i4_poc_prev - i4_poc_prev_ref;
    i2_mv_x = ps_prev_layer->s_global_mv[0][GMV_THICK_LOBE].i2_mv_x;
    i2_mv_y = ps_prev_layer->s_global_mv[0][GMV_THICK_LOBE].i2_mv_y;

    i2_mv_x = (S16)((i2_mv_x * i4_delta_poc) / i4_delta_poc_prev);
    i2_mv_y = (S16)((i2_mv_y * i4_delta_poc) / i4_delta_poc_prev);

    ps_mv->i2_mv_x = i2_mv_x;
    ps_mv->i2_mv_y = i2_mv_y;
}

/**
********************************************************************************
*  @fn     hme_calculate_global_mv(
*
*  @brief  Calculates global mv for a given histogram
*
*  @param[in]  ps_hist : the histogram structure
*
*  @param[in]  ps_mv : used to return the global mv
*
*  @param[in]  e_lobe_type : refer to GMV_MVTYPE_T
*
*  @return None
********************************************************************************
*/
void hme_calculate_global_mv(mv_hist_t *ps_hist, hme_mv_t *ps_mv, GMV_MVTYPE_T e_lobe_type)
{
    S32 i4_offset, i4_lobe_size, i4_y, i4_x, *pi4_bin_count;
    S32 i4_max_sum = -1;
    S32 i4_max_x = 0, i4_max_y = 0;

    if(e_lobe_type == GMV_THICK_LOBE)
        i4_lobe_size = ps_hist->i4_lobe1_size;
    else
        i4_lobe_size = ps_hist->i4_lobe2_size;

    i4_offset = i4_lobe_size >> 1;
    for(i4_y = i4_offset; i4_y < ps_hist->i4_num_rows - i4_offset; i4_y++)
    {
        for(i4_x = i4_offset; i4_x < ps_hist->i4_num_cols - i4_offset; i4_x++)
        {
            S32 i4_bin_id, i4_sum;
            i4_bin_id = (i4_x - 2) + ((i4_y - 2) * ps_hist->i4_num_cols);

            pi4_bin_count = &ps_hist->ai4_bin_count[i4_bin_id];
            i4_sum = hme_compute_2d_sum_unsigned(
                (void *)pi4_bin_count,
                i4_lobe_size,
                i4_lobe_size,
                ps_hist->i4_num_cols,
                sizeof(U32));

            if(i4_sum > i4_max_sum)
            {
                i4_max_x = i4_x;
                i4_max_y = i4_y;
                i4_max_sum = i4_sum;
            }
        }
    }

    ps_mv->i2_mv_y = (S16)((i4_max_y << ps_hist->i4_shift_y) + ps_hist->i4_min_y);
    ps_mv->i2_mv_x = (S16)((i4_max_x << ps_hist->i4_shift_x) + ps_hist->i4_min_x);
}

/**
********************************************************************************
*  @fn    ctb_node_t *hme_get_ctb_node(ctb_mem_mgr_t *ps_mem_mgr)
*
*  @brief  returns a new ctb node usable for creating a new ctb candidate
*
*  @param[in] ps_mem_mgr : memory manager holding all ctb nodes
*
*  @return NULL if no free nodes, else ptr to the new ctb node
********************************************************************************
*/
ctb_node_t *hme_get_ctb_node(ctb_mem_mgr_t *ps_mem_mgr)
{
    U08 *pu1_ret;
    if((ps_mem_mgr->i4_used + ps_mem_mgr->i4_size) > ps_mem_mgr->i4_tot)
        return (NULL);
    pu1_ret = ps_mem_mgr->pu1_mem + ps_mem_mgr->i4_used;
    ps_mem_mgr->i4_used += ps_mem_mgr->i4_size;
    return ((ctb_node_t *)pu1_ret);
}

/**
********************************************************************************
*  @fn     hme_map_mvs_to_grid(mv_grid_t **pps_mv_grid,
search_results_t *ps_search_results, S32 i4_num_ref)
*
*  @brief  For a given CU whose results are in ps_search_results, the 17x17
*          mv grid is updated for future use within the CTB
*
*  @param[in] ps_search_results : Search results data structure
*
*  @param[out] pps_mv_grid: The mv grid (as many as num ref)
*
*  @param[in]  i4_num_ref: nuber of search iterations to update
*
*  @return None
********************************************************************************
*/
void hme_map_mvs_to_grid(
    mv_grid_t **pps_mv_grid,
    search_results_t *ps_search_results,
    U08 *pu1_pred_dir_searched,
    S32 i4_num_pred_dir)
{
    S32 i4_cu_start_offset;
    /*************************************************************************/
    /* Start x, y offset of CU relative to CTB. To update the mv grid which  */
    /* stores 1 mv per 4x4, we convert pixel offset to 4x4 blk offset        */
    /*************************************************************************/
    S32 i4_cu_offset_x = (S32)ps_search_results->u1_x_off >> 2;
    S32 i4_cu_offset_y = (S32)ps_search_results->u1_y_off >> 2;

    /* Controls the attribute of a given partition within CU   */
    /* , i.e. start locn, size                                 */
    part_attr_t *ps_part_attr;

    S32 i4_part, i4_part_id, num_parts, i4_stride;
    S16 i2_mv_x, i2_mv_y;
    S08 i1_ref_idx;

    /* Per partition, attributes w.r.t. CU start */
    S32 x_start, y_start, x_end, y_end, i4_x, i4_y;
    PART_TYPE_T e_part_type;

    /* Points to exact mv structures within the grid to be udpated */
    search_node_t *ps_grid_node, *ps_grid_node_tmp;

    /* points to exact mv grid (based on search iteration) to be updated */
    mv_grid_t *ps_mv_grid;

    search_node_t *ps_search_node;

    S32 shift, i, mv_shift = 2;
    /* Proportional to the size of CU, controls the number of 4x4 blks */
    /* to be updated                                                   */
    shift = ps_search_results->e_cu_size;
    ASSERT(i4_num_pred_dir <= 2);

    e_part_type = (PART_TYPE_T)ps_search_results->ps_cu_results->ps_best_results[0].u1_part_type;

    if((ps_search_results->e_cu_size == CU_16x16) && (ps_search_results->u1_split_flag) &&
       (ps_search_results->i4_part_mask & ENABLE_NxN))
    {
        e_part_type = PRT_NxN;
    }

    for(i = 0; i < i4_num_pred_dir; i++)
    {
        num_parts = gau1_num_parts_in_part_type[e_part_type];
        ps_mv_grid = pps_mv_grid[pu1_pred_dir_searched[i]];
        i4_stride = ps_mv_grid->i4_stride;

        i4_cu_start_offset =
            i4_cu_offset_x + i4_cu_offset_y * i4_stride + ps_mv_grid->i4_start_offset;

        /* Move to the appropriate 2d locn of CU start within Grid */
        ps_grid_node = &ps_mv_grid->as_node[i4_cu_start_offset];

        for(i4_part = 0; i4_part < num_parts; i4_part++)
        {
            i4_part_id = ge_part_type_to_part_id[e_part_type][i4_part];

            /* Pick the mvx and y and ref id corresponding to this partition */
            ps_search_node =
                ps_search_results->aps_part_results[pu1_pred_dir_searched[i]][i4_part_id];

            i2_mv_x = ps_search_node->s_mv.i2_mvx;
            i2_mv_y = ps_search_node->s_mv.i2_mvy;
            i1_ref_idx = ps_search_node->i1_ref_idx;

            /* Move to the appropriate location within the CU */
            ps_part_attr = &gas_part_attr_in_cu[i4_part_id];
            x_start = ps_part_attr->u1_x_start;
            x_end = x_start + ps_part_attr->u1_x_count;
            y_start = ps_part_attr->u1_y_start;
            y_end = y_start + ps_part_attr->u1_y_count;

            /* Convert attributes from 8x8 CU size to given CU size */
            x_start = (x_start << shift) >> mv_shift;
            x_end = (x_end << shift) >> mv_shift;
            y_start = (y_start << shift) >> mv_shift;
            y_end = (y_end << shift) >> mv_shift;

            ps_grid_node_tmp = ps_grid_node + y_start * i4_stride;

            /* Update all 4x4 blk mvs with the part mv */
            /* For e.g. we update 4 units in case of NxN for 16x16 CU */
            for(i4_y = y_start; i4_y < y_end; i4_y++)
            {
                for(i4_x = x_start; i4_x < x_end; i4_x++)
                {
                    ps_grid_node_tmp[i4_x].s_mv.i2_mvx = i2_mv_x;
                    ps_grid_node_tmp[i4_x].s_mv.i2_mvy = i2_mv_y;
                    ps_grid_node_tmp[i4_x].i1_ref_idx = i1_ref_idx;
                    ps_grid_node_tmp[i4_x].u1_subpel_done = 1;
                }
                ps_grid_node_tmp += i4_stride;
            }
        }
    }
}

void hme_set_ctb_pred_attr(ctb_node_t *ps_parent, U08 *pu1_pred0, U08 *pu1_pred1, S32 i4_stride)
{
    ps_parent->apu1_pred[0] = pu1_pred0;
    ps_parent->apu1_pred[1] = pu1_pred1;
    ps_parent->i4_pred_stride = i4_stride;
    if(ps_parent->ps_tl != NULL)
    {
        S32 blk_wd = (S32)ps_parent->ps_tr->u1_x_off;
        blk_wd -= (S32)ps_parent->u1_x_off;

        hme_set_ctb_pred_attr(ps_parent->ps_tl, pu1_pred0, pu1_pred1, i4_stride >> 1);

        hme_set_ctb_pred_attr(
            ps_parent->ps_tr, pu1_pred0 + blk_wd, pu1_pred1 + blk_wd, i4_stride >> 1);

        hme_set_ctb_pred_attr(
            ps_parent->ps_bl,
            pu1_pred0 + (blk_wd * i4_stride),
            pu1_pred1 + (blk_wd * i4_stride),
            i4_stride >> 1);

        hme_set_ctb_pred_attr(
            ps_parent->ps_tr,
            pu1_pred0 + (blk_wd * (1 + i4_stride)),
            pu1_pred1 + (blk_wd * (1 + i4_stride)),
            i4_stride >> 1);
    }
}

/**
********************************************************************************
*  @fn     hme_create_valid_part_ids(S32 i4_part_mask, S32 *pi4_valid_part_ids)
*
*  @brief  Expands the part mask to a list of valid part ids terminated by -1
*
*  @param[in] i4_part_mask : bit mask of active partitino ids
*
*  @param[out] pi4_valid_part_ids : array, each entry has one valid part id
*               Terminated by -1 to signal end.
*
*  @return number of partitions
********************************************************************************
*/
S32 hme_create_valid_part_ids(S32 i4_part_mask, S32 *pi4_valid_part_ids)
{
    S32 id = 0, i;
    for(i = 0; i < TOT_NUM_PARTS; i++)
    {
        if(i4_part_mask & (1 << i))
        {
            pi4_valid_part_ids[id] = i;
            id++;
        }
    }
    pi4_valid_part_ids[id] = -1;

    return id;
}

ctb_boundary_attrs_t *
    get_ctb_attrs(S32 ctb_start_x, S32 ctb_start_y, S32 pic_wd, S32 pic_ht, me_frm_ctxt_t *ps_ctxt)
{
    S32 horz_crop, vert_crop;
    ctb_boundary_attrs_t *ps_attrs;

    horz_crop = ((ctb_start_x + 64) > pic_wd) ? 2 : 0;
    vert_crop = ((ctb_start_y + 64) > pic_ht) ? 1 : 0;
    switch(horz_crop + vert_crop)
    {
    case 0:
        ps_attrs = &ps_ctxt->as_ctb_bound_attrs[CTB_CENTRE];
        break;
    case 1:
        ps_attrs = &ps_ctxt->as_ctb_bound_attrs[CTB_BOT_PIC_BOUNDARY];
        break;
    case 2:
        ps_attrs = &ps_ctxt->as_ctb_bound_attrs[CTB_RT_PIC_BOUNDARY];
        break;
    case 3:
        ps_attrs = &ps_ctxt->as_ctb_bound_attrs[CTB_BOT_RT_PIC_BOUNDARY];
        break;
    }
    return (ps_attrs);
}

/**
********************************************************************************
*  @fn     hevc_avg_2d(U08 *pu1_src1,
*                   U08 *pu1_src2,
*                   S32 i4_src1_stride,
*                   S32 i4_src2_stride,
*                   S32 i4_blk_wd,
*                   S32 i4_blk_ht,
*                   U08 *pu1_dst,
*                   S32 i4_dst_stride)
*
*
*  @brief  point wise average of two buffers into a third buffer
*
*  @param[in] pu1_src1 : first source buffer
*
*  @param[in] pu1_src2 : 2nd source buffer
*
*  @param[in] i4_src1_stride : stride of source 1 buffer
*
*  @param[in] i4_src2_stride : stride of source 2 buffer
*
*  @param[in] i4_blk_wd : block width
*
*  @param[in] i4_blk_ht : block height
*
*  @param[out] pu1_dst : destination buffer
*
*  @param[in] i4_dst_stride : stride of the destination buffer
*
*  @return void
********************************************************************************
*/
void hevc_avg_2d(
    U08 *pu1_src1,
    U08 *pu1_src2,
    S32 i4_src1_stride,
    S32 i4_src2_stride,
    S32 i4_blk_wd,
    S32 i4_blk_ht,
    U08 *pu1_dst,
    S32 i4_dst_stride)
{
    S32 i, j;

    for(i = 0; i < i4_blk_ht; i++)
    {
        for(j = 0; j < i4_blk_wd; j++)
        {
            pu1_dst[j] = (pu1_src1[j] + pu1_src2[j] + 1) >> 1;
        }
        pu1_src1 += i4_src1_stride;
        pu1_src2 += i4_src2_stride;
        pu1_dst += i4_dst_stride;
    }
}
/**
********************************************************************************
*  @fn     hme_pick_back_search_node(search_results_t *ps_search_results,
*                                   search_node_t *ps_search_node_fwd,
*                                   S32 i4_part_idx,
*                                   layer_ctxt_t *ps_curr_layer)
*
*
*  @brief  returns the search node corresponding to a ref idx in same or
*          opp direction. Preference is given to opp direction, but if that
*          does not yield results, same direction is attempted.
*
*  @param[in] ps_search_results: search results overall
*
*  @param[in] ps_search_node_fwd: search node corresponding to "fwd" direction
*
*  @param[in] i4_part_idx : partition id
*
*  @param[in] ps_curr_layer : layer context for current layer.
*
*  @return search node corresponding to hte "other direction"
********************************************************************************
*/
//#define PICK_L1_REF_SAME_DIR
search_node_t *hme_pick_back_search_node(
    search_results_t *ps_search_results,
    search_node_t *ps_search_node_fwd,
    S32 i4_part_idx,
    layer_ctxt_t *ps_curr_layer)
{
    S32 is_past_l0, is_past_l1, id, i, i4_poc;
    S32 *pi4_ref_id_to_poc_lc = ps_curr_layer->ai4_ref_id_to_poc_lc;
    //ref_attr_t *ps_ref_attr_lc;
    S08 i1_ref_idx_fwd;
    S16 i2_mv_x, i2_mv_y;
    search_node_t *ps_search_node;

    i1_ref_idx_fwd = ps_search_node_fwd->i1_ref_idx;
    i2_mv_x = ps_search_node_fwd->s_mv.i2_mvx;
    i2_mv_y = ps_search_node_fwd->s_mv.i2_mvy;
    i4_poc = ps_curr_layer->i4_poc;

    //ps_ref_attr_lc = &ps_curr_layer->as_ref_attr_lc[0];
    /* If the ref id already picked up maps to a past pic, then we pick */
    /* a result corresponding to future pic. If such a result is not    */
    /* to be found, then we pick a result corresponding to a past pic   */
    //is_past = ps_ref_attr_lc[i1_ref_idx_fwd].u1_is_past;
    is_past_l0 = (i4_poc > pi4_ref_id_to_poc_lc[i1_ref_idx_fwd]) ? 1 : 0;

    ASSERT(ps_search_results->u1_num_active_ref <= 2);

    /* pick the right iteration of search nodes to pick up */
#ifdef PICK_L1_REF_SAME_DIR
    if(ps_search_results->u1_num_active_ref == 2)
        id = !is_past_l0;
#else
    if(ps_search_results->u1_num_active_ref == 2)
        id = is_past_l0;
#endif
    else
        id = 0;

    ps_search_node = ps_search_results->aps_part_results[id][i4_part_idx];

    for(i = 0; i < ps_search_results->u1_num_results_per_part; i++)
    {
        S08 i1_ref_test = ps_search_node[i].i1_ref_idx;
        is_past_l1 = (pi4_ref_id_to_poc_lc[i1_ref_test] < i4_poc) ? 1 : 0;
        //if (ps_ref_attr_lc[ps_search_node[i].i1_ref_idx].u1_is_past != is_past)
#ifdef PICK_L1_REF_SAME_DIR
        if(is_past_l1 == is_past_l0)
#else
        if(is_past_l1 != is_past_l0)
#endif
        {
            /* belongs to same direction as the ref idx passed, so continue */
            return (ps_search_node + i);
        }
    }

    /* Unable to find best result in opp direction, so try same direction */
    /* However we need to ensure that we do not pick up same result       */
    for(i = 0; i < ps_search_results->u1_num_results_per_part; i++)
    {
        if((ps_search_node->i1_ref_idx != i1_ref_idx_fwd) ||
           (ps_search_node->s_mv.i2_mvx != i2_mv_x) || (ps_search_node->s_mv.i2_mvy != i2_mv_y))
        {
            return (ps_search_node);
        }
        ps_search_node++;
    }

    //ASSERT(0);
    return (ps_search_results->aps_part_results[id][i4_part_idx]);

    //return (NULL);
}

/**
********************************************************************************
*  @fn     hme_study_input_segmentation(U08 *pu1_inp, S32 i4_inp_stride)
*
*
*  @brief  Examines input 16x16 for possible edges and orientations of those,
*          and returns a bit mask of partitions that should be searched for
*
*  @param[in] pu1_inp : input buffer
*
*  @param[in] i4_inp_stride: input stride
*
*  @return part mask (bit mask of active partitions to search)
********************************************************************************
*/

S32 hme_study_input_segmentation(U08 *pu1_inp, S32 i4_inp_stride, S32 limit_active_partitions)
{
    S32 i4_rsum[16], i4_csum[16];
    U08 *pu1_tmp, u1_tmp;
    S32 i4_max_ridx, i4_max_cidx, i4_tmp;
    S32 i, j, i4_ret;
    S32 i4_max_rp[4], i4_max_cp[4];
    S32 i4_seg_lutc[4] = { 0, ENABLE_nLx2N, ENABLE_Nx2N, ENABLE_nRx2N };
    S32 i4_seg_lutr[4] = { 0, ENABLE_2NxnU, ENABLE_2NxN, ENABLE_2NxnD };
#define EDGE_THR (15 * 16)
#define HI_PASS(ptr, i) (2 * (ptr[i] - ptr[i - 1]) + (ptr[i + 1] - ptr[i - 2]))

    if(0 == limit_active_partitions)
    {
        /*********************************************************************/
        /* In this case, we do not optimize on active partitions and search  */
        /* brute force. This way, 17 partitinos would be enabled.            */
        /*********************************************************************/
        return (ENABLE_ALL_PARTS);
    }

    /*************************************************************************/
    /* Control passes below in case we wish to optimize on active partitions.*/
    /* This is based on input characteristics, check how an edge passes along*/
    /* an input 16x16 area, if at all, and decide active partitinos.         */
    /*************************************************************************/

    /* Initialize row and col sums */
    for(i = 0; i < 16; i++)
    {
        i4_rsum[i] = 0;
        i4_csum[i] = 0;
    }
    pu1_tmp = pu1_inp;
    for(i = 0; i < 16; i++)
    {
        for(j = 0; j < 16; j++)
        {
            u1_tmp = *pu1_tmp++;
            i4_rsum[i] += u1_tmp;
            i4_csum[j] += u1_tmp;
        }
        pu1_tmp += (i4_inp_stride - 16);
    }

    /* 0 is dummy; 1 is 4; 2 is 8; 3 is 12 */
    i4_max_rp[0] = 0;
    i4_max_cp[0] = 0;
    i4_max_rp[1] = 0;
    i4_max_cp[1] = 0;
    i4_max_rp[2] = 0;
    i4_max_cp[2] = 0;
    i4_max_rp[3] = 0;
    i4_max_cp[3] = 0;

    /* Get Max edge strength across (2,3) (3,4) (4,5) */
    for(i = 3; i < 6; i++)
    {
        /* Run [-1 -2 2 1] filter through rsum/csum */
        i4_tmp = HI_PASS(i4_rsum, i);
        if(ABS(i4_tmp) > i4_max_rp[1])
            i4_max_rp[1] = i4_tmp;

        i4_tmp = HI_PASS(i4_csum, i);
        if(ABS(i4_tmp) > i4_max_cp[1])
            i4_max_cp[1] = i4_tmp;
    }

    /* Get Max edge strength across (6,7) (7,8) (8,9) */
    for(i = 7; i < 10; i++)
    {
        /* Run [-1 -2 2 1] filter through rsum/csum */
        i4_tmp = HI_PASS(i4_rsum, i);
        if(ABS(i4_tmp) > i4_max_rp[2])
            i4_max_rp[2] = i4_tmp;

        i4_tmp = HI_PASS(i4_csum, i);
        if(ABS(i4_tmp) > i4_max_cp[2])
            i4_max_cp[2] = i4_tmp;
    }

    /* Get Max edge strength across (10,11) (11,12) (12,13) */
    for(i = 11; i < 14; i++)
    {
        /* Run [-1 -2 2 1] filter through rsum/csum */
        i4_tmp = HI_PASS(i4_rsum, i);
        if(ABS(i4_tmp) > i4_max_rp[3])
            i4_max_rp[3] = i4_tmp;

        i4_tmp = HI_PASS(i4_csum, i);
        if(ABS(i4_tmp) > i4_max_cp[3])
            i4_max_cp[3] = i4_tmp;
    }

    /* Find the maximum across the 3 and see whether the strength qualifies as edge */
    i4_max_ridx = 1;
    i4_max_cidx = 1;
    for(i = 2; i <= 3; i++)
    {
        if(i4_max_rp[i] > i4_max_rp[i4_max_ridx])
            i4_max_ridx = i;

        if(i4_max_cp[i] > i4_max_cp[i4_max_cidx])
            i4_max_cidx = i;
    }

    if(EDGE_THR > i4_max_rp[i4_max_ridx])
    {
        i4_max_ridx = 0;
    }

    if(EDGE_THR > i4_max_cp[i4_max_cidx])
    {
        i4_max_cidx = 0;
    }

    i4_ret = ENABLE_2Nx2N;

    /* If only vertical discontinuity, go with one of 2Nx? */
    if(0 == (i4_max_ridx + i4_max_cidx))
    {
        //num_me_parts++;
        return i4_ret;
    }

    if(i4_max_ridx && (i4_max_cidx == 0))
    {
        //num_me_parts += 3;
        return ((i4_ret | i4_seg_lutr[i4_max_ridx]));
    }

    /* If only horizontal discontinuity, go with one of ?x2N */
    if(i4_max_cidx && (i4_max_ridx == 0))
    {
        //num_me_parts += 3;
        return ((i4_ret | i4_seg_lutc[i4_max_cidx]));
    }

    /* If middle is dominant in both directions, go with NxN */
    if((2 == i4_max_cidx) && (2 == i4_max_ridx))
    {
        //num_me_parts += 5;
        return ((i4_ret | ENABLE_NxN));
    }

    /* Otherwise, conservatively, enable NxN and the 2 AMPs */
    //num_me_parts += 9;
    return (i4_ret | ENABLE_NxN | i4_seg_lutr[i4_max_ridx] | i4_seg_lutc[i4_max_cidx]);
}

/**
********************************************************************************
*  @fn     hme_init_search_results(search_results_t *ps_search_results,
*                           S32 i4_num_ref,
*                           S32 i4_num_best_results,
*                           S32 i4_num_results_per_part,
*                           BLK_SIZE_T e_blk_size,
*                           S32 i4_x_off,
*                           S32 i4_y_off)
*
*  @brief  Initializes the search results structure with some key attributes
*
*  @param[out] ps_search_results : search results structure to initialise
*
*  @param[in] i4_num_Ref: corresponds to the number of ref ids searched
*
*  @param[in] i4_num_best_results: Number of best results for the CU to
*               be maintained in the result structure
*
*  @param[in] i4_num_results_per_part: Per active partition the number of best
*               results to be maintained
*
*  @param[in] e_blk_size: blk size of the CU for which this structure used
*
*  @param[in] i4_x_off: x offset of the top left of CU from CTB top left
*
*  @param[in] i4_y_off: y offset of the top left of CU from CTB top left
*
*  @param[in] pu1_is_past : points ot an array that tells whether a given ref id
*              has prominence in L0 or in L1 list (past or future )
*
*  @return void
********************************************************************************
*/
void hme_init_search_results(
    search_results_t *ps_search_results,
    S32 i4_num_ref,
    S32 i4_num_best_results,
    S32 i4_num_results_per_part,
    BLK_SIZE_T e_blk_size,
    S32 i4_x_off,
    S32 i4_y_off,
    U08 *pu1_is_past)
{
    CU_SIZE_T e_cu_size = ge_blk_size_to_cu_size[e_blk_size];

    ASSERT(e_cu_size != -1);
    ps_search_results->e_cu_size = e_cu_size;
    ps_search_results->u1_x_off = (U08)i4_x_off;
    ps_search_results->u1_y_off = (U08)i4_y_off;
    ps_search_results->u1_num_active_ref = (U08)i4_num_ref;
    ps_search_results->u1_num_best_results = (U08)i4_num_best_results;
    ps_search_results->u1_num_results_per_part = (U08)i4_num_results_per_part;
    ps_search_results->pu1_is_past = pu1_is_past;
    ps_search_results->u1_split_flag = 0;
    ps_search_results->best_cu_cost = MAX_32BIT_VAL;
}

/**
********************************************************************************
*  @fn     hme_reset_search_results((search_results_t *ps_search_results,
*                               S32 i4_part_mask)
*
*
*  @brief  Resets the best results to maximum values, so as to allow search
*          for the new CU's partitions. The existing results may be from an
*          older CU using same structure.
*
*  @param[in] ps_search_results: search results structure
*
*  @param[in] i4_part_mask : bit mask of active partitions
*
*  @return part mask (bit mask of active partitions to search)
********************************************************************************
*/
void hme_reset_search_results(search_results_t *ps_search_results, S32 i4_part_mask, S32 mv_res)
{
    S32 i4_num_ref = (S32)ps_search_results->u1_num_active_ref;
    S08 i1_ref_idx;
    S32 i, j;
    search_node_t *ps_search_node;

    /* store this for future use */
    ps_search_results->i4_part_mask = i4_part_mask;

    /* Reset the spli_flag to zero */
    ps_search_results->u1_split_flag = 0;

    HME_SET_MVPRED_RES((&ps_search_results->as_pred_ctxt[0]), mv_res);
    HME_SET_MVPRED_RES((&ps_search_results->as_pred_ctxt[1]), mv_res);

    for(i1_ref_idx = 0; i1_ref_idx < i4_num_ref; i1_ref_idx++)
    {
        /* Reset the individual partitino results */
        for(i = 0; i < TOT_NUM_PARTS; i++)
        {
            if(!(i4_part_mask & (1 << i)))
                continue;

            ps_search_node = ps_search_results->aps_part_results[i1_ref_idx][i];

            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
            {
                ps_search_node[j].s_mv.i2_mvx = 0;
                ps_search_node[j].s_mv.i2_mvy = 0;
                ps_search_node[j].i4_tot_cost = MAX_32BIT_VAL;
                ps_search_node[j].i4_sad = MAX_32BIT_VAL;
                ps_search_node[j].i4_sdi = 0;
                ps_search_node[j].i1_ref_idx = -1;
                ps_search_node[j].u1_subpel_done = 0;
                ps_search_node[j].u1_is_avail = 1;
                ps_search_node[j].i4_mv_cost = 0;
            }
        }
    }
}
/**
********************************************************************************
*  @fn     hme_clamp_grid_by_mvrange(search_node_t *ps_search_node,
*                               S32 i4_step,
*                               range_prms_t *ps_mvrange)
*
*  @brief  Given a central pt within mv range, and a grid of points surrounding
*           this pt, this function returns a grid mask of pts within search rng
*
*  @param[in] ps_search_node: the centre pt of the grid
*
*  @param[in] i4_step: step size of grid
*
*  @param[in] ps_mvrange: structure containing the current mv range
*
*  @return bitmask of the  pts in grid within search range
********************************************************************************
*/
S32 hme_clamp_grid_by_mvrange(search_node_t *ps_search_node, S32 i4_step, range_prms_t *ps_mvrange)
{
    S32 i4_mask = GRID_ALL_PTS_VALID;
    if(ps_search_node->s_mv.i2_mvx + i4_step >= ps_mvrange->i2_max_x)
    {
        i4_mask &= (GRID_RT_3_INVALID);
    }
    if(ps_search_node->s_mv.i2_mvx - i4_step < ps_mvrange->i2_min_x)
    {
        i4_mask &= (GRID_LT_3_INVALID);
    }
    if(ps_search_node->s_mv.i2_mvy + i4_step >= ps_mvrange->i2_max_y)
    {
        i4_mask &= (GRID_BOT_3_INVALID);
    }
    if(ps_search_node->s_mv.i2_mvy - i4_step < ps_mvrange->i2_min_y)
    {
        i4_mask &= (GRID_TOP_3_INVALID);
    }
    return i4_mask;
}

/**
********************************************************************************
*  @fn    layer_ctxt_t *hme_get_past_layer_ctxt(me_ctxt_t *ps_ctxt,
S32 i4_layer_id)
*
*  @brief  returns the layer ctxt of the layer with given id from the temporally
*          previous frame
*
*  @param[in] ps_ctxt : ME context
*
*  @param[in] i4_layer_id : id of layer required
*
*  @return layer ctxt of given layer id in temporally previous frame
********************************************************************************
*/
layer_ctxt_t *hme_get_past_layer_ctxt(
    me_ctxt_t *ps_ctxt, me_frm_ctxt_t *ps_frm_ctxt, S32 i4_layer_id, S32 i4_num_me_frm_pllel)
{
    S32 i4_poc = ps_frm_ctxt->ai4_ref_idx_to_poc_lc[0];
    S32 i;
    layers_descr_t *ps_desc;

    for(i = 0; i < (ps_ctxt->aps_me_frm_prms[0]->max_num_ref * i4_num_me_frm_pllel) + 1; i++)
    {
        ps_desc = &ps_ctxt->as_ref_descr[i];
        if(i4_poc == ps_desc->aps_layers[i4_layer_id]->i4_poc)
            return (ps_desc->aps_layers[i4_layer_id]);
    }
    return NULL;
}

/**
********************************************************************************
*  @fn    layer_ctxt_t *hme_coarse_get_past_layer_ctxt(me_ctxt_t *ps_ctxt,
S32 i4_layer_id)
*
*  @brief  returns the layer ctxt of the layer with given id from the temporally
*          previous frame
*
*  @param[in] ps_ctxt : ME context
*
*  @param[in] i4_layer_id : id of layer required
*
*  @return layer ctxt of given layer id in temporally previous frame
********************************************************************************
*/
layer_ctxt_t *hme_coarse_get_past_layer_ctxt(coarse_me_ctxt_t *ps_ctxt, S32 i4_layer_id)
{
    S32 i4_poc = ps_ctxt->ai4_ref_idx_to_poc_lc[0];
    S32 i;
    layers_descr_t *ps_desc;

    for(i = 0; i < ps_ctxt->max_num_ref + 1 + NUM_BUFS_DECOMP_HME; i++)
    {
        ps_desc = &ps_ctxt->as_ref_descr[i];
        if(i4_poc == ps_desc->aps_layers[i4_layer_id]->i4_poc)
            return (ps_desc->aps_layers[i4_layer_id]);
    }
    return NULL;
}

/**
********************************************************************************
*  @fn    void hme_init_mv_bank(layer_ctxt_t *ps_layer_ctxt,
BLK_SIZE_T e_blk_size,
S32 i4_num_ref,
S32 i4_num_results_per_part)
*
*  @brief  Given a blk size to be used for this layer, this function initialize
*          the mv bank to make it ready to store and return results.
*
*  @param[in, out] ps_layer_ctxt: pointer to layer ctxt
*
*  @param[in] e_blk_size : resolution at which mvs are stored
*
*  @param[in] i4_num_ref: number of reference frames corresponding to which
*              results are stored.
*
*  @param[in] e_blk_size : resolution at which mvs are stored
*
*  @param[in] i4_num_results_per_part : Number of results to be stored per
*               ref idx. So these many best results stored
*
*  @return void
********************************************************************************
*/
void hme_init_mv_bank(
    layer_ctxt_t *ps_layer_ctxt,
    BLK_SIZE_T e_blk_size,
    S32 i4_num_ref,
    S32 i4_num_results_per_part,
    U08 u1_enc)
{
    layer_mv_t *ps_mv_bank;
    hme_mv_t *ps_mv1, *ps_mv2;
    S08 *pi1_ref_id1, *pi1_ref_id2;
    S32 blk_wd, mvs_in_blk, blks_in_row, mvs_in_row, blks_in_col;
    S32 i4_i, i4_j, blk_ht;

    ps_mv_bank = ps_layer_ctxt->ps_layer_mvbank;
    ps_mv_bank->i4_num_mvs_per_ref = i4_num_results_per_part;
    ps_mv_bank->i4_num_ref = i4_num_ref;
    mvs_in_blk = i4_num_ref * i4_num_results_per_part;
    ps_mv_bank->i4_num_mvs_per_blk = mvs_in_blk;

    /*************************************************************************/
    /* Store blk size, from blk size derive blk width and use this to compute*/
    /* number of blocks every row. We also pad to left and top by 1, to      */
    /* support the prediction mechanism.                                     */
    /*************************************************************************/
    ps_mv_bank->e_blk_size = e_blk_size;
    blk_wd = gau1_blk_size_to_wd[e_blk_size];
    blk_ht = gau1_blk_size_to_ht[e_blk_size];

    blks_in_row = (ps_layer_ctxt->i4_wd + (blk_wd - 1)) / blk_wd;
    blks_in_col = (ps_layer_ctxt->i4_ht + (blk_ht - 1)) / blk_ht;

    if(u1_enc)
    {
        /* TODO: CTB64x64 is assumed. FIX according to actual CTB */
        WORD32 num_ctb_cols = ((ps_layer_ctxt->i4_wd + 63) >> 6);
        WORD32 num_ctb_rows = ((ps_layer_ctxt->i4_ht + 63) >> 6);

        blks_in_row = (num_ctb_cols << 3);
        blks_in_col = (num_ctb_rows << 3);
    }

    blks_in_row += 2;
    mvs_in_row = blks_in_row * mvs_in_blk;

    ps_mv_bank->i4_num_blks_per_row = blks_in_row;
    ps_mv_bank->i4_num_mvs_per_row = mvs_in_row;

    /* To ensure run time requirements fall within allocation time request */
    ASSERT(ps_mv_bank->i4_num_mvs_per_row <= ps_mv_bank->max_num_mvs_per_row);

    /*************************************************************************/
    /* Increment by one full row at top for padding and one column in left   */
    /* this gives us the actual start of mv for 0,0 blk                      */
    /*************************************************************************/
    ps_mv_bank->ps_mv = ps_mv_bank->ps_mv_base + mvs_in_row + mvs_in_blk;
    ps_mv_bank->pi1_ref_idx = ps_mv_bank->pi1_ref_idx_base + mvs_in_row + mvs_in_blk;

    memset(ps_mv_bank->ps_mv_base, 0, mvs_in_row * sizeof(hme_mv_t));
    memset(ps_mv_bank->pi1_ref_idx_base, -1, mvs_in_row * sizeof(U08));

    /*************************************************************************/
    /* Initialize top row, left col and right col with zeros since these are */
    /* used as candidates during searches.                                   */
    /*************************************************************************/
    ps_mv1 = ps_mv_bank->ps_mv_base + mvs_in_row;
    ps_mv2 = ps_mv1 + mvs_in_row - mvs_in_blk;
    pi1_ref_id1 = ps_mv_bank->pi1_ref_idx_base + mvs_in_row;
    pi1_ref_id2 = pi1_ref_id1 + mvs_in_row - mvs_in_blk;
    for(i4_i = 0; i4_i < blks_in_col; i4_i++)
    {
        for(i4_j = 0; i4_j < mvs_in_blk; i4_j++)
        {
            ps_mv1[i4_j].i2_mv_x = 0;
            ps_mv1[i4_j].i2_mv_y = 0;
            ps_mv2[i4_j].i2_mv_x = 0;
            ps_mv2[i4_j].i2_mv_y = 0;
            pi1_ref_id1[i4_j] = -1;
            pi1_ref_id2[i4_j] = -1;
        }
        ps_mv1 += mvs_in_row;
        ps_mv2 += mvs_in_row;
        pi1_ref_id1 += mvs_in_row;
        pi1_ref_id2 += mvs_in_row;
    }
}
void hme_fill_mvbank_intra(layer_ctxt_t *ps_layer_ctxt)
{
    layer_mv_t *ps_mv_bank;
    hme_mv_t *ps_mv;
    S08 *pi1_ref_id;
    S32 blk_wd, blks_in_row, mvs_in_row, blks_in_col;
    S32 i, j, blk_ht;
    BLK_SIZE_T e_blk_size;

    ps_mv_bank = ps_layer_ctxt->ps_layer_mvbank;

    /*************************************************************************/
    /* Store blk size, from blk size derive blk width and use this to compute*/
    /* number of blocks every row. We also pad to left and top by 1, to      */
    /* support the prediction mechanism.                                     */
    /*************************************************************************/
    e_blk_size = ps_mv_bank->e_blk_size;
    blk_wd = gau1_blk_size_to_wd[e_blk_size];
    blk_ht = gau1_blk_size_to_wd[e_blk_size];
    blks_in_row = ps_layer_ctxt->i4_wd / blk_wd;
    blks_in_col = ps_layer_ctxt->i4_ht / blk_ht;
    mvs_in_row = blks_in_row * ps_mv_bank->i4_num_mvs_per_blk;

    /*************************************************************************/
    /* Increment by one full row at top for padding and one column in left   */
    /* this gives us the actual start of mv for 0,0 blk                      */
    /*************************************************************************/
    ps_mv = ps_mv_bank->ps_mv;
    pi1_ref_id = ps_mv_bank->pi1_ref_idx;

    for(i = 0; i < blks_in_col; i++)
    {
        for(j = 0; j < blks_in_row; j++)
        {
            ps_mv[j].i2_mv_x = INTRA_MV;
            ps_mv[j].i2_mv_y = INTRA_MV;
            pi1_ref_id[j] = -1;
        }
        ps_mv += ps_mv_bank->i4_num_mvs_per_row;
        pi1_ref_id += ps_mv_bank->i4_num_mvs_per_row;
    }
}

/**
********************************************************************************
*  @fn    void hme_derive_search_range(range_prms_t *ps_range,
*                                   range_prms_t *ps_pic_limit,
*                                   range_prms_t *ps_mv_limit,
*                                   S32 i4_x,
*                                   S32 i4_y,
*                                   S32 blk_wd,
*                                   S32 blk_ht)
*
*  @brief  given picture limits and blk dimensions and mv search limits, obtains
*          teh valid search range such that the blk stays within pic boundaries,
*          where picture boundaries include padded portions of picture
*
*  @param[out] ps_range: updated with actual search range
*
*  @param[in] ps_pic_limit : picture boundaries
*
*  @param[in] ps_mv_limit: Search range limits for the mvs
*
*  @param[in] i4_x : x coordinate of the blk
*
*  @param[in] i4_y : y coordinate of the blk
*
*  @param[in] blk_wd : blk width
*
*  @param[in] blk_ht : blk height
*
*  @return void
********************************************************************************
*/
void hme_derive_search_range(
    range_prms_t *ps_range,
    range_prms_t *ps_pic_limit,
    range_prms_t *ps_mv_limit,
    S32 i4_x,
    S32 i4_y,
    S32 blk_wd,
    S32 blk_ht)
{
    ps_range->i2_max_x =
        MIN((ps_pic_limit->i2_max_x - (S16)blk_wd - (S16)i4_x), ps_mv_limit->i2_max_x);
    ps_range->i2_min_x = MAX((ps_pic_limit->i2_min_x - (S16)i4_x), ps_mv_limit->i2_min_x);
    ps_range->i2_max_y =
        MIN((ps_pic_limit->i2_max_y - (S16)blk_ht - (S16)i4_y), ps_mv_limit->i2_max_y);
    ps_range->i2_min_y = MAX((ps_pic_limit->i2_min_y - (S16)i4_y), ps_mv_limit->i2_min_y);
}

/**
********************************************************************************
*  @fn    void hme_get_spatial_candt(search_node_t *ps_search_node,
*                                   layer_ctxt_t *ps_curr_layer,
*                                   S32 i4_blk_x,
*                                   S32 i4_blk_y,
*                                   S08 i1_ref_id,
*                                   S32 i4_result_id)
*
*  @brief  obtains a candt from the same mv bank as the current one, its called
*          spatial candt as it does not require scaling for temporal distances
*
*  @param[out] ps_search_node: mv and ref id updated here of the candt
*
*  @param[in] ps_curr_layer: layer ctxt, has the mv bank structure pointer
*
*  @param[in] i4_blk_x : x coordinate of the block in mv bank
*
*  @param[in] i4_blk_y : y coordinate of the block in mv bank
*
*  @param[in] i1_ref_id : Corresponds to ref idx from which to pick up mv
*              results, useful if multiple ref idx candts maintained separately.
*
*  @param[in] i4_result_id : If multiple results stored per ref idx, this
*              pts to the id of the result
*
*  @param[in] tr_avail : top right availability of the block
*
*  @param[in] bl_avail : bottom left availability of the block
*
*  @return void
********************************************************************************
*/
void hme_get_spatial_candt(
    layer_ctxt_t *ps_curr_layer,
    BLK_SIZE_T e_search_blk_size,
    S32 i4_blk_x,
    S32 i4_blk_y,
    S08 i1_ref_idx,
    search_node_t *ps_top_neighbours,
    search_node_t *ps_left_neighbours,
    S32 i4_result_id,
    S32 tr_avail,
    S32 bl_avail,
    S32 encode)

{
    layer_mv_t *ps_layer_mvbank = ps_curr_layer->ps_layer_mvbank;
    S32 i4_blk_size1 = gau1_blk_size_to_wd[ps_layer_mvbank->e_blk_size];
    S32 i4_blk_size2 = gau1_blk_size_to_wd[e_search_blk_size];
    search_node_t *ps_search_node;
    S32 i4_offset;
    hme_mv_t *ps_mv, *ps_mv_base;
    S08 *pi1_ref_idx, *pi1_ref_idx_base;
    S32 jump = 1, mvs_in_blk, mvs_in_row;
    S32 shift = (encode ? 2 : 0);

    if(i4_blk_size1 != i4_blk_size2)
    {
        i4_blk_x <<= 1;
        i4_blk_y <<= 1;
        jump = 2;
        if((i4_blk_size1 << 2) == i4_blk_size2)
        {
            i4_blk_x <<= 1;
            i4_blk_y <<= 1;
            jump = 4;
        }
    }

    mvs_in_blk = ps_layer_mvbank->i4_num_mvs_per_blk;
    mvs_in_row = ps_layer_mvbank->i4_num_mvs_per_row;

    /* Adjust teh blk coord to point to top left locn */
    i4_blk_x -= 1;
    i4_blk_y -= 1;
    /* Pick up the mvs from the location */
    i4_offset = (i4_blk_x * ps_layer_mvbank->i4_num_mvs_per_blk);
    i4_offset += (ps_layer_mvbank->i4_num_mvs_per_row * i4_blk_y);

    ps_mv = ps_layer_mvbank->ps_mv + i4_offset;
    pi1_ref_idx = ps_layer_mvbank->pi1_ref_idx + i4_offset;

    ps_mv += (i1_ref_idx * ps_layer_mvbank->i4_num_mvs_per_ref) + i4_result_id;
    pi1_ref_idx += (i1_ref_idx * ps_layer_mvbank->i4_num_mvs_per_ref) + i4_result_id;

    ps_mv_base = ps_mv;
    pi1_ref_idx_base = pi1_ref_idx;

    /* ps_mv and pi1_ref_idx now point to the top left locn */
    /* Get 4 mvs as follows:                                */
    ps_search_node = ps_top_neighbours;
    COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);

    /* Move to top */
    ps_search_node++;
    ps_mv += mvs_in_blk;
    pi1_ref_idx += mvs_in_blk;
    COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);

    /* Move to t1 : relevant for 4x4 part searches or for partitions i 16x16 */
    if(ps_layer_mvbank->i4_num_mvs_per_ref > 1)
    {
        ps_search_node++;
        ps_mv += (mvs_in_blk * (jump >> 1));
        pi1_ref_idx += (mvs_in_blk * (jump >> 1));
        COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);
    }
    else
    {
        ps_search_node++;
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
        ps_search_node->u1_is_avail = 0;
        ps_search_node->u1_subpel_done = 0;
    }

    /* Move to tr: this will be tr w.r.t. the blk being searched */
    ps_search_node++;
    if(tr_avail == 0)
    {
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
        ps_search_node->u1_is_avail = 0;
        ps_search_node->u1_subpel_done = 0;
    }
    else
    {
        ps_mv = ps_mv_base + (mvs_in_blk * (1 + jump));
        pi1_ref_idx = pi1_ref_idx_base + (mvs_in_blk * (1 + jump));
        COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);
    }

    /* Move to left */
    ps_search_node = ps_left_neighbours;
    ps_mv = ps_mv_base + mvs_in_row;
    pi1_ref_idx = pi1_ref_idx_base + mvs_in_row;
    COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);

    /* Move to l1 */
    if(ps_layer_mvbank->i4_num_mvs_per_ref > 1)
    {
        ps_search_node++;
        ps_mv += (mvs_in_row * (jump >> 1));
        pi1_ref_idx += (mvs_in_row * (jump >> 1));
        COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);
    }
    else
    {
        ps_search_node++;
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
        ps_search_node->u1_is_avail = 0;
        ps_search_node->u1_subpel_done = 0;
    }

    /* Move to bl */
    ps_search_node++;
    if(bl_avail == 0)
    {
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
        ps_search_node->u1_is_avail = 0;
    }
    else
    {
        ps_mv = ps_mv_base + (mvs_in_row * (1 + jump));
        pi1_ref_idx = pi1_ref_idx_base + (mvs_in_row * (1 + jump));
        COPY_MV_TO_SEARCH_NODE(ps_search_node, ps_mv, pi1_ref_idx, i1_ref_idx, shift);
    }
}

void hme_get_spatial_candt_in_l1_me(
    layer_ctxt_t *ps_curr_layer,
    BLK_SIZE_T e_search_blk_size,
    S32 i4_blk_x,
    S32 i4_blk_y,
    S08 i1_ref_idx,
    U08 u1_pred_dir,
    search_node_t *ps_top_neighbours,
    search_node_t *ps_left_neighbours,
    S32 i4_result_id,
    S32 tr_avail,
    S32 bl_avail,
    S32 i4_num_act_ref_l0,
    S32 i4_num_act_ref_l1)
{
    search_node_t *ps_search_node;
    hme_mv_t *ps_mv, *ps_mv_base;

    S32 i4_offset;
    S32 mvs_in_blk, mvs_in_row;
    S08 *pi1_ref_idx, *pi1_ref_idx_base;
    S32 i4_mv_pos_in_implicit_array;

    layer_mv_t *ps_layer_mvbank = ps_curr_layer->ps_layer_mvbank;

    S32 i4_blk_size1 = gau1_blk_size_to_wd[ps_layer_mvbank->e_blk_size];
    S32 i4_blk_size2 = gau1_blk_size_to_wd[e_search_blk_size];
    S32 jump = 1;
    S32 shift = 0;
    S32 i4_num_results_in_given_dir =
        ((u1_pred_dir == 1) ? (ps_layer_mvbank->i4_num_mvs_per_ref * i4_num_act_ref_l1)
                            : (ps_layer_mvbank->i4_num_mvs_per_ref * i4_num_act_ref_l0));

    if(i4_blk_size1 != i4_blk_size2)
    {
        i4_blk_x <<= 1;
        i4_blk_y <<= 1;
        jump = 2;
        if((i4_blk_size1 << 2) == i4_blk_size2)
        {
            i4_blk_x <<= 1;
            i4_blk_y <<= 1;
            jump = 4;
        }
    }

    mvs_in_blk = ps_layer_mvbank->i4_num_mvs_per_blk;
    mvs_in_row = ps_layer_mvbank->i4_num_mvs_per_row;

    /* Adjust the blk coord to point to top left locn */
    i4_blk_x -= 1;
    i4_blk_y -= 1;
    /* Pick up the mvs from the location */
    i4_offset = (i4_blk_x * ps_layer_mvbank->i4_num_mvs_per_blk);
    i4_offset += (ps_layer_mvbank->i4_num_mvs_per_row * i4_blk_y);

    i4_offset +=
        ((u1_pred_dir == 1) ? (ps_layer_mvbank->i4_num_mvs_per_ref * i4_num_act_ref_l0) : 0);

    ps_mv = ps_layer_mvbank->ps_mv + i4_offset;
    pi1_ref_idx = ps_layer_mvbank->pi1_ref_idx + i4_offset;

    ps_mv_base = ps_mv;
    pi1_ref_idx_base = pi1_ref_idx;

    /* TL */
    {
        /* ps_mv and pi1_ref_idx now point to the top left locn */
        ps_search_node = ps_top_neighbours;

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }

    /* Move to top */
    {
        /* ps_mv and pi1_ref_idx now point to the top left locn */
        ps_search_node++;
        ps_mv += mvs_in_blk;
        pi1_ref_idx += mvs_in_blk;

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }

    /* Move to t1 : relevant for 4x4 part searches or for partitions i 16x16 */
    if(ps_layer_mvbank->i4_num_mvs_per_ref > 1)
    {
        ps_search_node++;
        ps_mv += (mvs_in_blk * (jump >> 1));
        pi1_ref_idx += (mvs_in_blk * (jump >> 1));

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }
    else
    {
        ps_search_node++;
        ps_search_node->u1_is_avail = 0;
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
    }

    /* Move to tr: this will be tr w.r.t. the blk being searched */
    ps_search_node++;
    if(tr_avail == 0)
    {
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
        ps_search_node->u1_is_avail = 0;
        ps_search_node->u1_subpel_done = 0;
    }
    else
    {
        /* ps_mv and pi1_ref_idx now point to the top left locn */
        ps_mv = ps_mv_base + (mvs_in_blk * (1 + jump));
        pi1_ref_idx = pi1_ref_idx_base + (mvs_in_blk * (1 + jump));

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }

    /* Move to left */
    {
        /* ps_mv and pi1_ref_idx now point to the top left locn */
        ps_search_node = ps_left_neighbours;
        ps_mv = ps_mv_base + mvs_in_row;
        pi1_ref_idx = pi1_ref_idx_base + mvs_in_row;

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }

    /* Move to l1 */
    if(ps_layer_mvbank->i4_num_mvs_per_ref > 1)
    {
        /* ps_mv and pi1_ref_idx now point to the top left locn */
        ps_search_node++;
        ps_mv += (mvs_in_row * (jump >> 1));
        pi1_ref_idx += (mvs_in_row * (jump >> 1));

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }
    else
    {
        ps_search_node++;
        ps_search_node->u1_is_avail = 0;
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
    }

    /* Move to bl */
    ps_search_node++;
    if(bl_avail == 0)
    {
        ps_search_node->s_mv.i2_mvx = 0;
        ps_search_node->s_mv.i2_mvy = 0;
        ps_search_node->i1_ref_idx = i1_ref_idx;
        ps_search_node->u1_is_avail = 0;
    }
    else
    {
        /* ps_mv and pi1_ref_idx now point to the top left locn */
        ps_mv = ps_mv_base + (mvs_in_row * (1 + jump));
        pi1_ref_idx = pi1_ref_idx_base + (mvs_in_row * (1 + jump));

        i4_mv_pos_in_implicit_array = hme_find_pos_of_implicitly_stored_ref_id(
            pi1_ref_idx, i1_ref_idx, i4_result_id, i4_num_results_in_given_dir);

        if(-1 != i4_mv_pos_in_implicit_array)
        {
            COPY_MV_TO_SEARCH_NODE(
                ps_search_node,
                &ps_mv[i4_mv_pos_in_implicit_array],
                &pi1_ref_idx[i4_mv_pos_in_implicit_array],
                i1_ref_idx,
                shift);
        }
        else
        {
            ps_search_node->u1_is_avail = 0;
            ps_search_node->s_mv.i2_mvx = 0;
            ps_search_node->s_mv.i2_mvy = 0;
            ps_search_node->i1_ref_idx = i1_ref_idx;
        }
    }
}

/**
********************************************************************************
*  @fn    void hme_fill_ctb_neighbour_mvs(layer_ctxt_t *ps_curr_layer,
*                                   S32 i4_blk_x,
*                                   S32 i4_blk_y,
*                                   mvgrid_t *ps_mv_grid ,
*                                   S32 i1_ref_id)
*
*  @brief  The 18x18 MV grid for a ctb, is filled in first row and 1st col
*          this corresponds to neighbours (TL, T, TR, L, BL)
*
*  @param[in] ps_curr_layer: layer ctxt, has the mv bank structure pointer
*
*  @param[in] blk_x : x coordinate of the block in mv bank
*
*  @param[in] blk_y : y coordinate of the block in mv bank
*
*  @param[in] ps_mv_grid : Grid (18x18 mvs at 4x4 level)
*
*  @param[in] i1_ref_idx : Corresponds to ref idx from which to pick up mv
*              results, useful if multiple ref idx candts maintained separately.
*
*  @return void
********************************************************************************
*/
void hme_fill_ctb_neighbour_mvs(
    layer_ctxt_t *ps_curr_layer,
    S32 blk_x,
    S32 blk_y,
    mv_grid_t *ps_mv_grid,
    U08 u1_pred_dir_ctr,
    U08 u1_default_ref_id,
    S32 i4_num_act_ref_l0)
{
    search_node_t *ps_grid_node;
    layer_mv_t *ps_layer_mvbank = ps_curr_layer->ps_layer_mvbank;
    S32 i4_offset;
    hme_mv_t *ps_mv, *ps_mv_base;
    S08 *pi1_ref_idx, *pi1_ref_idx_base;
    S32 jump = 0, inc, i, mvs_in_blk, mvs_in_row;

    if(ps_layer_mvbank->e_blk_size == BLK_4x4)
    {
        /* searching 16x16, mvs are for 4x4 */
        jump = 1;
        blk_x <<= 2;
        blk_y <<= 2;
    }
    else
    {
        /* Searching 16x16, mvs are for 8x8 */
        blk_x <<= 1;
        blk_y <<= 1;
    }
    ASSERT(ps_layer_mvbank->e_blk_size != BLK_16x16);

    mvs_in_blk = ps_layer_mvbank->i4_num_mvs_per_blk;
    mvs_in_row = ps_layer_mvbank->i4_num_mvs_per_row;

    /* Adjust the blk coord to point to top left locn */
    blk_x -= 1;
    blk_y -= 1;

    /* Pick up the mvs from the location */
    i4_offset = (blk_x * ps_layer_mvbank->i4_num_mvs_per_blk);
    i4_offset += (ps_layer_mvbank->i4_num_mvs_per_row * blk_y);

    i4_offset += (u1_pred_dir_ctr == 1);

    ps_mv = ps_layer_mvbank->ps_mv + i4_offset;
    pi1_ref_idx = ps_layer_mvbank->pi1_ref_idx + i4_offset;

    ps_mv_base = ps_mv;
    pi1_ref_idx_base = pi1_ref_idx;

    /* the 0, 0 entry of the grid pts to top left for the ctb */
    ps_grid_node = &ps_mv_grid->as_node[0];

    /* Copy 18 mvs at 4x4 level including top left, 16 top mvs for ctb, 1 tr */
    for(i = 0; i < 18; i++)
    {
        COPY_MV_TO_SEARCH_NODE(ps_grid_node, ps_mv, pi1_ref_idx, u1_default_ref_id, 0);
        ps_grid_node++;
        inc = 1;
        /* If blk size is 8x8, then every 2 grid nodes are updated with same mv */
        if(i & 1)
            inc = jump;

        ps_mv += (mvs_in_blk * inc);
        pi1_ref_idx += (mvs_in_blk * inc);
    }

    ps_mv = ps_mv_base + mvs_in_row;
    pi1_ref_idx = pi1_ref_idx_base + mvs_in_row;

    /* now copy left 16 left mvs */
    ps_grid_node = &ps_mv_grid->as_node[0];
    ps_grid_node += (ps_mv_grid->i4_stride);
    for(i = 0; i < 16; i++)
    {
        COPY_MV_TO_SEARCH_NODE(ps_grid_node, ps_mv, pi1_ref_idx, u1_default_ref_id, 0);
        ps_grid_node += ps_mv_grid->i4_stride;
        inc = 1;
        /* If blk size is 8x8, then every 2 grid nodes are updated with same mv */
        if(!(i & 1))
            inc = jump;

        ps_mv += (mvs_in_row * inc);
        pi1_ref_idx += (mvs_in_row * inc);
    }
    /* last one set to invalid as bottom left not yet encoded */
    ps_grid_node->u1_is_avail = 0;
}

void hme_reset_wkg_mem(buf_mgr_t *ps_buf_mgr)
{
    ps_buf_mgr->i4_used = 0;
}
void hme_init_wkg_mem(buf_mgr_t *ps_buf_mgr, U08 *pu1_mem, S32 size)
{
    ps_buf_mgr->pu1_wkg_mem = pu1_mem;
    ps_buf_mgr->i4_total = size;
    hme_reset_wkg_mem(ps_buf_mgr);
}

void hme_init_mv_grid(mv_grid_t *ps_mv_grid)
{
    S32 i, j;
    search_node_t *ps_search_node;
    /*************************************************************************/
    /* We have a 64x64 CTB in the worst case. For this, we have 16x16 4x4 MVs*/
    /* Additionally, we have 1 neighbour on each side. This makes it a 18x18 */
    /* MV Grid. The boundary of this Grid on all sides are neighbours and the*/
    /* left and top edges of this grid is filled run time. The center portion*/
    /* represents the actual CTB MVs (16x16) and is also filled run time.    */
    /* However, the availability is always set as available (init time)      */
    /*************************************************************************/
    ps_mv_grid->i4_stride = NUM_COLUMNS_IN_CTB_GRID;
    ps_mv_grid->i4_start_offset = ps_mv_grid->i4_stride + CTB_MV_GRID_PAD;
    ps_search_node = &ps_mv_grid->as_node[ps_mv_grid->i4_start_offset];
    for(i = 0; i < 16; i++)
    {
        for(j = 0; j < 16; j++)
        {
            ps_search_node[j].u1_is_avail = 1;
        }

        ps_search_node += ps_mv_grid->i4_stride;
    }
}
/**
********************************************************************************
*  @fn    void hme_pad_left(U08 *pu1_dst, S32 stride, S32 pad_wd, S32 pad_ht)
*
*  @brief  Pads horizontally to left side. Each pixel replicated across a line
*
*  @param[in] pu1_dst : destination pointer. Points to the pixel to be repeated
*
*  @param[in] stride : stride of destination buffer
*
*  @param[in] pad_wd : Amt of horizontal padding to be done
*
*  @param[in] pad_ht : Number of lines for which horizontal padding to be done
*
*  @return void
********************************************************************************
*/
void hme_pad_left(U08 *pu1_dst, S32 stride, S32 pad_wd, S32 pad_ht)
{
    S32 i, j;
    U08 u1_val;
    for(i = 0; i < pad_ht; i++)
    {
        u1_val = pu1_dst[0];
        for(j = -pad_wd; j < 0; j++)
            pu1_dst[j] = u1_val;

        pu1_dst += stride;
    }
}
/**
********************************************************************************
*  @fn    void hme_pad_right(U08 *pu1_dst, S32 stride, S32 pad_wd, S32 pad_ht)
*
*  @brief  Pads horizontally to rt side. Each pixel replicated across a line
*
*  @param[in] pu1_dst : destination pointer. Points to the pixel to be repeated
*
*  @param[in] stride : stride of destination buffer
*
*  @param[in] pad_wd : Amt of horizontal padding to be done
*
*  @param[in] pad_ht : Number of lines for which horizontal padding to be done
*
*  @return void
********************************************************************************
*/
void hme_pad_right(U08 *pu1_dst, S32 stride, S32 pad_wd, S32 pad_ht)
{
    S32 i, j;
    U08 u1_val;
    for(i = 0; i < pad_ht; i++)
    {
        u1_val = pu1_dst[0];
        for(j = 1; j <= pad_wd; j++)
            pu1_dst[j] = u1_val;

        pu1_dst += stride;
    }
}
/**
********************************************************************************
*  @fn    void hme_pad_top(U08 *pu1_dst, S32 stride, S32 pad_ht, S32 pad_wd)
*
*  @brief  Pads vertically on the top. Repeats the top line for top padding
*
*  @param[in] pu1_dst : destination pointer. Points to the line to be repeated
*
*  @param[in] stride : stride of destination buffer
*
*  @param[in] pad_ht : Amt of vertical padding to be done
*
*  @param[in] pad_wd : Number of columns for which vertical padding to be done
*
*  @return void
********************************************************************************
*/
void hme_pad_top(U08 *pu1_dst, S32 stride, S32 pad_ht, S32 pad_wd)
{
    S32 i;
    for(i = 1; i <= pad_ht; i++)
        memcpy(pu1_dst - (i * stride), pu1_dst, pad_wd);
}
/**
********************************************************************************
*  @fn    void hme_pad_bot(U08 *pu1_dst, S32 stride, S32 pad_ht, S32 pad_wd)
*
*  @brief  Pads vertically on the bot. Repeats the top line for top padding
*
*  @param[in] pu1_dst : destination pointer. Points to the line to be repeated
*
*  @param[in] stride : stride of destination buffer
*
*  @param[in] pad_ht : Amt of vertical padding to be done
*
*  @param[in] pad_wd : Number of columns for which vertical padding to be done
*
*  @return void
********************************************************************************
*/
void hme_pad_bot(U08 *pu1_dst, S32 stride, S32 pad_ht, S32 pad_wd)
{
    S32 i;
    for(i = 1; i <= pad_ht; i++)
        memcpy(pu1_dst + (i * stride), pu1_dst, pad_wd);
}

/**
********************************************************************************
*  @fn    void hme_get_wt_inp(layer_ctxt_t *ps_curr_layer,  S32 pos_x,
*                           S32 pos_y, S32 size)
*
*  @brief  Does weighting of the input in case the search needs to happen
*          with reference frames weighted
*
*  @param[in] ps_curr_layer: layer ctxt
*
*  @param[in] pos_x : x coordinate of the input blk in the picture
*
*  @param[in] pos_y : y coordinate of hte input blk in the picture
*
*  @param[in] size : size of the input block
*
*  @param[in] num_ref : Number of reference frames
*
*  @return void
********************************************************************************
*/
void hme_get_wt_inp(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    S32 ref, i, j;
    U08 *pu1_src, *pu1_dst, *pu1_src_tmp;
    S32 log_wdc = ps_wt_inp_prms->wpred_log_wdc;
    S32 x_count, y_count;

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;

    /* Make sure the start positions of block are inside frame limits */
    pos_x = MIN(pos_x, ps_curr_layer->i4_wd - 1);
    pos_y = MIN(pos_y, ps_curr_layer->i4_ht - 1);

    pu1_src += (pos_x + (pos_y * ps_curr_layer->i4_inp_stride));

    /* In case we handle imcomplete CTBs, we copy only as much as reqd */
    /* from input buffers to prevent out of bound accesses. In this    */
    /* case, we do padding in x or y or both dirns */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    for(i = 0; i < num_ref + 1; i++)
    {
        ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
    }

    /* Run thro all ref ids */
    for(ref = 0; ref < num_ref + 1; ref++)
    {
        S32 wt, off;
        S32 inv_wt;

        pu1_src_tmp = pu1_src;

        /* Each ref id may have differnet wt/offset. */
        /* So we have unique inp buf for each ref id */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ref];

        if(ref == num_ref)
        {
            /* last ref will be non weighted input */
            for(i = 0; i < y_count; i++)
            {
                for(j = 0; j < x_count; j++)
                {
                    pu1_dst[j] = pu1_src_tmp[j];
                }
                pu1_src_tmp += ps_curr_layer->i4_inp_stride;
                pu1_dst += dst_stride;
            }
        }
        else
        {
            /* Wt and off specific to this ref id */
            wt = ps_wt_inp_prms->a_wpred_wt[ref];
            inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ref];
            off = ps_wt_inp_prms->a_wpred_off[ref];

            /* Generate size*size worth of modified input samples */
            for(i = 0; i < y_count; i++)
            {
                for(j = 0; j < x_count; j++)
                {
                    S32 tmp;

                    /* Since we scale input, we use inverse transform of wt pred */
                    //tmp = HME_INV_WT_PRED(pu1_src_tmp[j], wt, off, log_wdc);
                    tmp = HME_INV_WT_PRED1(pu1_src_tmp[j], inv_wt, off, log_wdc);
                    pu1_dst[j] = (U08)(HME_CLIP(tmp, 0, 255));
                }
                pu1_src_tmp += ps_curr_layer->i4_inp_stride;
                pu1_dst += dst_stride;
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ref];
        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom directino if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}
/**
****************************************************************************************
*  @fn     hme_pick_best_pu_cand(pu_result_t *ps_pu_results_dst,
*                                pu_result_t *ps_pu_results_inp,
*                                UWORD8 u1_num_results_per_part,
*                                UWORD8 u1_num_best_cand)
*
*  @brief  Does the candidate evaluation across all the current candidates and returns
*           the best two or one candidates across given lists
*
*  @param[in]  - ps_pu_results_inp : Pointer to the input candidates
*              - u1_num_results_per_part: Number of available candidates
*
*  @param[out] - ps_pu_results_dst : Pointer to best PU results
*
****************************************************************************************
*/
void hme_pick_best_pu_cand(
    pu_result_t *ps_pu_results_dst,
    pu_result_t *ps_pu_results_list0,
    pu_result_t *ps_pu_results_list1,
    UWORD8 u1_num_results_per_part_l0,
    UWORD8 u1_num_results_per_part_l1,
    UWORD8 u1_candidate_rank)
{
    struct cand_pos_data
    {
        U08 u1_cand_list_id;

        U08 u1_cand_id_in_cand_list;
    } as_cand_pos_data[MAX_NUM_RESULTS_PER_PART_LIST << 1];

    S32 ai4_costs[MAX_NUM_RESULTS_PER_PART_LIST << 1];
    U08 i, j;

    for(i = 0; i < u1_num_results_per_part_l0; i++)
    {
        ai4_costs[i] = ps_pu_results_list0[i].i4_tot_cost;
        as_cand_pos_data[i].u1_cand_id_in_cand_list = i;
        as_cand_pos_data[i].u1_cand_list_id = 0;
    }

    for(i = 0, j = u1_num_results_per_part_l0; i < u1_num_results_per_part_l1; i++, j++)
    {
        ai4_costs[j] = ps_pu_results_list1[i].i4_tot_cost;
        as_cand_pos_data[j].u1_cand_id_in_cand_list = i;
        as_cand_pos_data[j].u1_cand_list_id = 1;
    }

    SORT_PRIMARY_INTTYPE_ARRAY_AND_REORDER_GENERIC_COMPANION_ARRAY(
        ai4_costs,
        as_cand_pos_data,
        u1_num_results_per_part_l0 + u1_num_results_per_part_l1,
        struct cand_pos_data);

    if(as_cand_pos_data[u1_candidate_rank].u1_cand_list_id)
    {
        ps_pu_results_dst[0] =
            ps_pu_results_list1[as_cand_pos_data[u1_candidate_rank].u1_cand_id_in_cand_list];
    }
    else
    {
        ps_pu_results_dst[0] =
            ps_pu_results_list0[as_cand_pos_data[u1_candidate_rank].u1_cand_id_in_cand_list];
    }
}

/* Returns the number of candidates */
static S32 hme_tu_recur_cand_harvester(
    part_type_results_t *ps_cand_container,
    inter_pu_results_t *ps_pu_data,
    inter_ctb_prms_t *ps_inter_ctb_prms,
    S32 i4_part_mask)
{
    part_type_results_t s_cand_data;

    U08 i, j;
    PART_ID_T e_part_id;

    S32 i4_num_cands = 0;

    /* 2Nx2N part_type decision part */
    if(i4_part_mask & ENABLE_2Nx2N)
    {
        U08 u1_num_candt_to_pick;

        e_part_id = ge_part_type_to_part_id[PRT_2Nx2N][0];

        ASSERT(ps_inter_ctb_prms->u1_max_2nx2n_tu_recur_cands >= 1);

        if(!ps_inter_ctb_prms->i4_bidir_enabled || (i4_part_mask == ENABLE_2Nx2N))
        {
            u1_num_candt_to_pick =
                MIN(ps_inter_ctb_prms->u1_max_2nx2n_tu_recur_cands,
                    ps_pu_data->u1_num_results_per_part_l0[e_part_id] +
                        ps_pu_data->u1_num_results_per_part_l1[e_part_id]);
        }
        else
        {
            u1_num_candt_to_pick =
                MIN(1,
                    ps_pu_data->u1_num_results_per_part_l0[e_part_id] +
                        ps_pu_data->u1_num_results_per_part_l1[e_part_id]);
        }

        if(ME_XTREME_SPEED_25 == ps_inter_ctb_prms->i1_quality_preset)
        {
            u1_num_candt_to_pick = MIN(u1_num_candt_to_pick, MAX_NUM_TU_RECUR_CANDS_IN_XS25);
        }

        for(i = 0; i < u1_num_candt_to_pick; i++)
        {
            /* Picks the best two candidates of all the available ones */
            hme_pick_best_pu_cand(
                ps_cand_container[i4_num_cands].as_pu_results,
                ps_pu_data->aps_pu_results[0][e_part_id],
                ps_pu_data->aps_pu_results[1][e_part_id],
                ps_pu_data->u1_num_results_per_part_l0[e_part_id],
                ps_pu_data->u1_num_results_per_part_l1[e_part_id],
                i);

            /* Update the other params part_type and total_cost in part_type_results */
            ps_cand_container[i4_num_cands].u1_part_type = e_part_id;
            ps_cand_container[i4_num_cands].i4_tot_cost =
                ps_cand_container[i4_num_cands].as_pu_results->i4_tot_cost;

            i4_num_cands++;
        }
    }

    /* SMP */
    {
        S32 i4_total_cost;

        S32 num_part_types = PRT_Nx2N - PRT_2NxN + 1;
        S32 start_part_type = PRT_2NxN;
        S32 best_cost = MAX_32BIT_VAL;
        S32 part_type_cnt = 0;

        for(j = 0; j < num_part_types; j++)
        {
            if(!(i4_part_mask & gai4_part_type_to_part_mask[j + start_part_type]))
            {
                continue;
            }

            for(i = 0; i < gau1_num_parts_in_part_type[j + start_part_type]; i++)
            {
                e_part_id = ge_part_type_to_part_id[j + start_part_type][i];

                /* Pick the best candidate for the partition acroos lists */
                hme_pick_best_pu_cand(
                    &s_cand_data.as_pu_results[i],
                    ps_pu_data->aps_pu_results[0][e_part_id],
                    ps_pu_data->aps_pu_results[1][e_part_id],
                    ps_pu_data->u1_num_results_per_part_l0[e_part_id],
                    ps_pu_data->u1_num_results_per_part_l1[e_part_id],
                    0);
            }

            i4_total_cost =
                s_cand_data.as_pu_results[0].i4_tot_cost + s_cand_data.as_pu_results[1].i4_tot_cost;

            if(i4_total_cost < best_cost)
            {
                /* Stores the index of the best part_type in the sub-catoegory */
                best_cost = i4_total_cost;

                ps_cand_container[i4_num_cands] = s_cand_data;

                ps_cand_container[i4_num_cands].u1_part_type = j + start_part_type;
                ps_cand_container[i4_num_cands].i4_tot_cost = i4_total_cost;
            }

            part_type_cnt++;
        }

        i4_num_cands = (part_type_cnt) ? (i4_num_cands + 1) : i4_num_cands;
    }

    /* AMP */
    {
        S32 i4_total_cost;

        S32 num_part_types = PRT_nRx2N - PRT_2NxnU + 1;
        S32 start_part_type = PRT_2NxnU;
        S32 best_cost = MAX_32BIT_VAL;
        S32 part_type_cnt = 0;

        for(j = 0; j < num_part_types; j++)
        {
            if(!(i4_part_mask & gai4_part_type_to_part_mask[j + start_part_type]))
            {
                continue;
            }

            for(i = 0; i < gau1_num_parts_in_part_type[j + start_part_type]; i++)
            {
                e_part_id = ge_part_type_to_part_id[j + start_part_type][i];

                /* Pick the best candidate for the partition acroos lists */
                hme_pick_best_pu_cand(
                    &s_cand_data.as_pu_results[i],
                    ps_pu_data->aps_pu_results[0][e_part_id],
                    ps_pu_data->aps_pu_results[1][e_part_id],
                    ps_pu_data->u1_num_results_per_part_l0[e_part_id],
                    ps_pu_data->u1_num_results_per_part_l1[e_part_id],
                    0);
            }

            i4_total_cost =
                s_cand_data.as_pu_results[0].i4_tot_cost + s_cand_data.as_pu_results[1].i4_tot_cost;

            if(i4_total_cost < best_cost)
            {
                /* Stores the index of the best part_type in the sub-catoegory */
                best_cost = i4_total_cost;

                ps_cand_container[i4_num_cands] = s_cand_data;

                ps_cand_container[i4_num_cands].u1_part_type = j + start_part_type;
                ps_cand_container[i4_num_cands].i4_tot_cost = i4_total_cost;
            }

            part_type_cnt++;
        }

        i4_num_cands = (part_type_cnt) ? (i4_num_cands + 1) : i4_num_cands;
    }

    return i4_num_cands;
}

/**
*****************************************************************************
*  @fn     hme_decide_part_types(search_results_t *ps_search_results)
*
*  @brief  Does uni/bi evaluation accross various partition types,
*          decides best inter partition types for the CU, compares
*          intra cost and decides the best K results for the CU
*
*          This is called post subpel refinmenent for 16x16s, 8x8s and
*          for post merge evaluation for 32x32,64x64 CUs
*
*  @param[in,out] ps_search_results : Search results data structure
*                 - In : 2 lists of upto 2mvs & refids, active partition mask
*                 - Out: Best results for final rdo evaluation of the cu
*
*  @param[in]     ps_subpel_prms : Sub pel params data structure
*
*
*  @par Description
*    --------------------------------------------------------------------------------
*     Flow:
*            for each category (SMP,AMP,2Nx2N based on part mask)
*            {
*                for each part_type
*                {
*                    for each part
*                        pick best candidate from each list
*                    combine uni part type
*                    update best results for part type
*                }
*                pick the best part type for given category (for SMP & AMP)
*            }
*                    ||
*                    ||
*                    \/
*           Bi-Pred evaluation:
*            for upto 4 best part types
*            {
*                for each part
*                {
*                    compute fixed size had for all uni and remember coeffs
*                    compute bisatd
*                    uni vs bi and gives upto two results
*                    also gives the pt level pred buffer
*                }
*             }
*                    ||
*                    ||
*                    \/
*            select X candidates for tu recursion as per the Note below
*               tu_rec_on_part_type (reuse transform coeffs)
*                    ||
*                    ||
*                    \/
*            insert intra nodes at appropriate result id
*                    ||
*                    ||
*                    \/
*            populate y best resuls for rdo based on preset
*
*     Note :
*     number of TU rec for P pics : 2 2nx2n + 1 smp + 1 amp for ms or 9 for hq
*     number of TU rec for B pics : 1 2nx2n + 1 smp + 1 amp for ms or 2 uni 2nx2n + 1 smp + 1 amp for ms or 9 for hq
*     --------------------------------------------------------------------------------
*
*  @return None
********************************************************************************
*/
void hme_decide_part_types(
    inter_cu_results_t *ps_cu_results,
    inter_pu_results_t *ps_pu_results,
    inter_ctb_prms_t *ps_inter_ctb_prms,
    me_frm_ctxt_t *ps_ctxt,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list

)
{
    S32 i, j;
    S32 i4_part_mask;
    ULWORD64 au8_pred_sigmaXSquare[NUM_BEST_ME_OUTPUTS][NUM_INTER_PU_PARTS];
    ULWORD64 au8_pred_sigmaX[NUM_BEST_ME_OUTPUTS][NUM_INTER_PU_PARTS];
    S32 i4_noise_term;
    WORD32 e_part_id;

    PF_SAD_FXN_TU_REC apf_err_compute[4];

    part_type_results_t as_part_type_results[NUM_BEST_ME_OUTPUTS];
    part_type_results_t *ps_part_type_results;

    S32 num_best_cand = 0;
    const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;

    i4_part_mask = ps_cu_results->i4_part_mask;

    num_best_cand = hme_tu_recur_cand_harvester(
        as_part_type_results, ps_pu_results, ps_inter_ctb_prms, i4_part_mask);

    /* Partition ID for the current PU */
    e_part_id = (UWORD8)ge_part_type_to_part_id[PRT_2Nx2N][0];

    ps_part_type_results = as_part_type_results;
    for(i = 0; i < num_best_cand; i++)
    {
        hme_compute_pred_and_evaluate_bi(
            ps_cu_results,
            ps_pu_results,
            ps_inter_ctb_prms,
            &(ps_part_type_results[i]),
            au8_pred_sigmaXSquare[i],
            au8_pred_sigmaX[i],
            ps_cmn_utils_optimised_function_list,
            ps_me_optimised_function_list

        );
    }
    /* Perform TU_REC on the best candidates selected */
    {
        WORD32 i4_sad_grid;
        WORD32 ai4_tu_split_flag[4];
        WORD32 ai4_tu_early_cbf[4];

        WORD32 best_cost[NUM_BEST_ME_OUTPUTS];
        WORD32 ai4_final_idx[NUM_BEST_ME_OUTPUTS];
        WORD16 i2_wght;
        WORD32 i4_satd;

        err_prms_t s_err_prms;
        err_prms_t *ps_err_prms = &s_err_prms;

        /* Default cost and final idx initialization */
        for(i = 0; i < num_best_cand; i++)
        {
            best_cost[i] = MAX_32BIT_VAL;
            ai4_final_idx[i] = -1;
        }

        /* Assign the stad function to the err_compute function pointer :
        Implemented only for 32x32 and 64x64, hence 16x16 and 8x8 are kept NULL */
        apf_err_compute[CU_64x64] = hme_evalsatd_pt_pu_64x64_tu_rec;
        apf_err_compute[CU_32x32] = hme_evalsatd_pt_pu_32x32_tu_rec;
        apf_err_compute[CU_16x16] = hme_evalsatd_pt_pu_16x16_tu_rec;
        apf_err_compute[CU_8x8] = hme_evalsatd_pt_pu_8x8_tu_rec;

        ps_err_prms->pi4_sad_grid = &i4_sad_grid;
        ps_err_prms->pi4_tu_split_flags = ai4_tu_split_flag;
        ps_err_prms->u1_max_tr_depth = ps_inter_ctb_prms->u1_max_tr_depth;
        ps_err_prms->pi4_tu_early_cbf = ai4_tu_early_cbf;
        ps_err_prms->i4_grid_mask = 1;
        ps_err_prms->pu1_wkg_mem = ps_inter_ctb_prms->pu1_wkg_mem;
        ps_err_prms->u1_max_tr_size = 32;

        if(ps_inter_ctb_prms->u1_is_cu_noisy)
        {
            ps_err_prms->u1_max_tr_size = MAX_TU_SIZE_WHEN_NOISY;
        }

        /* TU_REC for the best candidates, as mentioned in NOTE above (except candidates that
        are disabled by Part_mask */
        for(i = 0; i < num_best_cand; i++)
        {
            part_type_results_t *ps_best_results;
            pu_result_t *ps_pu_result;
            WORD32 part_type_cost;
            WORD32 cand_idx;

            WORD32 pred_dir;
            S32 i4_inp_off;

            S32 lambda;
            U08 lambda_qshift;
            U08 *apu1_inp[MAX_NUM_INTER_PARTS];
            S16 ai2_wt[MAX_NUM_INTER_PARTS];
            S32 ai4_inv_wt[MAX_NUM_INTER_PARTS];
            S32 ai4_inv_wt_shift_val[MAX_NUM_INTER_PARTS];

            WORD32 part_type = ps_part_type_results[i].u1_part_type;
            WORD32 e_cu_size = ps_cu_results->u1_cu_size;
            WORD32 e_blk_size = ge_cu_size_to_blk_size[e_cu_size];
            U08 u1_num_parts = gau1_num_parts_in_part_type[part_type];
            U08 u1_inp_buf_idx = UCHAR_MAX;

            ps_err_prms->i4_part_mask = i4_part_mask;
            ps_err_prms->i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
            ps_err_prms->i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
            ps_err_prms->pu1_ref = ps_part_type_results[i].pu1_pred;
            ps_err_prms->i4_ref_stride = ps_part_type_results[i].i4_pred_stride;

            /* Current offset for the present part type */
            i4_inp_off = ps_cu_results->i4_inp_offset;

            ps_best_results = &(ps_part_type_results[i]);

            part_type_cost = 0;
            lambda = ps_inter_ctb_prms->i4_lamda;
            lambda_qshift = ps_inter_ctb_prms->u1_lamda_qshift;

            for(j = 0; j < u1_num_parts; j++)
            {
                ps_pu_result = &(ps_best_results->as_pu_results[j]);

                pred_dir = ps_pu_result->pu.b2_pred_mode;

                if(PRED_L0 == pred_dir)
                {
                    apu1_inp[j] =
                        ps_inter_ctb_prms->apu1_wt_inp[PRED_L0][ps_pu_result->pu.mv.i1_l0_ref_idx] +
                        i4_inp_off;
                    ai2_wt[j] =
                        ps_inter_ctb_prms->pps_rec_list_l0[ps_pu_result->pu.mv.i1_l0_ref_idx]
                            ->s_weight_offset.i2_luma_weight;
                    ai4_inv_wt[j] =
                        ps_inter_ctb_prms->pi4_inv_wt
                            [ps_inter_ctb_prms->pi1_past_list[ps_pu_result->pu.mv.i1_l0_ref_idx]];
                    ai4_inv_wt_shift_val[j] =
                        ps_inter_ctb_prms->pi4_inv_wt_shift_val
                            [ps_inter_ctb_prms->pi1_past_list[ps_pu_result->pu.mv.i1_l0_ref_idx]];
                }
                else if(PRED_L1 == pred_dir)
                {
                    apu1_inp[j] =
                        ps_inter_ctb_prms->apu1_wt_inp[PRED_L1][ps_pu_result->pu.mv.i1_l1_ref_idx] +
                        i4_inp_off;
                    ai2_wt[j] =
                        ps_inter_ctb_prms->pps_rec_list_l1[ps_pu_result->pu.mv.i1_l1_ref_idx]
                            ->s_weight_offset.i2_luma_weight;
                    ai4_inv_wt[j] =
                        ps_inter_ctb_prms->pi4_inv_wt
                            [ps_inter_ctb_prms->pi1_future_list[ps_pu_result->pu.mv.i1_l1_ref_idx]];
                    ai4_inv_wt_shift_val[j] =
                        ps_inter_ctb_prms->pi4_inv_wt_shift_val
                            [ps_inter_ctb_prms->pi1_future_list[ps_pu_result->pu.mv.i1_l1_ref_idx]];
                }
                else if(PRED_BI == pred_dir)
                {
                    apu1_inp[j] = ps_inter_ctb_prms->pu1_non_wt_inp + i4_inp_off;
                    ai2_wt[j] = 1 << ps_inter_ctb_prms->wpred_log_wdc;
                    ai4_inv_wt[j] = i4_default_src_wt;
                    ai4_inv_wt_shift_val[j] = 0;
                }
                else
                {
                    ASSERT(0);
                }

                part_type_cost += ps_pu_result->i4_mv_cost;
            }

            if((u1_num_parts == 1) || (ai2_wt[0] == ai2_wt[1]))
            {
                ps_err_prms->pu1_inp = apu1_inp[0];
                ps_err_prms->i4_inp_stride = ps_inter_ctb_prms->i4_inp_stride;
                i2_wght = ai2_wt[0];
            }
            else
            {
                if(1 != ihevce_get_free_pred_buf_indices(
                            &u1_inp_buf_idx,
                            &ps_inter_ctb_prms->s_pred_buf_mngr.u4_pred_buf_usage_indicator,
                            1))
                {
                    ASSERT(0);
                }
                else
                {
                    U08 *pu1_dst =
                        ps_inter_ctb_prms->s_pred_buf_mngr.apu1_pred_bufs[u1_inp_buf_idx];
                    U08 *pu1_src = apu1_inp[0];
                    U08 u1_pu1_wd = (ps_part_type_results[i].as_pu_results[0].pu.b4_wd + 1) << 2;
                    U08 u1_pu1_ht = (ps_part_type_results[i].as_pu_results[0].pu.b4_ht + 1) << 2;
                    U08 u1_pu2_wd = (ps_part_type_results[i].as_pu_results[1].pu.b4_wd + 1) << 2;
                    U08 u1_pu2_ht = (ps_part_type_results[i].as_pu_results[1].pu.b4_ht + 1) << 2;

                    ps_cmn_utils_optimised_function_list->pf_copy_2d(
                        pu1_dst,
                        MAX_CU_SIZE,
                        pu1_src,
                        ps_inter_ctb_prms->i4_inp_stride,
                        u1_pu1_wd,
                        u1_pu1_ht);

                    pu1_dst +=
                        (gai1_is_part_vertical[ge_part_type_to_part_id[part_type][0]]
                             ? u1_pu1_ht * MAX_CU_SIZE
                             : u1_pu1_wd);
                    pu1_src =
                        apu1_inp[1] + (gai1_is_part_vertical[ge_part_type_to_part_id[part_type][0]]
                                           ? u1_pu1_ht * ps_inter_ctb_prms->i4_inp_stride
                                           : u1_pu1_wd);

                    ps_cmn_utils_optimised_function_list->pf_copy_2d(
                        pu1_dst,
                        MAX_CU_SIZE,
                        pu1_src,
                        ps_inter_ctb_prms->i4_inp_stride,
                        u1_pu2_wd,
                        u1_pu2_ht);

                    ps_err_prms->pu1_inp =
                        ps_inter_ctb_prms->s_pred_buf_mngr.apu1_pred_bufs[u1_inp_buf_idx];
                    ps_err_prms->i4_inp_stride = MAX_CU_SIZE;
                    i2_wght = ai2_wt[1];
                }
            }

#if !DISABLE_TU_RECURSION
            i4_satd = apf_err_compute[e_cu_size](
                ps_err_prms,
                lambda,
                lambda_qshift,
                ps_inter_ctb_prms->i4_qstep_ls8,
                ps_ctxt->ps_func_selector);
#else
            ps_err_prms->pi4_sad_grid = &i4_satd;

            pf_err_compute(ps_err_prms);

            if((part_type == PRT_2Nx2N) || (e_cu_size != CU_64x64))
            {
                ai4_tu_split_flag[0] = 1;
                ai4_tu_split_flag[1] = 1;
                ai4_tu_split_flag[2] = 1;
                ai4_tu_split_flag[3] = 1;

                ps_err_prms->i4_tu_split_cost = 0;
            }
            else
            {
                ai4_tu_split_flag[0] = 1;
                ai4_tu_split_flag[1] = 1;
                ai4_tu_split_flag[2] = 1;
                ai4_tu_split_flag[3] = 1;

                ps_err_prms->i4_tu_split_cost = 0;
            }
#endif

#if UNI_SATD_SCALE
            i4_satd = (i4_satd * i2_wght) >> ps_inter_ctb_prms->wpred_log_wdc;
#endif

            if(ps_inter_ctb_prms->u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
            {
                ULWORD64 u8_temp_var, u8_temp_var1, u8_pred_sigmaSquaredX;
                ULWORD64 u8_src_variance, u8_pred_variance;
                unsigned long u4_shift_val;
                S32 i4_bits_req;
                S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;

                if(1 == u1_num_parts)
                {
                    u8_pred_sigmaSquaredX = au8_pred_sigmaX[i][0] * au8_pred_sigmaX[i][0];
                    u8_pred_variance = au8_pred_sigmaXSquare[i][0] - u8_pred_sigmaSquaredX;

                    if(e_cu_size == CU_8x8)
                    {
                        PART_ID_T e_part_id = (PART_ID_T)(
                            (PART_ID_NxN_TL) + (ps_cu_results->u1_x_off & 1) +
                            ((ps_cu_results->u1_y_off & 1) << 1));

                        u4_shift_val = ihevce_calc_stim_injected_variance(
                            ps_inter_ctb_prms->pu8_part_src_sigmaX,
                            ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
                            &u8_src_variance,
                            ai4_inv_wt[0],
                            ai4_inv_wt_shift_val[0],
                            ps_inter_ctb_prms->wpred_log_wdc,
                            e_part_id);
                    }
                    else
                    {
                        u4_shift_val = ihevce_calc_stim_injected_variance(
                            ps_inter_ctb_prms->pu8_part_src_sigmaX,
                            ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
                            &u8_src_variance,
                            ai4_inv_wt[0],
                            ai4_inv_wt_shift_val[0],
                            ps_inter_ctb_prms->wpred_log_wdc,
                            e_part_id);
                    }

                    u8_pred_variance = u8_pred_variance >> u4_shift_val;

                    GETRANGE64(i4_bits_req, u8_pred_variance);

                    if(i4_bits_req > 27)
                    {
                        u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
                        u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
                    }

                    if(u8_src_variance == u8_pred_variance)
                    {
                        u8_temp_var = (1 << STIM_Q_FORMAT);
                    }
                    else
                    {
                        u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
                        u8_temp_var1 = (u8_src_variance * u8_src_variance) +
                                       (u8_pred_variance * u8_pred_variance);
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
                    }

                    i4_noise_term = (UWORD32)u8_temp_var;

                    ASSERT(i4_noise_term >= 0);

                    i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;

                    u8_temp_var = i4_satd;
                    u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
                    u8_temp_var += (1 << ((i4_q_level)-1));
                    i4_satd = (UWORD32)(u8_temp_var >> (i4_q_level));
                }
                else /*if(e_cu_size <= CU_16x16)*/
                {
                    unsigned long temp_shift_val;
                    PART_ID_T ae_part_id[MAX_NUM_INTER_PARTS] = {
                        ge_part_type_to_part_id[part_type][0], ge_part_type_to_part_id[part_type][1]
                    };

                    u4_shift_val = ihevce_calc_variance_for_diff_weights(
                        ps_inter_ctb_prms->pu8_part_src_sigmaX,
                        ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
                        &u8_src_variance,
                        ai4_inv_wt,
                        ai4_inv_wt_shift_val,
                        ps_best_results->as_pu_results,
                        ps_inter_ctb_prms->wpred_log_wdc,
                        ae_part_id,
                        gau1_blk_size_to_wd[e_blk_size],
                        u1_num_parts,
                        1);

                    temp_shift_val = u4_shift_val;

                    u4_shift_val = ihevce_calc_variance_for_diff_weights(
                        au8_pred_sigmaX[i],
                        au8_pred_sigmaXSquare[i],
                        &u8_pred_variance,
                        ai4_inv_wt,
                        ai4_inv_wt_shift_val,
                        ps_best_results->as_pu_results,
                        0,
                        ae_part_id,
                        gau1_blk_size_to_wd[e_blk_size],
                        u1_num_parts,
                        0);

                    u8_pred_variance = u8_pred_variance >> temp_shift_val;

                    GETRANGE64(i4_bits_req, u8_pred_variance);

                    if(i4_bits_req > 27)
                    {
                        u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
                        u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
                    }

                    if(u8_src_variance == u8_pred_variance)
                    {
                        u8_temp_var = (1 << STIM_Q_FORMAT);
                    }
                    else
                    {
                        u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
                        u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
                        u8_temp_var1 = (u8_src_variance * u8_src_variance) +
                                       (u8_pred_variance * u8_pred_variance);
                        u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
                        u8_temp_var = (u8_temp_var / u8_temp_var1);
                    }

                    i4_noise_term = (UWORD32)u8_temp_var;

                    ASSERT(i4_noise_term >= 0);
                    ASSERT(i4_noise_term <= (1 << (STIM_Q_FORMAT + ALPHA_Q_FORMAT)));

                    i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;

                    u8_temp_var = i4_satd;
                    u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
                    u8_temp_var += (1 << ((i4_q_level)-1));
                    i4_satd = (UWORD32)(u8_temp_var >> (i4_q_level));

                    ASSERT(i4_satd >= 0);
                }
            }

            if(u1_inp_buf_idx != UCHAR_MAX)
            {
                ihevce_set_pred_buf_as_free(
                    &ps_inter_ctb_prms->s_pred_buf_mngr.u4_pred_buf_usage_indicator,
                    u1_inp_buf_idx);
            }

            part_type_cost += i4_satd;

            /*Update the best results with the new results */
            ps_best_results->i4_tot_cost = part_type_cost;

            ps_best_results->i4_tu_split_cost = ps_err_prms->i4_tu_split_cost;

            ASSERT(ai4_tu_split_flag[0] >= 0);
            if(e_cu_size == CU_64x64)
            {
                ps_best_results->ai4_tu_split_flag[0] = ai4_tu_split_flag[0];
                ps_best_results->ai4_tu_split_flag[1] = ai4_tu_split_flag[1];
                ps_best_results->ai4_tu_split_flag[2] = ai4_tu_split_flag[2];
                ps_best_results->ai4_tu_split_flag[3] = ai4_tu_split_flag[3];

                /* Update the TU early cbf flags into the best results structure */
                ps_best_results->ai4_tu_early_cbf[0] = ai4_tu_early_cbf[0];
                ps_best_results->ai4_tu_early_cbf[1] = ai4_tu_early_cbf[1];
                ps_best_results->ai4_tu_early_cbf[2] = ai4_tu_early_cbf[2];
                ps_best_results->ai4_tu_early_cbf[3] = ai4_tu_early_cbf[3];
            }
            else
            {
                ps_best_results->ai4_tu_split_flag[0] = ai4_tu_split_flag[0];
                ps_best_results->ai4_tu_early_cbf[0] = ai4_tu_early_cbf[0];
            }

            if(part_type_cost < best_cost[num_best_cand - 1])
            {
                /* Push and sort current part type if it is one of the num_best_cand */
                for(cand_idx = 0; cand_idx < i; cand_idx++)
                {
                    if(part_type_cost <= best_cost[cand_idx])
                    {
                        memmove(
                            &ai4_final_idx[cand_idx + 1],
                            &ai4_final_idx[cand_idx],
                            sizeof(WORD32) * (i - cand_idx));
                        memmove(
                            &best_cost[cand_idx + 1],
                            &best_cost[cand_idx],
                            sizeof(WORD32) * (i - cand_idx));
                        break;
                    }
                }

                ai4_final_idx[cand_idx] = i;
                best_cost[cand_idx] = part_type_cost;
            }
        }

        ps_cu_results->u1_num_best_results = num_best_cand;

        for(i = 0; i < num_best_cand; i++)
        {
            ASSERT(ai4_final_idx[i] < num_best_cand);

            if(ai4_final_idx[i] != -1)
            {
                memcpy(
                    &(ps_cu_results->ps_best_results[i]),
                    &(ps_part_type_results[ai4_final_idx[i]]),
                    sizeof(part_type_results_t));
            }
        }
    }

    for(i = 0; i < (MAX_NUM_PRED_BUFS_USED_FOR_PARTTYPE_DECISIONS)-2; i++)
    {
        ihevce_set_pred_buf_as_free(
            &ps_inter_ctb_prms->s_pred_buf_mngr.u4_pred_buf_usage_indicator, i);
    }
}

/**
**************************************************************************************************
*  @fn     hme_populate_pus(search_results_t *ps_search_results, inter_cu_results_t *ps_cu_results)
*
*  @brief Does the population of the inter_cu_results structure with the results after the
*           subpel refinement
*
*          This is called post subpel refinmenent for 16x16s, 8x8s and
*          for post merge evaluation for 32x32,64x64 CUs
*
*  @param[in,out] ps_search_results : Search results data structure
*                 - ps_cu_results : cu_results data structure
*                   ps_pu_result  : Pointer to the memory for storing PU's
*
****************************************************************************************************
*/
void hme_populate_pus(
    me_ctxt_t *ps_thrd_ctxt,
    me_frm_ctxt_t *ps_ctxt,
    hme_subpel_prms_t *ps_subpel_prms,
    search_results_t *ps_search_results,
    inter_cu_results_t *ps_cu_results,
    inter_pu_results_t *ps_pu_results,
    pu_result_t *ps_pu_result,
    inter_ctb_prms_t *ps_inter_ctb_prms,
    wgt_pred_ctxt_t *ps_wt_prms,
    layer_ctxt_t *ps_curr_layer,
    U08 *pu1_pred_dir_searched,
    WORD32 i4_num_active_ref)
{
    WORD32 i, j, k;
    WORD32 i4_part_mask;
    WORD32 i4_ref;
    UWORD8 e_part_id;
    pu_result_t *ps_curr_pu;
    search_node_t *ps_search_node;
    part_attr_t *ps_part_attr;
    UWORD8 e_cu_size = ps_search_results->e_cu_size;
    WORD32 num_results_per_part_l0 = 0;
    WORD32 num_results_per_part_l1 = 0;
    WORD32 i4_ref_id;
    WORD32 i4_total_act_ref;

    i4_part_mask = ps_search_results->i4_part_mask;

    /* pred_buf_mngr init */
    {
        hme_get_wkg_mem(&ps_ctxt->s_buf_mgr, MAX_WKG_MEM_SIZE_PER_THREAD);

        ps_inter_ctb_prms->s_pred_buf_mngr.u4_pred_buf_usage_indicator = UINT_MAX;

        for(i = 0; i < MAX_NUM_PRED_BUFS_USED_FOR_PARTTYPE_DECISIONS - 2; i++)
        {
            ps_inter_ctb_prms->s_pred_buf_mngr.apu1_pred_bufs[i] =
                ps_ctxt->s_buf_mgr.pu1_wkg_mem + i * INTERP_OUT_BUF_SIZE;
            ps_inter_ctb_prms->s_pred_buf_mngr.u4_pred_buf_usage_indicator &= ~(1 << i);
        }

        ps_inter_ctb_prms->pu1_wkg_mem = ps_ctxt->s_buf_mgr.pu1_wkg_mem + i * INTERP_OUT_BUF_SIZE;
    }

    ps_inter_ctb_prms->i4_alpha_stim_multiplier = ALPHA_FOR_NOISE_TERM_IN_ME;
    ps_inter_ctb_prms->u1_is_cu_noisy = ps_subpel_prms->u1_is_cu_noisy;
    ps_inter_ctb_prms->i4_lamda = ps_search_results->as_pred_ctxt[0].lambda;

    /* Populate the CU level parameters */
    ps_cu_results->u1_cu_size = ps_search_results->e_cu_size;
    ps_cu_results->u1_num_best_results = ps_search_results->u1_num_best_results;
    ps_cu_results->i4_part_mask = ps_search_results->i4_part_mask;
    ps_cu_results->u1_x_off = ps_search_results->u1_x_off;
    ps_cu_results->u1_y_off = ps_search_results->u1_y_off;

    i4_total_act_ref =
        ps_ctxt->s_frm_prms.u1_num_active_ref_l0 + ps_ctxt->s_frm_prms.u1_num_active_ref_l1;
    /*Populate the partition results
    Loop across all the active references that are enabled right now */
    for(i = 0; i < MAX_PART_TYPES; i++)
    {
        if(!(i4_part_mask & gai4_part_type_to_part_mask[i]))
        {
            continue;
        }

        for(j = 0; j < gau1_num_parts_in_part_type[i]; j++)
        {
            /* Partition ID for the current PU */
            e_part_id = (UWORD8)ge_part_type_to_part_id[i][j];
            ps_part_attr = &gas_part_attr_in_cu[e_part_id];

            num_results_per_part_l0 = 0;
            num_results_per_part_l1 = 0;

            ps_pu_results->aps_pu_results[0][e_part_id] =
                ps_pu_result + (e_part_id * MAX_NUM_RESULTS_PER_PART_LIST);
            ps_pu_results->aps_pu_results[1][e_part_id] =
                ps_pu_result + ((e_part_id + TOT_NUM_PARTS) * MAX_NUM_RESULTS_PER_PART_LIST);

            for(i4_ref = 0; i4_ref < i4_num_active_ref; i4_ref++)
            {
                U08 u1_pred_dir = pu1_pred_dir_searched[i4_ref];

                for(k = 0; k < ps_search_results->u1_num_results_per_part; k++)
                {
                    ps_search_node =
                        &ps_search_results->aps_part_results[u1_pred_dir][e_part_id][k];

                    /* If subpel is done then the node is a valid candidate else break the loop */
                    if(ps_search_node->u1_subpel_done)
                    {
                        i4_ref_id = ps_search_node->i1_ref_idx;

                        ASSERT(i4_ref_id >= 0);

                        /* Check whether current ref_id is past or future and assign the pointers to L0 or L1 list accordingly */
                        if(!u1_pred_dir)
                        {
                            ps_curr_pu = ps_pu_results->aps_pu_results[0][e_part_id] +
                                         num_results_per_part_l0;

                            ASSERT(
                                ps_ctxt->a_ref_idx_lc_to_l0[i4_ref_id] <
                                ps_inter_ctb_prms->u1_num_active_ref_l0);

                            /* Always populate the ref_idx value in l0_ref_idx */
                            ps_curr_pu->pu.mv.i1_l0_ref_idx =
                                ps_ctxt->a_ref_idx_lc_to_l0[i4_ref_id];
                            ps_curr_pu->pu.mv.s_l0_mv = ps_search_node->s_mv;
                            ps_curr_pu->pu.mv.i1_l1_ref_idx = -1;
                            ps_curr_pu->pu.b2_pred_mode = PRED_L0;

                            ps_inter_ctb_prms->apu1_wt_inp[0][ps_curr_pu->pu.mv.i1_l0_ref_idx] =
                                ps_wt_prms->apu1_wt_inp[i4_ref_id];

                            num_results_per_part_l0++;
                        }
                        else
                        {
                            ps_curr_pu = ps_pu_results->aps_pu_results[1][e_part_id] +
                                         num_results_per_part_l1;

                            ASSERT(
                                ps_ctxt->a_ref_idx_lc_to_l1[i4_ref_id] <
                                ps_inter_ctb_prms->u1_num_active_ref_l1);

                            /* populate the ref_idx value in l1_ref_idx */
                            ps_curr_pu->pu.mv.i1_l1_ref_idx =
                                ps_ctxt->a_ref_idx_lc_to_l1[i4_ref_id];
                            ps_curr_pu->pu.mv.s_l1_mv = ps_search_node->s_mv;
                            ps_curr_pu->pu.mv.i1_l0_ref_idx = -1;
                            ps_curr_pu->pu.b2_pred_mode = PRED_L1;

                            /* Copy the values from weighted params to common_frm_aprams */
                            ps_inter_ctb_prms->apu1_wt_inp[1][ps_curr_pu->pu.mv.i1_l1_ref_idx] =
                                ps_wt_prms->apu1_wt_inp[i4_ref_id];

                            num_results_per_part_l1++;
                        }
                        ps_curr_pu->i4_mv_cost = ps_search_node->i4_mv_cost;
                        ps_curr_pu->i4_sdi = ps_search_node->i4_sdi;

#if UNI_SATD_SCALE
                        /*SATD is scaled by weight. Hence rescale the SATD */
                        ps_curr_pu->i4_tot_cost =
                            ((ps_search_node->i4_sad *
                                  ps_ctxt->s_wt_pred.a_wpred_wt[ps_search_node->i1_ref_idx] +
                              (1 << (ps_inter_ctb_prms->wpred_log_wdc - 1))) >>
                             ps_inter_ctb_prms->wpred_log_wdc) +
                            ps_search_node->i4_mv_cost;
#endif

                        /* Packed format of the width and height */
                        ps_curr_pu->pu.b4_wd = ((ps_part_attr->u1_x_count << e_cu_size) >> 2) - 1;
                        ps_curr_pu->pu.b4_ht = ((ps_part_attr->u1_y_count << e_cu_size) >> 2) - 1;

                        ps_curr_pu->pu.b4_pos_x =
                            (((ps_part_attr->u1_x_start << e_cu_size) + ps_cu_results->u1_x_off) >>
                             2);
                        ps_curr_pu->pu.b4_pos_y =
                            (((ps_part_attr->u1_y_start << e_cu_size) + ps_cu_results->u1_y_off) >>
                             2);

                        ps_curr_pu->pu.b1_intra_flag = 0;

                        /* Unweighted input */
                        ps_inter_ctb_prms->pu1_non_wt_inp =
                            ps_wt_prms->apu1_wt_inp[i4_total_act_ref];

                        ps_search_node++;
                    }
                    else
                    {
                        break;
                    }
                }
            }

            ps_pu_results->u1_num_results_per_part_l0[e_part_id] = num_results_per_part_l0;
            ps_pu_results->u1_num_results_per_part_l1[e_part_id] = num_results_per_part_l1;
        }
    }
}

/**
*********************************************************************************************************
*  @fn     hme_populate_pus_8x8_cu(search_results_t *ps_search_results, inter_cu_results_t *ps_cu_results)
*
*  @brief Does the population of the inter_cu_results structure with the results after the
*           subpel refinement
*
*          This is called post subpel refinmenent for 16x16s, 8x8s and
*          for post merge evaluation for 32x32,64x64 CUs
*
*  @param[in,out] ps_search_results : Search results data structure
*                 - ps_cu_results : cu_results data structure
*                   ps_pu_results : Pointer for the PU's
*                   ps_pu_result  : Pointer to the memory for storing PU's
*
*********************************************************************************************************
*/
void hme_populate_pus_8x8_cu(
    me_ctxt_t *ps_thrd_ctxt,
    me_frm_ctxt_t *ps_ctxt,
    hme_subpel_prms_t *ps_subpel_prms,
    search_results_t *ps_search_results,
    inter_cu_results_t *ps_cu_results,
    inter_pu_results_t *ps_pu_results,
    pu_result_t *ps_pu_result,
    inter_ctb_prms_t *ps_inter_ctb_prms,
    U08 *pu1_pred_dir_searched,
    WORD32 i4_num_active_ref,
    U08 u1_blk_8x8_mask)
{
    WORD32 i, k;
    WORD32 i4_part_mask;
    WORD32 i4_ref;
    pu_result_t *ps_curr_pu;
    search_node_t *ps_search_node;
    WORD32 i4_ref_id;
    WORD32 x_off, y_off;

    /* Make part mask available as only 2Nx2N
    Later support for 4x8 and 8x4 needs to be added */
    i4_part_mask = ENABLE_2Nx2N;

    x_off = ps_search_results->u1_x_off;
    y_off = ps_search_results->u1_y_off;

    for(i = 0; i < 4; i++)
    {
        if(u1_blk_8x8_mask & (1 << i))
        {
            UWORD8 u1_x_pos, u1_y_pos;

            WORD32 num_results_per_part_l0 = 0;
            WORD32 num_results_per_part_l1 = 0;

            ps_cu_results->u1_cu_size = CU_8x8;
            ps_cu_results->u1_num_best_results = ps_search_results->u1_num_best_results;
            ps_cu_results->i4_part_mask = i4_part_mask;
            ps_cu_results->u1_x_off = x_off + (i & 1) * 8;
            ps_cu_results->u1_y_off = y_off + (i >> 1) * 8;
            ps_cu_results->i4_inp_offset = ps_cu_results->u1_x_off + (ps_cu_results->u1_y_off * 64);

            ps_cu_results->ps_best_results[0].i4_tot_cost = MAX_32BIT_VAL;
            ps_cu_results->ps_best_results[0].i4_tu_split_cost = 0;

            u1_x_pos = ps_cu_results->u1_x_off >> 2;
            u1_y_pos = ps_cu_results->u1_y_off >> 2;

            if(!(ps_search_results->i4_part_mask & ENABLE_NxN))
            {
                ps_curr_pu = &ps_cu_results->ps_best_results[0].as_pu_results[0];

                ps_cu_results->i4_part_mask = 0;
                ps_cu_results->u1_num_best_results = 0;

                ps_curr_pu->i4_tot_cost = MAX_32BIT_VAL;

                ps_curr_pu->pu.b4_wd = 1;
                ps_curr_pu->pu.b4_ht = 1;
                ps_curr_pu->pu.b4_pos_x = u1_x_pos;
                ps_curr_pu->pu.b4_pos_y = u1_y_pos;
                ps_cu_results->ps_best_results[0].i4_tu_split_cost = 0;

                ps_cu_results++;
                ps_pu_results++;

                continue;
            }

            ps_pu_results->aps_pu_results[0][0] =
                ps_pu_result + (i * MAX_NUM_RESULTS_PER_PART_LIST);
            ps_pu_results->aps_pu_results[1][0] =
                ps_pu_result + ((i + TOT_NUM_PARTS) * MAX_NUM_RESULTS_PER_PART_LIST);

            for(i4_ref = 0; i4_ref < i4_num_active_ref; i4_ref++)
            {
                U08 u1_pred_dir = pu1_pred_dir_searched[i4_ref];

                /* Select the NxN partition node for the current ref_idx in the search results*/
                ps_search_node =
                    ps_search_results->aps_part_results[u1_pred_dir][PART_ID_NxN_TL + i];

                for(k = 0; k < ps_search_results->u1_num_results_per_part; k++)
                {
                    /* If subpel is done then the node is a valid candidate else break the loop */
                    if((ps_search_node->u1_is_avail) || (ps_search_node->u1_subpel_done))
                    {
                        i4_ref_id = ps_search_node->i1_ref_idx;

                        ASSERT(i4_ref_id >= 0);

                        if(!u1_pred_dir)
                        {
                            ps_curr_pu =
                                ps_pu_results->aps_pu_results[0][0] + num_results_per_part_l0;

                            ASSERT(
                                ps_ctxt->a_ref_idx_lc_to_l0[i4_ref_id] <
                                ps_inter_ctb_prms->u1_num_active_ref_l0);

                            ps_curr_pu->pu.mv.i1_l0_ref_idx =
                                ps_ctxt->a_ref_idx_lc_to_l0[i4_ref_id];
                            ps_curr_pu->pu.mv.s_l0_mv = ps_search_node->s_mv;
                            ps_curr_pu->pu.mv.i1_l1_ref_idx = -1;
                            ps_curr_pu->pu.b2_pred_mode = PRED_L0;

                            num_results_per_part_l0++;
                        }
                        else
                        {
                            ps_curr_pu =
                                ps_pu_results->aps_pu_results[1][0] + num_results_per_part_l1;

                            ASSERT(
                                ps_ctxt->a_ref_idx_lc_to_l1[i4_ref_id] <
                                ps_inter_ctb_prms->u1_num_active_ref_l1);

                            ps_curr_pu->pu.mv.i1_l1_ref_idx =
                                ps_ctxt->a_ref_idx_lc_to_l1[i4_ref_id];
                            ps_curr_pu->pu.mv.s_l1_mv = ps_search_node->s_mv;
                            ps_curr_pu->pu.mv.i1_l0_ref_idx = -1;
                            ps_curr_pu->pu.b2_pred_mode = PRED_L1;

                            num_results_per_part_l1++;
                        }
                        ps_curr_pu->i4_mv_cost = ps_search_node->i4_mv_cost;
                        ps_curr_pu->i4_sdi = ps_search_node->i4_sdi;

#if UNI_SATD_SCALE
                        /*SATD is scaled by weight. Hence rescale the SATD */
                        ps_curr_pu->i4_tot_cost =
                            ((ps_search_node->i4_sad *
                                  ps_ctxt->s_wt_pred.a_wpred_wt[ps_search_node->i1_ref_idx] +
                              (1 << (ps_inter_ctb_prms->wpred_log_wdc - 1))) >>
                             ps_inter_ctb_prms->wpred_log_wdc) +
                            ps_search_node->i4_mv_cost;
#endif

                        ps_curr_pu->pu.b4_wd = 1;
                        ps_curr_pu->pu.b4_ht = 1;
                        ps_curr_pu->pu.b4_pos_x = u1_x_pos;
                        ps_curr_pu->pu.b4_pos_y = u1_y_pos;
                        ps_curr_pu->pu.b1_intra_flag = 0;

                        ps_search_node++;
                    }
                    else
                    {
                        /* if NxN was not evaluated at 16x16 level, assign max cost to 8x8 CU
                        to remove 8x8's as possible candidates during evaluation */

                        ps_curr_pu = ps_pu_results->aps_pu_results[0][0] + num_results_per_part_l0;

                        ps_curr_pu->i4_tot_cost = MAX_32BIT_VAL;

                        ps_curr_pu = ps_pu_results->aps_pu_results[1][0] + num_results_per_part_l1;

                        ps_curr_pu->i4_tot_cost = MAX_32BIT_VAL;

                        break;
                    }
                }
            }

            /* Update the num_results per_part across lists L0 and L1 */
            ps_pu_results->u1_num_results_per_part_l0[0] = num_results_per_part_l0;
            ps_pu_results->u1_num_results_per_part_l1[0] = num_results_per_part_l1;
        }
        ps_cu_results++;
        ps_pu_results++;
    }
}

/**
********************************************************************************
*  @fn     hme_insert_intra_nodes_post_bipred
*
*  @brief  Compares intra costs (populated by IPE) with the best inter costs
*          (populated after evaluating bi-pred) and updates the best results
*          if intra cost is better
*
*  @param[in,out]  ps_cu_results    [inout] : Best results structure of CU
*                  ps_cur_ipe_ctb   [in]    : intra results for the current CTB
*                  i4_frm_qstep     [in]    : current frame quantizer(qscale)*
*
*  @return None
********************************************************************************
*/
void hme_insert_intra_nodes_post_bipred(
    inter_cu_results_t *ps_cu_results,
    ipe_l0_ctb_analyse_for_me_t *ps_cur_ipe_ctb,
    WORD32 i4_frm_qstep)
{
    WORD32 i;
    WORD32 num_results;
    WORD32 cu_size = ps_cu_results->u1_cu_size;
    UWORD8 u1_x_off = ps_cu_results->u1_x_off;
    UWORD8 u1_y_off = ps_cu_results->u1_y_off;

    /* Id of the 32x32 block, 16x16 block in a CTB */
    WORD32 i4_32x32_id = (u1_y_off >> 5) * 2 + (u1_x_off >> 5);
    WORD32 i4_16x16_id = ((u1_y_off >> 4) & 0x1) * 2 + ((u1_x_off >> 4) & 0x1);

    /* Flags to indicate if intra64/intra32/intra16 cusize are invalid as per IPE decision */
    WORD32 disable_intra64 = 0;
    WORD32 disable_intra32 = 0;
    WORD32 disable_intra16 = 0;

    S32 i4_intra_2nx2n_cost;

    /* ME final results for this CU (post seeding of best uni/bi pred results) */
    part_type_results_t *ps_best_result;

    i4_frm_qstep *= !L0ME_IN_OPENLOOP_MODE;

    /*If inter candidates are enabled then enter the for loop to update the intra candidate */

    if((ps_cu_results->u1_num_best_results == 0) && (CU_8x8 == ps_cu_results->u1_cu_size))
    {
        ps_cu_results->u1_num_best_results = 1;
    }

    num_results = ps_cu_results->u1_num_best_results;

    ps_best_result = &ps_cu_results->ps_best_results[0];

    /* Disable intra16/32/64 flags based on split flags recommended by IPE */
    if(ps_cur_ipe_ctb->u1_split_flag)
    {
        disable_intra64 = 1;
        if(ps_cur_ipe_ctb->as_intra32_analyse[i4_32x32_id].b1_split_flag)
        {
            disable_intra32 = 1;

            if(ps_cur_ipe_ctb->as_intra32_analyse[i4_32x32_id]
                   .as_intra16_analyse[i4_16x16_id]
                   .b1_split_flag)
            {
                disable_intra16 = 1;
            }
        }
    }

    /* Derive the intra cost based on current cu size and offset */
    switch(cu_size)
    {
    case CU_8x8:
    {
        i4_intra_2nx2n_cost = ps_cur_ipe_ctb->ai4_best8x8_intra_cost[u1_y_off + (u1_x_off >> 3)];

        /* Accounting for coding noise in the open loop IPE cost */
        i4_intra_2nx2n_cost +=
            ((i4_frm_qstep * 16) >> 2) /*+ ((i4_frm_qstep*i4_intra_2nx2n_cost)/256) */;

        break;
    }

    case CU_16x16:
    {
        i4_intra_2nx2n_cost =
            ps_cur_ipe_ctb->ai4_best16x16_intra_cost[(u1_y_off >> 4) * 4 + (u1_x_off >> 4)];

        /* Accounting for coding noise in the open loop IPE cost */
        i4_intra_2nx2n_cost +=
            ((i4_frm_qstep * 16)); /* + ((i4_frm_qstep*i4_intra_2nx2n_cost)/256) */

        if(disable_intra16)
        {
            /* Disable intra 2Nx2N (intra 16) as IPE suggested best mode as 8x8 */
            i4_intra_2nx2n_cost = MAX_32BIT_VAL;
        }
        break;
    }

    case CU_32x32:
    {
        i4_intra_2nx2n_cost =
            ps_cur_ipe_ctb->ai4_best32x32_intra_cost[(u1_y_off >> 5) * 2 + (u1_x_off >> 5)];

        /* Accounting for coding noise in the open loop IPE cost */
        i4_intra_2nx2n_cost +=
            (i4_frm_qstep * 16 * 4) /* + ((i4_frm_qstep*i4_intra_2nx2n_cost)/256) */;

        if(disable_intra32)
        {
            /* Disable intra 2Nx2N (intra 32) as IPE suggested best mode as 16x16 or 8x8 */
            i4_intra_2nx2n_cost = MAX_32BIT_VAL;
        }
        break;
    }

    case CU_64x64:
    {
        i4_intra_2nx2n_cost = ps_cur_ipe_ctb->i4_best64x64_intra_cost;

        /* Accounting for coding noise in the open loop IPE cost */
        i4_intra_2nx2n_cost +=
            (i4_frm_qstep * 16 * 16) /* + ((i4_frm_qstep*i4_intra_2nx2n_cost)/256) */;

        if(disable_intra64)
        {
            /* Disable intra 2Nx2N (intra 64) as IPE suggested best mode as 32x32 /16x16 / 8x8 */
            i4_intra_2nx2n_cost = MAX_32BIT_VAL;
        }
        break;
    }

    default:
        ASSERT(0);
    }

    {
        /*****************************************************************/
        /* Intra / Inter cost comparison for  2Nx2N : cu size 8/16/32/64 */
        /* Identify where the current result isto be placed. Basically   */
        /* find the node which has cost just higher than node under test */
        /*****************************************************************/
        for(i = 0; i < num_results; i++)
        {
            /* Subtrqact the tu_spli_flag_cost from total_inter_cost for fair comparision */
            WORD32 inter_cost = ps_best_result[i].i4_tot_cost - ps_best_result[i].i4_tu_split_cost;

            if(i4_intra_2nx2n_cost < inter_cost)
            {
                if(i < (num_results - 1))
                {
                    memmove(
                        ps_best_result + i + 1,
                        ps_best_result + i,
                        sizeof(ps_best_result[0]) * (num_results - 1 - i));
                }

                /* Insert the intra node result */
                ps_best_result[i].u1_part_type = PRT_2Nx2N;
                ps_best_result[i].i4_tot_cost = i4_intra_2nx2n_cost;
                ps_best_result[i].ai4_tu_split_flag[0] = 0;
                ps_best_result[i].ai4_tu_split_flag[1] = 0;
                ps_best_result[i].ai4_tu_split_flag[2] = 0;
                ps_best_result[i].ai4_tu_split_flag[3] = 0;

                /* Populate intra flag, cost and default mvs, refidx for intra pu */
                ps_best_result[i].as_pu_results[0].i4_tot_cost = i4_intra_2nx2n_cost;
                //ps_best_result[i].as_pu_results[0].i4_sad = i4_intra_2nx2n_cost;
                ps_best_result[i].as_pu_results[0].i4_mv_cost = 0;
                ps_best_result[i].as_pu_results[0].pu.b1_intra_flag = 1;
                ps_best_result[i].as_pu_results[0].pu.mv.i1_l0_ref_idx = -1;
                ps_best_result[i].as_pu_results[0].pu.mv.i1_l1_ref_idx = -1;
                ps_best_result[i].as_pu_results[0].pu.mv.s_l0_mv.i2_mvx = INTRA_MV;
                ps_best_result[i].as_pu_results[0].pu.mv.s_l0_mv.i2_mvy = INTRA_MV;
                ps_best_result[i].as_pu_results[0].pu.mv.s_l1_mv.i2_mvx = INTRA_MV;
                ps_best_result[i].as_pu_results[0].pu.mv.s_l1_mv.i2_mvy = INTRA_MV;

                break;
            }
        }
    }
}

S32 hme_recompute_lambda_from_min_8x8_act_in_ctb(
    me_frm_ctxt_t *ps_ctxt, ipe_l0_ctb_analyse_for_me_t *ps_cur_ipe_ctb)
{
    double lambda;
    double lambda_modifier;
    WORD32 i4_cu_qp;
    frm_lambda_ctxt_t *ps_frm_lambda_ctxt;
    //ipe_l0_ctb_analyse_for_me_t *ps_cur_ipe_ctb;
    WORD32 i4_frame_qp;
    rc_quant_t *ps_rc_quant_ctxt;
    WORD32 i4_is_bpic;

    ps_frm_lambda_ctxt = &ps_ctxt->s_frm_lambda_ctxt;
    //ps_cur_ipe_ctb = ps_ctxt->ps_ipe_l0_ctb_frm_base;
    i4_frame_qp = ps_ctxt->s_frm_prms.i4_frame_qp;
    ps_rc_quant_ctxt = ps_ctxt->ps_rc_quant_ctxt;
    i4_is_bpic = ps_ctxt->s_frm_prms.bidir_enabled;

    i4_cu_qp = ps_rc_quant_ctxt->pi4_qp_to_qscale[i4_frame_qp + ps_rc_quant_ctxt->i1_qp_offset];

    {
        if(ps_ctxt->i4_l0me_qp_mod)
        {
#if MODULATE_LAMDA_WHEN_SPATIAL_MOD_ON
#if LAMDA_BASED_ON_QUANT
            WORD32 i4_activity = ps_cur_ipe_ctb->i4_64x64_act_factor[2][0];
#else
            WORD32 i4_activity = ps_cur_ipe_ctb->i4_64x64_act_factor[3][0];
#endif
            i4_cu_qp = (((i4_cu_qp)*i4_activity) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
                       QP_LEVEL_MOD_ACT_FACTOR;

#endif
        }
        if(i4_cu_qp > ps_rc_quant_ctxt->i2_max_qscale)
            i4_cu_qp = ps_rc_quant_ctxt->i2_max_qscale;
        else if(i4_cu_qp < ps_rc_quant_ctxt->i2_min_qscale)
            i4_cu_qp = ps_rc_quant_ctxt->i2_min_qscale;

        i4_cu_qp = ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_cu_qp];
    }

    if(i4_cu_qp > ps_rc_quant_ctxt->i2_max_qp)
        i4_cu_qp = ps_rc_quant_ctxt->i2_max_qp;
    else if(i4_cu_qp < ps_rc_quant_ctxt->i2_min_qp)
        i4_cu_qp = ps_rc_quant_ctxt->i2_min_qp;

    lambda = pow(2.0, (((double)(i4_cu_qp - 12)) / 3));

    lambda_modifier = ps_frm_lambda_ctxt->lambda_modifier;

    if(i4_is_bpic)
    {
        lambda_modifier = lambda_modifier * CLIP3((((double)(i4_cu_qp - 12)) / 6.0), 2.00, 4.00);
    }
    if(ps_ctxt->i4_use_const_lamda_modifier)
    {
        if(ps_ctxt->s_frm_prms.is_i_pic)
        {
            lambda_modifier = ps_ctxt->f_i_pic_lamda_modifier;
        }
        else
        {
            lambda_modifier = CONST_LAMDA_MOD_VAL;
        }
    }
    lambda *= lambda_modifier;

    return ((WORD32)(sqrt(lambda) * (1 << LAMBDA_Q_SHIFT)));
}

/**
********************************************************************************
*  @fn     hme_update_dynamic_search_params
*
*  @brief  Update the Dynamic search params based on the current MVs
*
*  @param[in,out]  ps_dyn_range_prms    [inout] : Dyn. Range Param str.
*                  i2_mvy               [in]    : current MV y comp.
*
*  @return None
********************************************************************************
*/
void hme_update_dynamic_search_params(dyn_range_prms_t *ps_dyn_range_prms, WORD16 i2_mvy)
{
    /* If MV is up large, update i2_dyn_max_y */
    if(i2_mvy > ps_dyn_range_prms->i2_dyn_max_y)
        ps_dyn_range_prms->i2_dyn_max_y = i2_mvy;
    /* If MV is down large, update i2_dyn_min_y */
    if(i2_mvy < ps_dyn_range_prms->i2_dyn_min_y)
        ps_dyn_range_prms->i2_dyn_min_y = i2_mvy;
}

void hme_add_new_node_to_a_sorted_array(
    search_node_t *ps_result_node,
    search_node_t **pps_sorted_array,
    U08 *pu1_shifts,
    U32 u4_num_results_updated,
    U08 u1_shift)
{
    U32 i;

    if(NULL == pu1_shifts)
    {
        S32 i4_cur_node_cost = ps_result_node->i4_tot_cost;

        for(i = 0; i < u4_num_results_updated; i++)
        {
            if(i4_cur_node_cost < pps_sorted_array[i]->i4_tot_cost)
            {
                memmove(
                    &pps_sorted_array[i + 1],
                    &pps_sorted_array[i],
                    (u4_num_results_updated - i) * sizeof(search_node_t *));

                break;
            }
        }
    }
    else
    {
        S32 i4_cur_node_cost =
            (u1_shift == 0) ? ps_result_node->i4_tot_cost
                            : (ps_result_node->i4_tot_cost + (1 << (u1_shift - 1))) >> u1_shift;

        for(i = 0; i < u4_num_results_updated; i++)
        {
            S32 i4_prev_node_cost = (pu1_shifts[i] == 0) ? pps_sorted_array[i]->i4_tot_cost
                                                         : (pps_sorted_array[i]->i4_tot_cost +
                                                            (1 << (pu1_shifts[i] - 1))) >>
                                                               pu1_shifts[i];

            if(i4_cur_node_cost < i4_prev_node_cost)
            {
                memmove(
                    &pps_sorted_array[i + 1],
                    &pps_sorted_array[i],
                    (u4_num_results_updated - i) * sizeof(search_node_t *));
                memmove(
                    &pu1_shifts[i + 1], &pu1_shifts[i], (u4_num_results_updated - i) * sizeof(U08));

                break;
            }
        }

        pu1_shifts[i] = u1_shift;
    }

    pps_sorted_array[i] = ps_result_node;
}

S32 hme_find_pos_of_implicitly_stored_ref_id(
    S08 *pi1_ref_idx, S08 i1_ref_idx, S32 i4_result_id, S32 i4_num_results)
{
    S32 i;

    for(i = 0; i < i4_num_results; i++)
    {
        if(i1_ref_idx == pi1_ref_idx[i])
        {
            if(0 == i4_result_id)
            {
                return i;
            }
            else
            {
                i4_result_id--;
            }
        }
    }

    return -1;
}

static __inline void hme_search_node_populator(
    search_node_t *ps_search_node, hme_mv_t *ps_mv, S08 i1_ref_idx, S08 i1_mv_magnitude_shift)
{
    ps_search_node->ps_mv->i2_mvx = SHL_NEG((WORD16)ps_mv->i2_mv_x, i1_mv_magnitude_shift);
    ps_search_node->ps_mv->i2_mvy = SHL_NEG((WORD16)ps_mv->i2_mv_y, i1_mv_magnitude_shift);
    ps_search_node->i1_ref_idx = i1_ref_idx;
    ps_search_node->u1_is_avail = 1;
    ps_search_node->u1_subpel_done = 0;
}

S32 hme_populate_search_candidates(fpel_srch_cand_init_data_t *ps_ctxt)
{
    hme_mv_t *ps_mv;

    S32 wd_c, ht_c, wd_p, ht_p;
    S32 blksize_p, blksize_c;
    S32 i;
    S08 *pi1_ref_idx;
    /* Cache for storing offsets */
    S32 ai4_cand_offsets[NUM_SEARCH_CAND_LOCATIONS];

    layer_ctxt_t *ps_curr_layer = ps_ctxt->ps_curr_layer;
    layer_ctxt_t *ps_coarse_layer = ps_ctxt->ps_coarse_layer;
    layer_mv_t *ps_coarse_layer_mvbank = ps_coarse_layer->ps_layer_mvbank;
    layer_mv_t *ps_curr_layer_mvbank = ps_curr_layer->ps_layer_mvbank;
    search_candt_t *ps_search_cands = ps_ctxt->ps_search_cands;
    hme_mv_t s_zero_mv = { 0 };

    S32 i4_pos_x = ps_ctxt->i4_pos_x;
    S32 i4_pos_y = ps_ctxt->i4_pos_y;
    S32 i4_num_act_ref_l0 = ps_ctxt->i4_num_act_ref_l0;
    S32 i4_num_act_ref_l1 = ps_ctxt->i4_num_act_ref_l1;
    U08 u1_pred_dir = ps_ctxt->u1_pred_dir;
    U08 u1_pred_dir_ctr = ps_ctxt->u1_pred_dir_ctr;
    U08 u1_num_results_in_curr_mvbank = ps_ctxt->u1_num_results_in_mvbank;
    U08 u1_num_results_in_coarse_mvbank =
        (u1_pred_dir == 0) ? (i4_num_act_ref_l0 * ps_coarse_layer_mvbank->i4_num_mvs_per_ref)
                           : (i4_num_act_ref_l1 * ps_coarse_layer_mvbank->i4_num_mvs_per_ref);
    S32 i4_init_offset_projected =
        (u1_pred_dir == 1) ? (i4_num_act_ref_l0 * ps_coarse_layer_mvbank->i4_num_mvs_per_ref) : 0;
    S32 i4_init_offset_spatial =
        (u1_pred_dir_ctr == 1)
            ? (ps_curr_layer_mvbank->i4_num_mvs_per_ref * u1_num_results_in_curr_mvbank)
            : 0;
    U08 u1_search_candidate_list_index = ps_ctxt->u1_search_candidate_list_index;
    U08 u1_max_num_search_cands =
        gau1_max_num_search_cands_in_l0_me[u1_search_candidate_list_index];
    S32 i4_num_srch_cands = MIN(u1_max_num_search_cands, ps_ctxt->i4_max_num_init_cands << 1);
    U16 u2_is_offset_available = 0;
    U08 u1_search_blk_to_spatial_mvbank_blk_size_factor = 1;

    /* Width and ht of current and prev layers */
    wd_c = ps_curr_layer->i4_wd;
    ht_c = ps_curr_layer->i4_ht;
    wd_p = ps_coarse_layer->i4_wd;
    ht_p = ps_coarse_layer->i4_ht;

    blksize_p = gau1_blk_size_to_wd_shift[ps_coarse_layer_mvbank->e_blk_size];
    blksize_c = gau1_blk_size_to_wd_shift[ps_curr_layer_mvbank->e_blk_size];

    /* ASSERT for valid sizes */
    ASSERT((blksize_p == 3) || (blksize_p == 4) || (blksize_p == 5));

    {
        S32 x = i4_pos_x >> 4;
        S32 y = i4_pos_y >> 4;

        if(blksize_c != gau1_blk_size_to_wd_shift[ps_ctxt->e_search_blk_size])
        {
            x *= 2;
            y *= 2;

            u1_search_blk_to_spatial_mvbank_blk_size_factor = 2;
        }

        i4_init_offset_spatial += (x + y * ps_curr_layer_mvbank->i4_num_blks_per_row) *
                                  ps_curr_layer_mvbank->i4_num_mvs_per_blk;
    }

    for(i = 0; i < i4_num_srch_cands; i++)
    {
        SEARCH_CANDIDATE_TYPE_T e_search_cand_type =
            gae_search_cand_priority_to_search_cand_type_map_in_l0_me[u1_search_candidate_list_index]
                                                                     [i];
        SEARCH_CAND_LOCATIONS_T e_search_cand_loc =
            gae_search_cand_type_to_location_map[e_search_cand_type];
        S08 i1_result_id = MIN(
            gai1_search_cand_type_to_result_id_map[e_search_cand_type],
            (e_search_cand_loc < 0 ? 0
                                   : ps_ctxt->pu1_num_fpel_search_cands[e_search_cand_loc] - 1));
        U08 u1_is_spatial_cand = (1 == gau1_search_cand_type_to_spatiality_map[e_search_cand_type]);
        U08 u1_is_proj_cand = (0 == gau1_search_cand_type_to_spatiality_map[e_search_cand_type]);
        U08 u1_is_zeroMV_cand = (ZERO_MV == e_search_cand_type) ||
                                (ZERO_MV_ALTREF == e_search_cand_type);

        /* When spatial candidates are available, use them, else use the projected candidates */
        /* This is required since some blocks will never have certain spatial candidates, and in order */
        /* to accomodate such instances in 'gae_search_cand_priority_to_search_cand_type_map_in_l0_me' list,  */
        /* all candidates apart from the 'LEFT' have been marked as projected */
        if(((e_search_cand_loc == TOPLEFT) || (e_search_cand_loc == TOP) ||
            (e_search_cand_loc == TOPRIGHT)) &&
           (i1_result_id < u1_num_results_in_curr_mvbank) && u1_is_proj_cand)
        {
            if(e_search_cand_loc == TOPLEFT)
            {
                u1_is_spatial_cand = ps_ctxt->u1_is_topLeft_available ||
                                     !ps_ctxt->u1_is_left_available;
            }
            else if(e_search_cand_loc == TOPRIGHT)
            {
                u1_is_spatial_cand = ps_ctxt->u1_is_topRight_available;
            }
            else
            {
                u1_is_spatial_cand = ps_ctxt->u1_is_top_available;
            }

            u1_is_proj_cand = !u1_is_spatial_cand;
        }

        switch(u1_is_zeroMV_cand + (u1_is_spatial_cand << 1) + (u1_is_proj_cand << 2))
        {
        case 1:
        {
            hme_search_node_populator(
                ps_search_cands[i].ps_search_node,
                &s_zero_mv,
                (ZERO_MV == e_search_cand_type) ? ps_ctxt->i1_default_ref_id
                                                : ps_ctxt->i1_alt_default_ref_id,
                0);

            break;
        }
        case 2:
        {
            S08 i1_mv_magnitude_shift = 0;

            S32 i4_offset = i4_init_offset_spatial;

            i1_result_id = MIN(i1_result_id, u1_num_results_in_curr_mvbank - 1);
            i4_offset += i1_result_id;

            switch(e_search_cand_loc)
            {
            case LEFT:
            {
                if(ps_ctxt->u1_is_left_available)
                {
                    i1_mv_magnitude_shift = -2;

                    i4_offset -= ps_curr_layer_mvbank->i4_num_mvs_per_blk;

                    ps_mv = ps_curr_layer_mvbank->ps_mv + i4_offset;
                    pi1_ref_idx = ps_curr_layer_mvbank->pi1_ref_idx + i4_offset;
                }
                else
                {
                    i1_mv_magnitude_shift = 0;

                    ps_mv = &s_zero_mv;
                    pi1_ref_idx = &ps_ctxt->i1_default_ref_id;
                }

                break;
            }
            case TOPLEFT:
            {
                if(ps_ctxt->u1_is_topLeft_available)
                {
                    i1_mv_magnitude_shift = -2;

                    i4_offset -= ps_curr_layer_mvbank->i4_num_mvs_per_blk;
                    i4_offset -= ps_curr_layer_mvbank->i4_num_mvs_per_row;

                    ps_mv = ps_curr_layer_mvbank->ps_mv + i4_offset;
                    pi1_ref_idx = ps_curr_layer_mvbank->pi1_ref_idx + i4_offset;
                }
                else
                {
                    i1_mv_magnitude_shift = 0;

                    ps_mv = &s_zero_mv;
                    pi1_ref_idx = &ps_ctxt->i1_default_ref_id;
                }

                break;
            }
            case TOP:
            {
                if(ps_ctxt->u1_is_top_available)
                {
                    i1_mv_magnitude_shift = -2;

                    i4_offset -= ps_curr_layer_mvbank->i4_num_mvs_per_row;

                    ps_mv = ps_curr_layer_mvbank->ps_mv + i4_offset;
                    pi1_ref_idx = ps_curr_layer_mvbank->pi1_ref_idx + i4_offset;
                }
                else
                {
                    i1_mv_magnitude_shift = 0;

                    ps_mv = &s_zero_mv;
                    pi1_ref_idx = &ps_ctxt->i1_default_ref_id;
                }

                break;
            }
            case TOPRIGHT:
            {
                if(ps_ctxt->u1_is_topRight_available)
                {
                    i1_mv_magnitude_shift = -2;

                    i4_offset += ps_curr_layer_mvbank->i4_num_mvs_per_blk *
                                 u1_search_blk_to_spatial_mvbank_blk_size_factor;
                    i4_offset -= ps_curr_layer_mvbank->i4_num_mvs_per_row;

                    ps_mv = ps_curr_layer_mvbank->ps_mv + i4_offset;
                    pi1_ref_idx = ps_curr_layer_mvbank->pi1_ref_idx + i4_offset;
                }
                else
                {
                    i1_mv_magnitude_shift = 0;
                    ps_mv = &s_zero_mv;
                    pi1_ref_idx = &ps_ctxt->i1_default_ref_id;
                }

                break;
            }
            default:
            {
                /* AiyAiyYo!! */
                ASSERT(0);
            }
            }

            hme_search_node_populator(
                ps_search_cands[i].ps_search_node, ps_mv, pi1_ref_idx[0], i1_mv_magnitude_shift);

            break;
        }
        case 4:
        {
            ASSERT(ILLUSORY_CANDIDATE != e_search_cand_type);
            ASSERT(ILLUSORY_LOCATION != e_search_cand_loc);

            i1_result_id = MIN(i1_result_id, u1_num_results_in_coarse_mvbank - 1);

            if(!(u2_is_offset_available & (1 << e_search_cand_loc)))
            {
                S32 x, y;

                x = i4_pos_x + gai4_search_cand_location_to_x_offset_map[e_search_cand_loc];
                y = i4_pos_y + gai4_search_cand_location_to_y_offset_map[e_search_cand_loc];

                /* Safety check to avoid uninitialized access across temporal layers */
                x = CLIP3(x, 0, (wd_c - blksize_p));
                y = CLIP3(y, 0, (ht_c - blksize_p));

                /* Project the positions to prev layer */
                x = x >> blksize_p;
                y = y >> blksize_p;

                ai4_cand_offsets[e_search_cand_loc] =
                    (x * ps_coarse_layer_mvbank->i4_num_mvs_per_blk);
                ai4_cand_offsets[e_search_cand_loc] +=
                    (y * ps_coarse_layer_mvbank->i4_num_mvs_per_row);
                ai4_cand_offsets[e_search_cand_loc] += i4_init_offset_projected;

                u2_is_offset_available |= (1 << e_search_cand_loc);
            }

            ps_mv =
                ps_coarse_layer_mvbank->ps_mv + ai4_cand_offsets[e_search_cand_loc] + i1_result_id;
            pi1_ref_idx = ps_coarse_layer_mvbank->pi1_ref_idx +
                          ai4_cand_offsets[e_search_cand_loc] + i1_result_id;

            hme_search_node_populator(ps_search_cands[i].ps_search_node, ps_mv, pi1_ref_idx[0], 1);

            break;
        }
        default:
        {
            /* NoNoNoNoNooooooooNO! */
            ASSERT(0);
        }
        }

        ASSERT(ps_search_cands[i].ps_search_node->i1_ref_idx >= 0);
        ASSERT(
            !u1_pred_dir
                ? (ps_ctxt->pi4_ref_id_lc_to_l0_map[ps_search_cands[i].ps_search_node->i1_ref_idx] <
                   i4_num_act_ref_l0)
                : (ps_ctxt->pi4_ref_id_lc_to_l1_map[ps_search_cands[i].ps_search_node->i1_ref_idx] <
                   ps_ctxt->i4_num_act_ref_l1));
    }

    return i4_num_srch_cands;
}

void hme_mv_clipper(
    hme_search_prms_t *ps_search_prms_blk,
    S32 i4_num_srch_cands,
    S08 i1_check_for_mult_refs,
    U08 u1_fpel_refine_extent,
    U08 u1_hpel_refine_extent,
    U08 u1_qpel_refine_extent)
{
    S32 candt;
    range_prms_t *ps_range_prms;

    for(candt = 0; candt < i4_num_srch_cands; candt++)
    {
        search_node_t *ps_search_node;

        ps_search_node = ps_search_prms_blk->ps_search_candts[candt].ps_search_node;
        ps_range_prms = ps_search_prms_blk->aps_mv_range[ps_search_node->i1_ref_idx];

        /* Clip the motion vectors as well here since after clipping
        two candidates can become same and they will be removed during deduplication */
        CLIP_MV_WITHIN_RANGE(
            ps_search_node->ps_mv->i2_mvx,
            ps_search_node->ps_mv->i2_mvy,
            ps_range_prms,
            u1_fpel_refine_extent,
            u1_hpel_refine_extent,
            u1_qpel_refine_extent);
    }
}

void hme_init_pred_buf_info(
    hme_pred_buf_info_t (*ps_info)[MAX_NUM_INTER_PARTS],
    hme_pred_buf_mngr_t *ps_buf_mngr,
    U08 u1_pu1_wd,
    U08 u1_pu1_ht,
    PART_TYPE_T e_part_type)
{
    U08 u1_pred_buf_array_id;

    if(1 != ihevce_get_free_pred_buf_indices(
                &u1_pred_buf_array_id, &ps_buf_mngr->u4_pred_buf_usage_indicator, 1))
    {
        ASSERT(0);
    }
    else
    {
        ps_info[0][0].i4_pred_stride = MAX_CU_SIZE;
        ps_info[0][0].pu1_pred = ps_buf_mngr->apu1_pred_bufs[u1_pred_buf_array_id];
        ps_info[0][0].u1_pred_buf_array_id = u1_pred_buf_array_id;

        if(PRT_2Nx2N != e_part_type)
        {
            ps_info[0][1].i4_pred_stride = MAX_CU_SIZE;
            ps_info[0][1].pu1_pred = ps_buf_mngr->apu1_pred_bufs[u1_pred_buf_array_id] +
                                     (gai1_is_part_vertical[ge_part_type_to_part_id[e_part_type][0]]
                                          ? u1_pu1_ht * ps_info[0][1].i4_pred_stride
                                          : u1_pu1_wd);
            ps_info[0][1].u1_pred_buf_array_id = u1_pred_buf_array_id;
        }
    }
}

void hme_debrief_bipred_eval(
    part_type_results_t *ps_part_type_result,
    hme_pred_buf_info_t (*ps_pred_buf_info)[MAX_NUM_INTER_PARTS],
    hme_pred_buf_mngr_t *ps_pred_buf_mngr,
    U08 *pu1_allocated_pred_buf_array_indixes,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list

)
{
    PART_TYPE_T e_part_type = (PART_TYPE_T)ps_part_type_result->u1_part_type;

    U32 *pu4_pred_buf_usage_indicator = &ps_pred_buf_mngr->u4_pred_buf_usage_indicator;
    U08 u1_is_part_vertical = gai1_is_part_vertical[ge_part_type_to_part_id[e_part_type][0]];

    if(0 == ps_part_type_result->u1_part_type)
    {
        if(ps_part_type_result->as_pu_results->pu.b2_pred_mode == PRED_BI)
        {
            ASSERT(UCHAR_MAX != ps_pred_buf_info[2][0].u1_pred_buf_array_id);

            ps_part_type_result->pu1_pred = ps_pred_buf_info[2][0].pu1_pred;
            ps_part_type_result->i4_pred_stride = ps_pred_buf_info[2][0].i4_pred_stride;

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[0]);

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[1]);
        }
        else
        {
            ps_part_type_result->pu1_pred = ps_pred_buf_info[0][0].pu1_pred;
            ps_part_type_result->i4_pred_stride = ps_pred_buf_info[0][0].i4_pred_stride;

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[2]);

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[1]);

            if(UCHAR_MAX == ps_pred_buf_info[0][0].u1_pred_buf_array_id)
            {
                ihevce_set_pred_buf_as_free(
                    pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[0]);
            }
        }
    }
    else
    {
        U08 *pu1_src_pred;
        U08 *pu1_dst_pred;
        S32 i4_src_pred_stride;
        S32 i4_dst_pred_stride;

        U08 u1_pu1_wd = (ps_part_type_result->as_pu_results[0].pu.b4_wd + 1) << 2;
        U08 u1_pu1_ht = (ps_part_type_result->as_pu_results[0].pu.b4_ht + 1) << 2;
        U08 u1_pu2_wd = (ps_part_type_result->as_pu_results[1].pu.b4_wd + 1) << 2;
        U08 u1_pu2_ht = (ps_part_type_result->as_pu_results[1].pu.b4_ht + 1) << 2;

        U08 u1_condition_for_switch =
            (ps_part_type_result->as_pu_results[0].pu.b2_pred_mode == PRED_BI) |
            ((ps_part_type_result->as_pu_results[1].pu.b2_pred_mode == PRED_BI) << 1);

        switch(u1_condition_for_switch)
        {
        case 0:
        {
            ps_part_type_result->pu1_pred =
                ps_pred_buf_mngr->apu1_pred_bufs[pu1_allocated_pred_buf_array_indixes[0]];
            ps_part_type_result->i4_pred_stride = MAX_CU_SIZE;

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[2]);

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[1]);

            if(UCHAR_MAX == ps_pred_buf_info[0][0].u1_pred_buf_array_id)
            {
                pu1_src_pred = ps_pred_buf_info[0][0].pu1_pred;
                pu1_dst_pred = ps_part_type_result->pu1_pred;
                i4_src_pred_stride = ps_pred_buf_info[0][0].i4_pred_stride;
                i4_dst_pred_stride = ps_part_type_result->i4_pred_stride;

                ps_cmn_utils_optimised_function_list->pf_copy_2d(
                    pu1_dst_pred,
                    i4_dst_pred_stride,
                    pu1_src_pred,
                    i4_src_pred_stride,
                    u1_pu1_wd,
                    u1_pu1_ht);
            }

            if(UCHAR_MAX == ps_pred_buf_info[0][1].u1_pred_buf_array_id)
            {
                pu1_src_pred = ps_pred_buf_info[0][1].pu1_pred;
                pu1_dst_pred = ps_part_type_result->pu1_pred +
                               (u1_is_part_vertical
                                    ? u1_pu1_ht * ps_part_type_result->i4_pred_stride
                                    : u1_pu1_wd);
                i4_src_pred_stride = ps_pred_buf_info[0][1].i4_pred_stride;
                i4_dst_pred_stride = ps_part_type_result->i4_pred_stride;

                ps_cmn_utils_optimised_function_list->pf_copy_2d(
                    pu1_dst_pred,
                    i4_dst_pred_stride,
                    pu1_src_pred,
                    i4_src_pred_stride,
                    u1_pu2_wd,
                    u1_pu2_ht);
            }

            break;
        }
        case 1:
        {
            ASSERT(UCHAR_MAX != ps_pred_buf_info[2][0].u1_pred_buf_array_id);

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[1]);

            /* Copy PU1 pred into PU2's pred buf */
            if(((u1_pu1_ht < u1_pu2_ht) || (u1_pu1_wd < u1_pu2_wd)) &&
               (UCHAR_MAX != ps_pred_buf_info[0][1].u1_pred_buf_array_id))
            {
                ps_part_type_result->pu1_pred =
                    ps_pred_buf_info[0][1].pu1_pred -
                    (u1_is_part_vertical ? u1_pu1_ht * ps_pred_buf_info[0][1].i4_pred_stride
                                         : u1_pu1_wd);
                ps_part_type_result->i4_pred_stride = ps_pred_buf_info[0][1].i4_pred_stride;

                ihevce_set_pred_buf_as_free(
                    pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[2]);

                pu1_src_pred = ps_pred_buf_info[2][0].pu1_pred;
                pu1_dst_pred = ps_part_type_result->pu1_pred;
                i4_src_pred_stride = ps_pred_buf_info[2][0].i4_pred_stride;
                i4_dst_pred_stride = ps_part_type_result->i4_pred_stride;

                ps_cmn_utils_optimised_function_list->pf_copy_2d(
                    pu1_dst_pred,
                    i4_dst_pred_stride,
                    pu1_src_pred,
                    i4_src_pred_stride,
                    u1_pu1_wd,
                    u1_pu1_ht);
            }
            else
            {
                ps_part_type_result->pu1_pred = ps_pred_buf_info[2][0].pu1_pred;
                ps_part_type_result->i4_pred_stride = ps_pred_buf_info[2][0].i4_pred_stride;

                ihevce_set_pred_buf_as_free(
                    pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[0]);

                pu1_src_pred = ps_pred_buf_info[0][1].pu1_pred;
                pu1_dst_pred = ps_part_type_result->pu1_pred;
                i4_src_pred_stride = ps_pred_buf_info[0][1].i4_pred_stride;
                i4_dst_pred_stride = ps_part_type_result->i4_pred_stride;

                ps_cmn_utils_optimised_function_list->pf_copy_2d(
                    pu1_dst_pred,
                    i4_dst_pred_stride,
                    pu1_src_pred,
                    i4_src_pred_stride,
                    u1_pu2_wd,
                    u1_pu2_ht);
            }

            break;
        }
        case 2:
        {
            ASSERT(UCHAR_MAX != ps_pred_buf_info[2][1].u1_pred_buf_array_id);

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[1]);

            /* Copy PU2 pred into PU1's pred buf */
            if(((u1_pu1_ht > u1_pu2_ht) || (u1_pu1_wd > u1_pu2_wd)) &&
               (UCHAR_MAX != ps_pred_buf_info[0][0].u1_pred_buf_array_id))
            {
                ps_part_type_result->pu1_pred = ps_pred_buf_info[0][0].pu1_pred;
                ps_part_type_result->i4_pred_stride = ps_pred_buf_info[0][0].i4_pred_stride;

                ihevce_set_pred_buf_as_free(
                    pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[2]);

                pu1_src_pred = ps_pred_buf_info[2][1].pu1_pred;
                pu1_dst_pred = ps_part_type_result->pu1_pred +
                               (u1_is_part_vertical
                                    ? u1_pu1_ht * ps_part_type_result->i4_pred_stride
                                    : u1_pu1_wd);
                i4_src_pred_stride = ps_pred_buf_info[2][1].i4_pred_stride;
                i4_dst_pred_stride = ps_part_type_result->i4_pred_stride;

                ps_cmn_utils_optimised_function_list->pf_copy_2d(
                    pu1_dst_pred,
                    i4_dst_pred_stride,
                    pu1_src_pred,
                    i4_src_pred_stride,
                    u1_pu2_wd,
                    u1_pu2_ht);
            }
            else
            {
                ps_part_type_result->pu1_pred =
                    ps_pred_buf_info[2][1].pu1_pred -
                    (u1_is_part_vertical ? u1_pu1_ht * ps_pred_buf_info[2][1].i4_pred_stride
                                         : u1_pu1_wd);
                ps_part_type_result->i4_pred_stride = ps_pred_buf_info[2][1].i4_pred_stride;

                ihevce_set_pred_buf_as_free(
                    pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[0]);

                pu1_src_pred = ps_pred_buf_info[0][0].pu1_pred;
                pu1_dst_pred = ps_part_type_result->pu1_pred;
                i4_src_pred_stride = ps_pred_buf_info[0][0].i4_pred_stride;
                i4_dst_pred_stride = ps_part_type_result->i4_pred_stride;

                ps_cmn_utils_optimised_function_list->pf_copy_2d(
                    pu1_dst_pred,
                    i4_dst_pred_stride,
                    pu1_src_pred,
                    i4_src_pred_stride,
                    u1_pu1_wd,
                    u1_pu1_ht);
            }

            break;
        }
        case 3:
        {
            ASSERT(UCHAR_MAX != ps_pred_buf_info[2][0].u1_pred_buf_array_id);
            ASSERT(UCHAR_MAX != ps_pred_buf_info[2][1].u1_pred_buf_array_id);
            ASSERT(
                ps_pred_buf_info[2][1].u1_pred_buf_array_id ==
                ps_pred_buf_info[2][0].u1_pred_buf_array_id);

            ps_part_type_result->pu1_pred = ps_pred_buf_info[2][0].pu1_pred;
            ps_part_type_result->i4_pred_stride = ps_pred_buf_info[2][0].i4_pred_stride;

            ihevce_set_pred_buf_as_free(
                pu4_pred_buf_usage_indicator, pu1_allocated_pred_buf_array_indixes[0]);

            break;
        }
        }
    }
}

U08 hme_decide_search_candidate_priority_in_l1_and_l2_me(
    SEARCH_CANDIDATE_TYPE_T e_cand_type, ME_QUALITY_PRESETS_T e_quality_preset)
{
    U08 u1_priority_val =
        gau1_search_cand_priority_in_l1_and_l2_me[e_quality_preset >= ME_MEDIUM_SPEED][e_cand_type];

    if(UCHAR_MAX == u1_priority_val)
    {
        ASSERT(0);
    }

    ASSERT(u1_priority_val <= MAX_INIT_CANDTS);

    return u1_priority_val;
}

U08 hme_decide_search_candidate_priority_in_l0_me(SEARCH_CANDIDATE_TYPE_T e_cand_type, U08 u1_index)
{
    U08 u1_priority_val = gau1_search_cand_priority_in_l0_me[u1_index][e_cand_type];

    if(UCHAR_MAX == u1_priority_val)
    {
        ASSERT(0);
    }

    ASSERT(u1_priority_val <= MAX_INIT_CANDTS);

    return u1_priority_val;
}

void hme_search_cand_data_init(
    S32 *pi4_id_Z,
    S32 *pi4_id_coloc,
    S32 *pi4_num_coloc_cands,
    U08 *pu1_search_candidate_list_index,
    S32 i4_num_act_ref_l0,
    S32 i4_num_act_ref_l1,
    U08 u1_is_bidir_enabled,
    U08 u1_4x4_blk_in_l1me)
{
    S32 i, j;
    S32 i4_num_coloc_cands;

    U08 u1_search_candidate_list_index;

    if(!u1_is_bidir_enabled && !u1_4x4_blk_in_l1me)
    {
        S32 i;

        u1_search_candidate_list_index = (i4_num_act_ref_l0 - 1) * 2;
        i4_num_coloc_cands = i4_num_act_ref_l0 * 2;

        switch(i4_num_act_ref_l0)
        {
        case 1:
        {
            for(i = 0; i < 2; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            break;
        }
        case 2:
        {
            for(i = 0; i < 4; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            break;
        }
        case 3:
        {
            for(i = 0; i < 6; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            break;
        }
        case 4:
        {
            for(i = 0; i < 8; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            break;
        }
        default:
        {
            ASSERT(0);
        }
        }

        *pi4_num_coloc_cands = i4_num_coloc_cands;
        *pu1_search_candidate_list_index = u1_search_candidate_list_index;
    }
    else if(!u1_is_bidir_enabled && u1_4x4_blk_in_l1me)
    {
        S32 i;

        i4_num_coloc_cands = i4_num_act_ref_l0 * 2;
        u1_search_candidate_list_index = (i4_num_act_ref_l0 - 1) * 2 + 1;

        switch(i4_num_act_ref_l0)
        {
        case 1:
        {
            for(i = 0; i < 2; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 1] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 2] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR0, u1_search_candidate_list_index);

            i4_num_coloc_cands += 3;

            break;
        }
        case 2:
        {
            for(i = 0; i < 4; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 1] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 2] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 3] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR1, u1_search_candidate_list_index);

            pi4_id_coloc[i + 4] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL1, u1_search_candidate_list_index);

            pi4_id_coloc[i + 5] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR1, u1_search_candidate_list_index);

            i4_num_coloc_cands += 6;

            break;
        }
        case 3:
        {
            for(i = 0; i < 6; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 1] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 2] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 3] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR1, u1_search_candidate_list_index);

            pi4_id_coloc[i + 4] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL1, u1_search_candidate_list_index);

            pi4_id_coloc[i + 5] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR1, u1_search_candidate_list_index);

            i4_num_coloc_cands += 6;

            break;
        }
        case 4:
        {
            for(i = 0; i < 8; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 1] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 2] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 3] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR1, u1_search_candidate_list_index);

            pi4_id_coloc[i + 4] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL1, u1_search_candidate_list_index);

            pi4_id_coloc[i + 5] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR1, u1_search_candidate_list_index);

            i4_num_coloc_cands += 6;

            break;
        }
        default:
        {
            ASSERT(0);
        }
        }

        *pi4_num_coloc_cands = i4_num_coloc_cands;
        *pu1_search_candidate_list_index = u1_search_candidate_list_index;
    }
    else
    {
        /* The variable 'u1_search_candidate_list_index' is hardcoded */
        /* to 10 and 11 respectively. But, these values are not returned */
        /* by this function since the actual values are dependent on */
        /* the number of refs in L0 and L1 respectively */
        /* Hence, the actual return values are being recomputed */
        /* in the latter part of this block */

        if(!u1_4x4_blk_in_l1me)
        {
            u1_search_candidate_list_index = 10;

            i4_num_coloc_cands = 2 + (2 * ((i4_num_act_ref_l0 > 1) || (i4_num_act_ref_l1 > 1)));

            for(i = 0; i < i4_num_coloc_cands; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }
        }
        else
        {
            u1_search_candidate_list_index = 11;

            i4_num_coloc_cands = 2 + (2 * ((i4_num_act_ref_l0 > 1) || (i4_num_act_ref_l1 > 1)));

            for(i = 0; i < i4_num_coloc_cands; i++)
            {
                pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                    (SEARCH_CANDIDATE_TYPE_T)(PROJECTED_COLOC0 + i),
                    u1_search_candidate_list_index);
            }

            pi4_id_coloc[i] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_TR0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 1] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BL0, u1_search_candidate_list_index);

            pi4_id_coloc[i + 2] = hme_decide_search_candidate_priority_in_l0_me(
                PROJECTED_COLOC_BR0, u1_search_candidate_list_index);
        }

        for(j = 0; j < 2; j++)
        {
            if(0 == j)
            {
                pu1_search_candidate_list_index[j] =
                    8 + ((i4_num_act_ref_l0 > 1) * 2) + u1_4x4_blk_in_l1me;
                pi4_num_coloc_cands[j] =
                    (u1_4x4_blk_in_l1me * 3) + 2 + ((i4_num_act_ref_l0 > 1) * 2);
            }
            else
            {
                pu1_search_candidate_list_index[j] =
                    8 + ((i4_num_act_ref_l1 > 1) * 2) + u1_4x4_blk_in_l1me;
                pi4_num_coloc_cands[j] =
                    (u1_4x4_blk_in_l1me * 3) + 2 + ((i4_num_act_ref_l1 > 1) * 2);
            }
        }
    }

    if(i4_num_act_ref_l0 || i4_num_act_ref_l1)
    {
        pi4_id_Z[0] = hme_decide_search_candidate_priority_in_l0_me(
            (SEARCH_CANDIDATE_TYPE_T)ZERO_MV, pu1_search_candidate_list_index[0]);
    }

    if((i4_num_act_ref_l0 > 1) && !u1_is_bidir_enabled)
    {
        pi4_id_Z[1] = hme_decide_search_candidate_priority_in_l0_me(
            (SEARCH_CANDIDATE_TYPE_T)ZERO_MV_ALTREF, pu1_search_candidate_list_index[0]);
    }
}

static U08
    hme_determine_base_block_size(S32 *pi4_valid_part_array, S32 i4_num_valid_parts, U08 u1_cu_size)
{
    ASSERT(i4_num_valid_parts > 0);

    if(1 == i4_num_valid_parts)
    {
        ASSERT(pi4_valid_part_array[i4_num_valid_parts - 1] == PART_ID_2Nx2N);

        return u1_cu_size;
    }
    else
    {
        if(pi4_valid_part_array[i4_num_valid_parts - 1] <= PART_ID_NxN_BR)
        {
            return u1_cu_size / 2;
        }
        else if(pi4_valid_part_array[i4_num_valid_parts - 1] <= PART_ID_nRx2N_R)
        {
            return u1_cu_size / 4;
        }
    }

    return u1_cu_size / 4;
}

static U32 hme_compute_variance_of_pu_from_base_blocks(
    ULWORD64 *pu8_SigmaX,
    ULWORD64 *pu8_SigmaXSquared,
    U08 u1_cu_size,
    U08 u1_base_block_size,
    S32 i4_part_id)
{
    U08 i, j;
    ULWORD64 u8_final_variance;

    U08 u1_part_dimension_multiplier = (u1_cu_size >> 4);
    S32 i4_part_wd = gai1_part_wd_and_ht[i4_part_id][0] * u1_part_dimension_multiplier;
    S32 i4_part_ht = gai1_part_wd_and_ht[i4_part_id][1] * u1_part_dimension_multiplier;
    U08 u1_num_base_blocks_in_pu_row = i4_part_wd / u1_base_block_size;
    U08 u1_num_base_blocks_in_pu_column = i4_part_ht / u1_base_block_size;
    U08 u1_num_base_blocks_in_cu_row = u1_cu_size / u1_base_block_size;
    U08 u1_num_base_blocks = (u1_num_base_blocks_in_pu_row * u1_num_base_blocks_in_pu_column);
    U32 u4_num_pixels_in_base_block = u1_base_block_size * u1_base_block_size;
    ULWORD64 u8_final_SigmaXSquared = 0;
    ULWORD64 u8_final_SigmaX = 0;

    if(ge_part_id_to_part_type[i4_part_id] != PRT_NxN)
    {
        U08 u1_column_start_index = gau1_part_id_to_part_num[i4_part_id]
                                        ? (gai1_is_part_vertical[i4_part_id]
                                               ? 0
                                               : (u1_cu_size - i4_part_wd) / u1_base_block_size)
                                        : 0;
        U08 u1_row_start_index = gau1_part_id_to_part_num[i4_part_id]
                                     ? (gai1_is_part_vertical[i4_part_id]
                                            ? (u1_cu_size - i4_part_ht) / u1_base_block_size
                                            : 0)
                                     : 0;
        U08 u1_column_end_index = u1_column_start_index + u1_num_base_blocks_in_pu_row;
        U08 u1_row_end_index = u1_row_start_index + u1_num_base_blocks_in_pu_column;

        for(i = u1_row_start_index; i < u1_row_end_index; i++)
        {
            for(j = u1_column_start_index; j < u1_column_end_index; j++)
            {
                u8_final_SigmaXSquared += pu8_SigmaXSquared[j + i * u1_num_base_blocks_in_cu_row];
                u8_final_SigmaX += pu8_SigmaX[j + i * u1_num_base_blocks_in_cu_row];
            }
        }

        u8_final_variance =
            u1_num_base_blocks * u4_num_pixels_in_base_block * u8_final_SigmaXSquared;
        u8_final_variance -= u8_final_SigmaX * u8_final_SigmaX;
        u8_final_variance +=
            ((u1_num_base_blocks * u4_num_pixels_in_base_block) *
             (u1_num_base_blocks * u4_num_pixels_in_base_block) / 2);
        u8_final_variance /= (u1_num_base_blocks * u4_num_pixels_in_base_block) *
                             (u1_num_base_blocks * u4_num_pixels_in_base_block);

        ASSERT(u8_final_variance <= UINT_MAX);
    }
    else
    {
        U08 u1_row_start_index;
        U08 u1_column_start_index;
        U08 u1_row_end_index;
        U08 u1_column_end_index;

        switch(gau1_part_id_to_part_num[i4_part_id])
        {
        case 0:
        {
            u1_row_start_index = 0;
            u1_column_start_index = 0;

            break;
        }
        case 1:
        {
            u1_row_start_index = 0;
            u1_column_start_index = u1_num_base_blocks_in_pu_row;

            break;
        }
        case 2:
        {
            u1_row_start_index = u1_num_base_blocks_in_pu_column;
            u1_column_start_index = 0;

            break;
        }
        case 3:
        {
            u1_row_start_index = u1_num_base_blocks_in_pu_column;
            u1_column_start_index = u1_num_base_blocks_in_pu_row;

            break;
        }
        }

        u1_column_end_index = u1_column_start_index + u1_num_base_blocks_in_pu_row;
        u1_row_end_index = u1_row_start_index + u1_num_base_blocks_in_pu_column;

        for(i = u1_row_start_index; i < u1_row_end_index; i++)
        {
            for(j = u1_column_start_index; j < u1_column_end_index; j++)
            {
                u8_final_SigmaXSquared += pu8_SigmaXSquared[j + i * u1_num_base_blocks_in_cu_row];
                u8_final_SigmaX += pu8_SigmaX[j + i * u1_num_base_blocks_in_cu_row];
            }
        }

        u8_final_variance =
            u1_num_base_blocks * u4_num_pixels_in_base_block * u8_final_SigmaXSquared;
        u8_final_variance -= u8_final_SigmaX * u8_final_SigmaX;
        u8_final_variance +=
            ((u1_num_base_blocks * u4_num_pixels_in_base_block) *
             (u1_num_base_blocks * u4_num_pixels_in_base_block) / 2);
        u8_final_variance /= (u1_num_base_blocks * u4_num_pixels_in_base_block) *
                             (u1_num_base_blocks * u4_num_pixels_in_base_block);

        ASSERT(u8_final_variance <= UINT_MAX);
    }

    return u8_final_variance;
}

void hme_compute_variance_for_all_parts(
    U08 *pu1_data,
    S32 i4_data_stride,
    S32 *pi4_valid_part_array,
    U32 *pu4_variance,
    S32 i4_num_valid_parts,
    U08 u1_cu_size)
{
    ULWORD64 au8_SigmaX[16];
    ULWORD64 au8_SigmaXSquared[16];
    U08 i, j, k, l;
    U08 u1_base_block_size;
    U08 u1_num_base_blocks_in_cu_row;
    U08 u1_num_base_blocks_in_cu_column;

    u1_base_block_size =
        hme_determine_base_block_size(pi4_valid_part_array, i4_num_valid_parts, u1_cu_size);

    u1_num_base_blocks_in_cu_row = u1_num_base_blocks_in_cu_column =
        u1_cu_size / u1_base_block_size;

    ASSERT(u1_num_base_blocks_in_cu_row <= 4);

    for(i = 0; i < u1_num_base_blocks_in_cu_column; i++)
    {
        for(j = 0; j < u1_num_base_blocks_in_cu_row; j++)
        {
            U08 *pu1_buf =
                pu1_data + (u1_base_block_size * j) + (u1_base_block_size * i * i4_data_stride);

            au8_SigmaX[j + i * u1_num_base_blocks_in_cu_row] = 0;
            au8_SigmaXSquared[j + i * u1_num_base_blocks_in_cu_row] = 0;

            for(k = 0; k < u1_base_block_size; k++)
            {
                for(l = 0; l < u1_base_block_size; l++)
                {
                    au8_SigmaX[j + i * u1_num_base_blocks_in_cu_row] +=
                        pu1_buf[l + k * i4_data_stride];
                    au8_SigmaXSquared[j + i * u1_num_base_blocks_in_cu_row] +=
                        pu1_buf[l + k * i4_data_stride] * pu1_buf[l + k * i4_data_stride];
                }
            }
        }
    }

    for(i = 0; i < i4_num_valid_parts; i++)
    {
        pu4_variance[pi4_valid_part_array[i]] = hme_compute_variance_of_pu_from_base_blocks(
            au8_SigmaX, au8_SigmaXSquared, u1_cu_size, u1_base_block_size, pi4_valid_part_array[i]);
    }
}

void hme_compute_final_sigma_of_pu_from_base_blocks(
    U32 *pu4_SigmaX,
    U32 *pu4_SigmaXSquared,
    ULWORD64 *pu8_final_sigmaX,
    ULWORD64 *pu8_final_sigmaX_Squared,
    U08 u1_cu_size,
    U08 u1_base_block_size,
    S32 i4_part_id,
    U08 u1_base_blk_array_stride)
{
    U08 i, j;
    //U08 u1_num_base_blocks_in_cu_row;

    U08 u1_part_dimension_multiplier = (u1_cu_size >> 4);
    S32 i4_part_wd = gai1_part_wd_and_ht[i4_part_id][0] * u1_part_dimension_multiplier;
    S32 i4_part_ht = gai1_part_wd_and_ht[i4_part_id][1] * u1_part_dimension_multiplier;
    U08 u1_num_base_blocks_in_pu_row = i4_part_wd / u1_base_block_size;
    U08 u1_num_base_blocks_in_pu_column = i4_part_ht / u1_base_block_size;
    U16 u2_num_base_blocks = (u1_num_base_blocks_in_pu_row * u1_num_base_blocks_in_pu_column);
    U32 u4_num_pixels_in_base_block = u1_base_block_size * u1_base_block_size;
    U32 u4_N = (u2_num_base_blocks * u4_num_pixels_in_base_block);

    /*if (u1_is_for_src)
    {
    u1_num_base_blocks_in_cu_row = 16;
    }
    else
    {
    u1_num_base_blocks_in_cu_row = u1_cu_size / u1_base_block_size;
    }*/

    pu8_final_sigmaX[i4_part_id] = 0;
    pu8_final_sigmaX_Squared[i4_part_id] = 0;

    if(ge_part_id_to_part_type[i4_part_id] != PRT_NxN)
    {
        U08 u1_column_start_index = gau1_part_id_to_part_num[i4_part_id]
                                        ? (gai1_is_part_vertical[i4_part_id]
                                               ? 0
                                               : (u1_cu_size - i4_part_wd) / u1_base_block_size)
                                        : 0;
        U08 u1_row_start_index = gau1_part_id_to_part_num[i4_part_id]
                                     ? (gai1_is_part_vertical[i4_part_id]
                                            ? (u1_cu_size - i4_part_ht) / u1_base_block_size
                                            : 0)
                                     : 0;
        U08 u1_column_end_index = u1_column_start_index + u1_num_base_blocks_in_pu_row;
        U08 u1_row_end_index = u1_row_start_index + u1_num_base_blocks_in_pu_column;

        for(i = u1_row_start_index; i < u1_row_end_index; i++)
        {
            for(j = u1_column_start_index; j < u1_column_end_index; j++)
            {
                pu8_final_sigmaX_Squared[i4_part_id] +=
                    pu4_SigmaXSquared[j + i * u1_base_blk_array_stride];
                pu8_final_sigmaX[i4_part_id] += pu4_SigmaX[j + i * u1_base_blk_array_stride];
            }
        }
    }
    else
    {
        U08 u1_row_start_index;
        U08 u1_column_start_index;
        U08 u1_row_end_index;
        U08 u1_column_end_index;

        switch(gau1_part_id_to_part_num[i4_part_id])
        {
        case 0:
        {
            u1_row_start_index = 0;
            u1_column_start_index = 0;

            break;
        }
        case 1:
        {
            u1_row_start_index = 0;
            u1_column_start_index = u1_num_base_blocks_in_pu_row;

            break;
        }
        case 2:
        {
            u1_row_start_index = u1_num_base_blocks_in_pu_column;
            u1_column_start_index = 0;

            break;
        }
        case 3:
        {
            u1_row_start_index = u1_num_base_blocks_in_pu_column;
            u1_column_start_index = u1_num_base_blocks_in_pu_row;

            break;
        }
        }

        u1_column_end_index = u1_column_start_index + u1_num_base_blocks_in_pu_row;
        u1_row_end_index = u1_row_start_index + u1_num_base_blocks_in_pu_column;

        for(i = u1_row_start_index; i < u1_row_end_index; i++)
        {
            for(j = u1_column_start_index; j < u1_column_end_index; j++)
            {
                pu8_final_sigmaX_Squared[i4_part_id] +=
                    pu4_SigmaXSquared[j + i * u1_base_blk_array_stride];
                pu8_final_sigmaX[i4_part_id] += pu4_SigmaX[j + i * u1_base_blk_array_stride];
            }
        }
    }

    pu8_final_sigmaX_Squared[i4_part_id] *= u4_N;
}

void hme_compute_stim_injected_distortion_for_all_parts(
    U08 *pu1_pred,
    S32 i4_pred_stride,
    S32 *pi4_valid_part_array,
    ULWORD64 *pu8_src_sigmaX,
    ULWORD64 *pu8_src_sigmaXSquared,
    S32 *pi4_sad_array,
    S32 i4_alpha_stim_multiplier,
    S32 i4_inv_wt,
    S32 i4_inv_wt_shift_val,
    S32 i4_num_valid_parts,
    S32 i4_wpred_log_wdc,
    U08 u1_cu_size)
{
    U32 au4_sigmaX[16], au4_sigmaXSquared[16];
    ULWORD64 au8_final_ref_sigmaX[17], au8_final_ref_sigmaXSquared[17];
    S32 i4_noise_term;
    U16 i2_count;

    ULWORD64 u8_temp_var, u8_temp_var1, u8_pure_dist;
    ULWORD64 u8_ref_X_Square, u8_src_var, u8_ref_var;

    U08 u1_base_block_size;

    WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;

    u1_base_block_size =
        hme_determine_base_block_size(pi4_valid_part_array, i4_num_valid_parts, u1_cu_size);

    ASSERT(u1_cu_size >= 16);

    hme_compute_sigmaX_and_sigmaXSquared(
        pu1_pred,
        i4_pred_stride,
        au4_sigmaX,
        au4_sigmaXSquared,
        u1_base_block_size,
        u1_base_block_size,
        u1_cu_size,
        u1_cu_size,
        1,
        u1_cu_size / u1_base_block_size);

    /* Noise Term Computation */
    for(i2_count = 0; i2_count < i4_num_valid_parts; i2_count++)
    {
        unsigned long u4_shift_val;
        S32 i4_bits_req;
        S32 part_id = pi4_valid_part_array[i2_count];

        if(i4_alpha_stim_multiplier)
        {
            /* Final SigmaX and SigmaX-Squared Calculation */
            hme_compute_final_sigma_of_pu_from_base_blocks(
                au4_sigmaX,
                au4_sigmaXSquared,
                au8_final_ref_sigmaX,
                au8_final_ref_sigmaXSquared,
                u1_cu_size,
                u1_base_block_size,
                part_id,
                (u1_cu_size / u1_base_block_size));

            u8_ref_X_Square = (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
            u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);

            u4_shift_val = ihevce_calc_stim_injected_variance(
                pu8_src_sigmaX,
                pu8_src_sigmaXSquared,
                &u8_src_var,
                i4_inv_wt,
                i4_inv_wt_shift_val,
                i4_wpred_log_wdc,
                part_id);

            u8_ref_var = u8_ref_var >> u4_shift_val;

            GETRANGE64(i4_bits_req, u8_ref_var);

            if(i4_bits_req > 27)
            {
                u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
                u8_src_var = u8_src_var >> (i4_bits_req - 27);
            }

            if(u8_src_var == u8_ref_var)
            {
                u8_temp_var = (1 << STIM_Q_FORMAT);
            }
            else
            {
                u8_temp_var = (u8_src_var * u8_ref_var * (1 << STIM_Q_FORMAT));
                u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
                u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
                u8_temp_var = (u8_temp_var / u8_temp_var1);
                u8_temp_var = (2 * u8_temp_var);
            }

            i4_noise_term = (UWORD32)u8_temp_var;

            ASSERT(i4_noise_term >= 0);

            i4_noise_term *= i4_alpha_stim_multiplier;
        }
        else
        {
            i4_noise_term = 0;
        }

        u8_pure_dist = pi4_sad_array[part_id];
        u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
        u8_pure_dist += (1 << ((i4_q_level)-1));
        pi4_sad_array[part_id] = (UWORD32)(u8_pure_dist >> (i4_q_level));
    }
}

void hme_compute_sigmaX_and_sigmaXSquared(
    U08 *pu1_data,
    S32 i4_buf_stride,
    void *pv_sigmaX,
    void *pv_sigmaXSquared,
    U08 u1_base_blk_wd,
    U08 u1_base_blk_ht,
    U08 u1_blk_wd,
    U08 u1_blk_ht,
    U08 u1_is_sigma_pointer_size_32_bit,
    U08 u1_array_stride)
{
    U08 i, j, k, l;
    U08 u1_num_base_blks_in_row;
    U08 u1_num_base_blks_in_column;

    u1_num_base_blks_in_row = u1_blk_wd / u1_base_blk_wd;
    u1_num_base_blks_in_column = u1_blk_ht / u1_base_blk_ht;

    if(u1_is_sigma_pointer_size_32_bit)
    {
        U32 *sigmaX, *sigmaXSquared;

        sigmaX = (U32 *)pv_sigmaX;
        sigmaXSquared = (U32 *)pv_sigmaXSquared;

        /* Loop to compute the sigma_X and sigma_X_Squared */
        for(i = 0; i < u1_num_base_blks_in_column; i++)
        {
            for(j = 0; j < u1_num_base_blks_in_row; j++)
            {
                U32 u4_sigmaX = 0, u4_sigmaXSquared = 0;
                U08 *pu1_buf =
                    pu1_data + (u1_base_blk_wd * j) + (u1_base_blk_ht * i * i4_buf_stride);

                for(k = 0; k < u1_base_blk_ht; k++)
                {
                    for(l = 0; l < u1_base_blk_wd; l++)
                    {
                        u4_sigmaX += pu1_buf[l + k * i4_buf_stride];
                        u4_sigmaXSquared +=
                            (pu1_buf[l + k * i4_buf_stride] * pu1_buf[l + k * i4_buf_stride]);
                    }
                }

                sigmaX[j + i * u1_array_stride] = u4_sigmaX;
                sigmaXSquared[j + i * u1_array_stride] = u4_sigmaXSquared;
            }
        }
    }
    else
    {
        ULWORD64 *sigmaX, *sigmaXSquared;

        sigmaX = (ULWORD64 *)pv_sigmaX;
        sigmaXSquared = (ULWORD64 *)pv_sigmaXSquared;

        /* Loop to compute the sigma_X and sigma_X_Squared */
        for(i = 0; i < u1_num_base_blks_in_column; i++)
        {
            for(j = 0; j < u1_num_base_blks_in_row; j++)
            {
                ULWORD64 u8_sigmaX = 0, u8_sigmaXSquared = 0;
                U08 *pu1_buf =
                    pu1_data + (u1_base_blk_wd * j) + (u1_base_blk_ht * i * i4_buf_stride);

                for(k = 0; k < u1_base_blk_ht; k++)
                {
                    for(l = 0; l < u1_base_blk_wd; l++)
                    {
                        u8_sigmaX += pu1_buf[l + k * i4_buf_stride];
                        u8_sigmaXSquared +=
                            (pu1_buf[l + k * i4_buf_stride] * pu1_buf[l + k * i4_buf_stride]);
                    }
                }

                u8_sigmaXSquared = u8_sigmaXSquared * u1_blk_wd * u1_blk_ht;

                sigmaX[j + i * u1_array_stride] = u8_sigmaX;
                sigmaXSquared[j + i * u1_array_stride] = u8_sigmaXSquared;
            }
        }
    }
}

#if TEMPORAL_NOISE_DETECT
WORD32 ihevce_16x16block_temporal_noise_detect(
    WORD32 had_block_size,
    WORD32 ctb_width,
    WORD32 ctb_height,
    ihevce_ctb_noise_params *ps_ctb_noise_params,
    fpel_srch_cand_init_data_t *s_proj_srch_cand_init_data,
    hme_search_prms_t *s_search_prms_blk,
    me_frm_ctxt_t *ps_ctxt,
    WORD32 num_pred_dir,
    WORD32 i4_num_act_ref_l0,
    WORD32 i4_num_act_ref_l1,
    WORD32 i4_cu_x_off,
    WORD32 i4_cu_y_off,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    WORD32 input_stride,
    WORD32 index_8x8_block,
    WORD32 num_horz_blocks,
    WORD32 num_8x8_in_ctb_row,
    WORD32 i4_16x16_index)
{
    WORD32 i;
    WORD32 noise_detected;

    UWORD8 *pu1_l0_block;
    UWORD8 *pu1_l1_block;

    WORD32 mean;
    UWORD32 variance_8x8;

    /* to store the mean and variance of each 8*8 block and find the variance of any higher block sizes later on. block */
    WORD16 pi2_residue_16x16[256];
    WORD32 mean_16x16;
    UWORD32 variance_16x16[2];

    /* throw errors in case of un- supported arguments */
    /* assumptions size is 8 or 16 or 32 */
    assert(
        (had_block_size == 8) || (had_block_size == 16) || (had_block_size == 32));  //ihevc_assert

    /* initialize the variables */
    noise_detected = 0;
    variance_8x8 = 0;

    mean = 0;

    {
        i = 0;
        /* get the ref/pred and source using the MV of both directions */
        /* pick the best candidates in each direction */
        /* Colocated cands */
        {
            // steps to be done
            /* pick the candidates */
            /* do motion compoensation using the candidates got from prev step : pick from the offset */
            /* get the ref or the pred from the offset*/
            /* get the source data */
            /* send the pred - source to noise detect */
            /* do noise detect on the residue of source and pred */

            layer_mv_t *ps_layer_mvbank;
            hme_mv_t *ps_mv;

            //S32 i;
            S32 wd_c, ht_c, wd_p, ht_p;
            S32 blksize_p, blk_x, blk_y, i4_offset;
            S08 *pi1_ref_idx;
            fpel_srch_cand_init_data_t *ps_ctxt_2 = s_proj_srch_cand_init_data;
            layer_ctxt_t *ps_curr_layer = ps_ctxt_2->ps_curr_layer;
            layer_ctxt_t *ps_coarse_layer = ps_ctxt_2->ps_coarse_layer;
            err_prms_t s_err_prms;
            S32 i4_blk_wd;
            S32 i4_blk_ht;
            BLK_SIZE_T e_blk_size;
            hme_search_prms_t *ps_search_prms;
            S32 i4_part_mask;
            S32 *pi4_valid_part_ids;

            /* has list of valid partition to search terminated by -1 */
            S32 ai4_valid_part_ids[TOT_NUM_PARTS + 1];

            /*SEARCH_COMPLEXITY_T e_search_complexity = ps_ctxt->e_search_complexity;*/

            S32 i4_pos_x;
            S32 i4_pos_y;
            U08 u1_pred_dir;  // = ps_ctxt_2->u1_pred_dir;
            U08 u1_default_ref_id = 0;  //ps_ctxt_2->u1_default_ref_id;
            S32 i4_inp_off, i4_ref_offset, i4_ref_stride;

            /* The reference is actually an array of ptrs since there are several    */
            /* reference id. So an array gets passed form calling function           */
            U08 **ppu1_ref;

            /* Atributes of input candidates */
            search_node_t as_search_node[2];
            wgt_pred_ctxt_t *ps_wt_inp_prms;

            S32 posx;
            S32 posy;
            S32 i4_num_results_to_proj;
            S32 ai4_sad_grid[9 * TOT_NUM_PARTS];
            S32 i4_inp_stride;

            /* intialize variables */
            /* Width and ht of current and prev layers */
            wd_c = ps_curr_layer->i4_wd;
            ht_c = ps_curr_layer->i4_ht;
            wd_p = ps_coarse_layer->i4_wd;
            ht_p = ps_coarse_layer->i4_ht;

            ps_search_prms = s_search_prms_blk;

            ps_wt_inp_prms = &ps_ctxt->s_wt_pred;
            e_blk_size = ps_search_prms->e_blk_size;
            i4_part_mask = ps_search_prms->i4_part_mask;

            i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
            i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];

            ps_layer_mvbank = ps_coarse_layer->ps_layer_mvbank;
            blksize_p = gau1_blk_size_to_wd_shift[ps_layer_mvbank->e_blk_size];

            /* ASSERT for valid sizes */
            ASSERT((blksize_p == 3) || (blksize_p == 4) || (blksize_p == 5));

            i4_pos_x = i4_cu_x_off;
            i4_pos_y = i4_cu_y_off;
            posx = i4_pos_x + 2;
            posy = i4_pos_y + 2;

            i4_inp_stride = ps_search_prms->i4_inp_stride;
            /* Move to the location of the search blk in inp buffer */
            //i4_inp_off = i4_cu_x_off;
            //i4_inp_off += i4_cu_y_off * i4_inp_stride;
            i4_inp_off = (i4_16x16_index % 4) * 16;
            i4_inp_off += (i4_16x16_index / 4) * 16 * i4_inp_stride;

            /***********pick the candidates**************************************/
            for(u1_pred_dir = 0; u1_pred_dir < num_pred_dir; u1_pred_dir++)
            {
                WORD32 actual_pred_dir = 0;

                if(u1_pred_dir == 0 && i4_num_act_ref_l0 == 0)
                {
                    actual_pred_dir = 1;
                }
                else if(u1_pred_dir == 0 && i4_num_act_ref_l0 != 0)
                {
                    actual_pred_dir = 0;
                }
                else if(u1_pred_dir == 1)
                {
                    actual_pred_dir = 1;
                }

                i4_num_results_to_proj = 1;  // only the best proj

                /* Safety check to avoid uninitialized access across temporal layers */
                posx = CLIP3(posx, 0, (wd_c - blksize_p)); /* block position withing frAME */
                posy = CLIP3(posy, 0, (ht_c - blksize_p));

                /* Project the positions to prev layer */
                blk_x = posx >> blksize_p;
                blk_y = posy >> blksize_p;

                /* Pick up the mvs from the location */
                i4_offset = (blk_x * ps_layer_mvbank->i4_num_mvs_per_blk);
                i4_offset += (ps_layer_mvbank->i4_num_mvs_per_row * blk_y);

                ps_mv = ps_layer_mvbank->ps_mv + i4_offset;
                pi1_ref_idx = ps_layer_mvbank->pi1_ref_idx + i4_offset;

                if(actual_pred_dir == 1)
                {
                    ps_mv += (i4_num_act_ref_l0 * ps_layer_mvbank->i4_num_mvs_per_ref);
                    pi1_ref_idx += (i4_num_act_ref_l0 * ps_layer_mvbank->i4_num_mvs_per_ref);
                }

                {
                    as_search_node[actual_pred_dir].s_mv.i2_mvx = ps_mv[0].i2_mv_x << 1;
                    as_search_node[actual_pred_dir].s_mv.i2_mvy = ps_mv[0].i2_mv_y << 1;
                    as_search_node[actual_pred_dir].i1_ref_idx = pi1_ref_idx[0];

                    if((as_search_node[actual_pred_dir].i1_ref_idx < 0) ||
                       (as_search_node[actual_pred_dir].s_mv.i2_mvx == INTRA_MV))
                    {
                        as_search_node[actual_pred_dir].i1_ref_idx = u1_default_ref_id;
                        as_search_node[actual_pred_dir].s_mv.i2_mvx = 0;
                        as_search_node[actual_pred_dir].s_mv.i2_mvy = 0;
                    }
                }

                /********************************************************************************************/
                {
                    /* declare the variables */
                    //ps_fullpel_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;

                    pi4_valid_part_ids = ai4_valid_part_ids;
                    i4_ref_stride = ps_curr_layer->i4_rec_stride;
                    s_err_prms.i4_inp_stride = i4_inp_stride;
                    s_err_prms.i4_ref_stride = i4_ref_stride;
                    s_err_prms.i4_part_mask = i4_part_mask;
                    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
                    s_err_prms.i4_blk_wd = i4_blk_wd;
                    s_err_prms.i4_blk_ht = i4_blk_ht;
                    s_err_prms.i4_step = 1;
                    s_err_prms.pi4_valid_part_ids = pi4_valid_part_ids;
                    //s_err_prms.i4_num_partitions = ps_fullpel_refine_ctxt->i4_num_valid_parts;

                    /*************************************************************************/
                    /* Depending on flag i4_use_rec, we use either input of previously       */
                    /* encoded pictures or we use recon of previously encoded pictures.      */
                    i4_ref_stride = ps_curr_layer->i4_rec_stride;
                    ppu1_ref = ps_curr_layer->ppu1_list_rec_fxfy;  // pointer to the pred

                    i4_ref_offset = (i4_ref_stride * i4_cu_y_off) + i4_cu_x_off;  //i4_x_off;

                    s_err_prms.pu1_ref =
                        ppu1_ref[as_search_node[actual_pred_dir].i1_ref_idx] + i4_ref_offset;
                    s_err_prms.pu1_ref += as_search_node[actual_pred_dir].s_mv.i2_mvx;
                    s_err_prms.pu1_ref +=
                        as_search_node[actual_pred_dir].s_mv.i2_mvy * i4_ref_stride;

                    /*get the source */
                    s_err_prms.pu1_inp =
                        ps_wt_inp_prms->apu1_wt_inp[as_search_node[actual_pred_dir].i1_ref_idx] +
                        i4_inp_off;  //pu1_src_input + i4_inp_off;//ps_wt_inp_prms->apu1_wt_inp[as_search_node[actual_pred_dir].i1_ref_idx] + i4_inp_off;

                    /* send the pred - source to noise detect */
                    // noise_detect_hme(noise_structure, s_err_prms.pu1_inp, s_err_prms.pu1_ref);
                }
                /* change the l0/l1 blcok pointer names accrodingle */

                /* get memory pointers the input and the reference */
                pu1_l0_block = s_err_prms.pu1_inp;
                pu1_l1_block = s_err_prms.pu1_ref;

                {
                    WORD32 i2, j2;
                    WORD32 dim = 16;
                    UWORD8 *buf1;
                    UWORD8 *buf2;
                    for(i2 = 0; i2 < dim; i2++)
                    {
                        buf1 = pu1_l0_block + i2 * i4_inp_stride;
                        buf2 = pu1_l1_block + i2 * i4_ref_stride;

                        for(j2 = 0; j2 < dim; j2++)
                        {
                            pi2_residue_16x16[i2 * dim + j2] = (WORD16)(buf1[j2] - buf2[j2]);
                        }
                    }

                    ihevce_calc_variance_signed(
                        pi2_residue_16x16, 16, &mean_16x16, &variance_16x16[u1_pred_dir], 16, 16);

                    /* compare the source and residue variance for this block ps_ctb_noise_params->i4_variance_src_16x16 */
                    if(variance_16x16[u1_pred_dir] >
                       ((TEMPORAL_VARIANCE_FACTOR *
                         ps_ctb_noise_params->au4_variance_src_16x16[i4_16x16_index]) >>
                        Q_TEMPORAL_VARIANCE_FACTOR))
                    {
                        /* update noisy block count only if all  best MV in diff directions indicates noise */
                        if(u1_pred_dir == num_pred_dir - 1)
                        {
                            ps_ctb_noise_params->au1_is_8x8Blk_noisy[index_8x8_block] = 1;
                            ps_ctb_noise_params->au1_is_8x8Blk_noisy[index_8x8_block + 1] = 1;
                            ps_ctb_noise_params
                                ->au1_is_8x8Blk_noisy[index_8x8_block + num_8x8_in_ctb_row] = 1;
                            ps_ctb_noise_params
                                ->au1_is_8x8Blk_noisy[index_8x8_block + num_8x8_in_ctb_row + 1] = 1;
                            noise_detected = 1;
                        }
                    }
                    else /* if any one of the direction mv says it as non noise then dont check for the other directions MV , move for next block*/
                    {
                        noise_detected = 0;
                        ps_ctb_noise_params->au1_is_8x8Blk_noisy[index_8x8_block] = 0;
                        ps_ctb_noise_params->au1_is_8x8Blk_noisy[index_8x8_block + 1] = 0;
                        ps_ctb_noise_params
                            ->au1_is_8x8Blk_noisy[index_8x8_block + num_8x8_in_ctb_row] = 0;
                        ps_ctb_noise_params
                            ->au1_is_8x8Blk_noisy[index_8x8_block + num_8x8_in_ctb_row + 1] = 0;
                        break;
                    }
                }  // variance analysis and calculation
            }  // for each direction
        }  // HME code

    }  // for each 16x16 block

    return (noise_detected);
}
#endif

void hme_qpel_interp_avg_1pt(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    S32 i4_buf_id,
    U08 **ppu1_final,
    S32 *pi4_final_stride)
{
    U08 *pu1_src1, *pu1_src2, *pu1_dst;
    qpel_input_buf_cfg_t *ps_inp_cfg;
    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;

    /*************************************************************************/
    /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    /* needed to do the QPEL averaging. The logic to do this is as follows   */
    /* i4_mv_x and i4_mv_y are the motion vectors in QPEL units that are     */
    /* pointing to the pt of interest. Obviously, they are w.r.t. the 0,0    */
    /* pt of th reference blk that is colocated to the inp blk.              */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /* In above diagram, A. B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf  */
    /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf  */
    /* G is hxhy pt in offset 0,0 in hxhy buf                                */
    /* All above offsets are computed w.r.t. motion displaced pt in          */
    /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    /* If we consider pt v to be derived. v has a fractional comp of 3, 3    */
    /* v is avg of H and I. So the table look up of v should give following  */
    /* buf 1 (H) : offset = (1, 0) buf id = 2.                               */
    /* buf 2 (I) : offset = 0 , 1) buf id = 1.                               */
    /* NOTE: For pts that are fxfy/hxfy/fxhy/hxhy, bufid 1 will be -1.       */
    /*************************************************************************/
    i4_mv_x_frac = i4_mv_x & 3;
    i4_mv_y_frac = i4_mv_y & 3;

    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * ps_prms->i4_ref_stride;

    /* Derive the descriptor that has all offset and size info */
    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);

    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * ps_prms->i4_ref_stride);

    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];
    hevc_avg_2d(
        pu1_src1,
        pu1_src2,
        ps_prms->i4_ref_stride,
        ps_prms->i4_ref_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht,
        pu1_dst,
        ps_prms->i4_out_stride);
    ppu1_final[i4_buf_id] = pu1_dst;
    pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
}

void hme_qpel_interp_avg_2pt_vert_with_reuse(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    hme_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
}

void hme_qpel_interp_avg_2pt_horz_with_reuse(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    hme_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
}

void hme_set_mv_limit_using_dvsr_data(
    me_frm_ctxt_t *ps_ctxt,
    layer_ctxt_t *ps_curr_layer,
    range_prms_t *ps_mv_limit,
    S16 *pi2_prev_enc_frm_max_mv_y,
    U08 u1_num_act_ref_pics)
{
    WORD32 ref_ctr;

    /* Only for B/b pic. */
    if(1 == ps_ctxt->s_frm_prms.bidir_enabled)
    {
        WORD16 i2_mv_y_per_poc, i2_max_mv_y;
        WORD32 cur_poc, prev_poc, ref_poc, abs_poc_diff;
        WORD32 prev_poc_count = 0;
        WORD32 i4_p_idx;

        pi2_prev_enc_frm_max_mv_y[0] = 0;

        cur_poc = ps_ctxt->i4_curr_poc;

        i4_p_idx = 0;

        /* Get abs MAX for symmetric search */
        i2_mv_y_per_poc = ps_curr_layer->i2_max_mv_y;
        /* Assuming P to P distance as 4 */
        i2_mv_y_per_poc = (i2_mv_y_per_poc + 2) >> 2;

        for(ref_ctr = 0; ref_ctr < u1_num_act_ref_pics; ref_ctr++)
        {
            /* Get the prev. encoded frame POC */
            prev_poc = ps_ctxt->i4_prev_poc;

            ref_poc = ps_ctxt->ai4_ref_idx_to_poc_lc[ref_ctr];
            abs_poc_diff = ABS((cur_poc - ref_poc));
            /* Get the cur. max MV based on POC distance */
            i2_max_mv_y = i2_mv_y_per_poc * abs_poc_diff;
            i2_max_mv_y = MIN(i2_max_mv_y, ps_curr_layer->i2_max_mv_y);

            ps_mv_limit[ref_ctr].i2_min_x = -ps_curr_layer->i2_max_mv_x;
            ps_mv_limit[ref_ctr].i2_min_y = -i2_max_mv_y;
            ps_mv_limit[ref_ctr].i2_max_x = ps_curr_layer->i2_max_mv_x;
            ps_mv_limit[ref_ctr].i2_max_y = i2_max_mv_y;

            /* Find the MAX MV for the prev. encoded frame to optimize */
            /* the reverse dependency of ME on Enc.Loop                */
            if(ref_poc == prev_poc)
            {
                /* TO DO : Same thing for horz. search also */
                pi2_prev_enc_frm_max_mv_y[0] = i2_max_mv_y;
                prev_poc_count++;
            }
        }
    }
    else
    {
        ASSERT(0 == ps_ctxt->s_frm_prms.u1_num_active_ref_l1);

        /* Set the Config. File Params for P pic. */
        for(ref_ctr = 0; ref_ctr < ps_ctxt->s_frm_prms.u1_num_active_ref_l0; ref_ctr++)
        {
            ps_mv_limit[ref_ctr].i2_min_x = -ps_curr_layer->i2_max_mv_x;
            ps_mv_limit[ref_ctr].i2_min_y = -ps_curr_layer->i2_max_mv_y;
            ps_mv_limit[ref_ctr].i2_max_x = ps_curr_layer->i2_max_mv_x;
            ps_mv_limit[ref_ctr].i2_max_y = ps_curr_layer->i2_max_mv_y;
        }

        /* For P PIC., go with  Config. File Params */
        pi2_prev_enc_frm_max_mv_y[0] = ps_curr_layer->i2_max_mv_y;
    }
}

S32 hme_part_mask_populator(
    U08 *pu1_inp,
    S32 i4_inp_stride,
    U08 u1_limit_active_partitions,
    U08 u1_is_bPic,
    U08 u1_is_refPic,
    U08 u1_blk_8x8_mask,
    ME_QUALITY_PRESETS_T e_me_quality_preset)
{
    if(15 != u1_blk_8x8_mask)
    {
        return ENABLE_NxN;
    }
    else
    {
        U08 u1_call_inp_segmentation_based_part_mask_populator =
            (ME_XTREME_SPEED_25 != e_me_quality_preset) ||
            (!u1_is_bPic && !DISABLE_8X8CUS_IN_PPICS_IN_P6) ||
            (u1_is_bPic && u1_is_refPic && !DISABLE_8X8CUS_IN_REFBPICS_IN_P6) ||
            (u1_is_bPic && !u1_is_refPic && !DISABLE_8X8CUS_IN_NREFBPICS_IN_P6);

        if(u1_call_inp_segmentation_based_part_mask_populator)
        {
            S32 i4_part_mask =
                hme_study_input_segmentation(pu1_inp, i4_inp_stride, u1_limit_active_partitions);

            if(e_me_quality_preset == ME_XTREME_SPEED)
            {
                i4_part_mask &= ~ENABLE_AMP;
            }

            if(e_me_quality_preset == ME_XTREME_SPEED_25)
            {
                i4_part_mask &= ~ENABLE_AMP;

                i4_part_mask &= ~ENABLE_SMP;
            }

            return i4_part_mask;
        }
        else
        {
            return ENABLE_2Nx2N;
        }
    }
}