C++程序  |  1681行  |  71.16 KB

/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/

/**
******************************************************************************
* @file hme_coarse.c
*
* @brief
*    Contains ME algorithm for the coarse layer.
*
* @author
*    Ittiam
*
*
* List of Functions
* hme_update_mv_bank_coarse()
* hme_coarse()
******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <stdarg.h>
#include <math.h>
#include <limits.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevce_api.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevc_defs.h"
#include "ihevc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_cabac_tables.h"

#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_multi_thrd_funcs.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_had_satd.h"
#include "ihevce_error_codes.h"
#include "ihevce_bitstream.h"
#include "ihevce_cabac.h"
#include "ihevce_rdoq_macros.h"
#include "ihevce_function_selector.h"
#include "ihevce_enc_structs.h"
#include "ihevce_entropy_structs.h"
#include "ihevce_cmn_utils_instr_set_router.h"
#include "ihevce_enc_loop_structs.h"
#include "ihevce_bs_compute_ctb.h"
#include "ihevce_global_tables.h"
#include "ihevce_dep_mngr_interface.h"
#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "hme_defs.h"
#include "ihevce_me_instr_set_router.h"
#include "hme_globals.h"
#include "hme_utils.h"
#include "hme_coarse.h"
#include "hme_refine.h"
#include "hme_err_compute.h"
#include "hme_common_utils.h"
#include "hme_search_algo.h"

/*******************************************************************************
*                             MACROS
*******************************************************************************/
#define COPY_SEARCH_RESULT(ps_mv, pi1_ref_idx, ps_search_node, shift)                              \
    {                                                                                              \
        ps_mv->i2_mv_x = ps_search_node->s_mv.i2_mvx >> (shift);                                   \
        ps_mv->i2_mv_y = ps_search_node->s_mv.i2_mvy >> (shift);                                   \
        *pi1_ref_idx = ps_search_node->i1_ref_idx;                                                 \
    }

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
********************************************************************************
*  @fn     void hme_update_mv_bank_coarse(search_results_t *ps_search_results,
*                                   layer_mv_t *ps_layer_mv,
*                                   S32 i4_blk_x,
*                                   S32 i4_blk_y,
*                                   search_node_t *ps_search_node_4x8_l,
*                                   search_node_t *ps_search_node_8x4_t,
*                                   S08 i1_ref_idx,
*                                   mvbank_update_prms_t *ps_prms
*
*  @brief  Updates the coarse layer MV Bank for a given ref id and blk pos
*
*  @param[in]  ps_search_results: Search results data structure
*
*  @param[in, out]  ps_layer_mv : MV Bank for this layer
*
*  @param[in]  i4_search_blk_x: column number of the 4x4 blk searched
*
*  @param[in]  i4_search_blk_y: row number of the 4x4 blk searched
*
*  @param[in]  ps_search_node_4x8_t: Best MV of the 4x8T blk
*
*  @param[in]  ps_search_node_8x4_l: Best MV of the 8x4L blk
*
*  @param[in]  i1_ref_idx : Reference ID that has been searched
*
*  @param[in]  ps_prms : Parameters pertaining to the MV Bank update
*
*  @return None
********************************************************************************
*/
void hme_update_mv_bank_coarse(
    search_results_t *ps_search_results,
    layer_mv_t *ps_layer_mv,
    S32 i4_search_blk_x,
    S32 i4_search_blk_y,
    search_node_t *ps_search_node_4x8_t,
    search_node_t *ps_search_node_8x4_l,
    S08 i1_ref_idx,
    mvbank_update_prms_t *ps_prms)
{
    /* These point to the MV and ref idx posn to be udpated */
    hme_mv_t *ps_mv;
    S08 *pi1_ref_idx;

    /* Offset within the bank */
    S32 i4_offset;

    S32 i, j, i4_blk_x, i4_blk_y;

    /* Best results for 8x4R and 4x8B blocks */
    search_node_t *ps_search_node_8x4_r, *ps_search_node_4x8_b;

    /* Number of MVs in a block */
    S32 num_mvs = ps_layer_mv->i4_num_mvs_per_ref;

    search_node_t *aps_search_nodes[4];

    /* The search blk may be different in size from the blk used to hold MV */
    i4_blk_x = i4_search_blk_x << ps_prms->i4_shift;
    i4_blk_y = i4_search_blk_y << ps_prms->i4_shift;

    /* Compute the offset in the MV bank */
    i4_offset = i4_blk_x + i4_blk_y * ps_layer_mv->i4_num_blks_per_row;
    i4_offset *= ps_layer_mv->i4_num_mvs_per_blk;

    /* Identify the correct offset in the mvbank and the reference id buf */
    ps_mv = ps_layer_mv->ps_mv + (i4_offset + (num_mvs * i1_ref_idx));
    pi1_ref_idx = ps_layer_mv->pi1_ref_idx + (i4_offset + (num_mvs * i1_ref_idx));

    /*************************************************************************/
    /* We have atleast 4 distinct results: the 4x8 top (coming from top blk) */
    /* 8x4 left (coming from left blk), 8x4 and 4x8 right and bot resp.      */
    /* If number of results to be stored is 4, then we store all these 4     */
    /* results, else we pick best ones                                       */
    /*************************************************************************/
    ps_search_node_8x4_r = ps_search_results->aps_part_results[i1_ref_idx][PART_ID_2NxN_B];
    ps_search_node_4x8_b = ps_search_results->aps_part_results[i1_ref_idx][PART_ID_Nx2N_R];

    ASSERT(num_mvs <= 4);

    /* Doing this to sort best results */
    aps_search_nodes[0] = ps_search_node_8x4_r;
    aps_search_nodes[1] = ps_search_node_4x8_b;
    aps_search_nodes[2] = ps_search_node_8x4_l;
    aps_search_nodes[3] = ps_search_node_4x8_t;
    if(num_mvs == 4)
    {
        COPY_SEARCH_RESULT(ps_mv, pi1_ref_idx, aps_search_nodes[0], 0);
        ps_mv++;
        pi1_ref_idx++;
        COPY_SEARCH_RESULT(ps_mv, pi1_ref_idx, aps_search_nodes[1], 0);
        ps_mv++;
        pi1_ref_idx++;
        COPY_SEARCH_RESULT(ps_mv, pi1_ref_idx, aps_search_nodes[2], 0);
        ps_mv++;
        pi1_ref_idx++;
        COPY_SEARCH_RESULT(ps_mv, pi1_ref_idx, aps_search_nodes[3], 0);
        ps_mv++;
        pi1_ref_idx++;
        return;
    }

    /* Run through the results, store them in best to worst order */
    for(i = 0; i < num_mvs; i++)
    {
        for(j = i + 1; j < 4; j++)
        {
            if(aps_search_nodes[j]->i4_tot_cost < aps_search_nodes[i]->i4_tot_cost)
            {
                SWAP_HME(aps_search_nodes[j], aps_search_nodes[i], search_node_t *);
            }
        }
        COPY_SEARCH_RESULT(ps_mv, pi1_ref_idx, aps_search_nodes[i], 0);
        ps_mv++;
        pi1_ref_idx++;
    }
}

/**
********************************************************************************
*  @fn     void hme_coarse_frm_init(me_ctxt_t *ps_ctxt, coarse_prms_t *ps_coarse_prms)
*
*  @brief  Frame init entry point Coarse ME.
*
*  @param[in,out]  ps_ctxt: ME Handle
*
*  @param[in]  ps_coarse_prms : Coarse layer config params
*
*  @return None
********************************************************************************
*/
void hme_coarse_frm_init(coarse_me_ctxt_t *ps_ctxt, coarse_prms_t *ps_coarse_prms)
{
    layer_ctxt_t *ps_curr_layer;

    S32 i4_pic_wd, i4_pic_ht;

    S32 num_blks_in_pic, num_blks_in_row;

    BLK_SIZE_T e_search_blk_size = BLK_4x4;

    S32 blk_size_shift = 2, blk_wd = 4, blk_ht = 4;

    /* Number of references to search */
    S32 i4_num_ref;

    ps_curr_layer = ps_ctxt->ps_curr_descr->aps_layers[ps_coarse_prms->i4_layer_id];
    i4_num_ref = ps_coarse_prms->i4_num_ref;

    i4_pic_wd = ps_curr_layer->i4_wd;
    i4_pic_ht = ps_curr_layer->i4_ht;
    /* Macro updates num_blks_in_pic and num_blks_in_row*/
    GET_NUM_BLKS_IN_PIC(i4_pic_wd, i4_pic_ht, blk_size_shift, num_blks_in_row, num_blks_in_pic);

    /************************************************************************/
    /* Initialize the mv bank that holds results of this layer.             */
    /************************************************************************/
    hme_init_mv_bank(
        ps_curr_layer,
        BLK_4x4,
        i4_num_ref,
        ps_coarse_prms->num_results,
        ps_ctxt->u1_encode[ps_coarse_prms->i4_layer_id]);

    return;
}

/**
********************************************************************************
*  @fn    void hme_derive_worst_case_search_range(range_prms_t *ps_range,
*                                   range_prms_t *ps_pic_limit,
*                                   range_prms_t *ps_mv_limit,
*                                   S32 i4_x,
*                                   S32 i4_y,
*                                   S32 blk_wd,
*                                   S32 blk_ht)
*
*  @brief  given picture limits and blk dimensions and mv search limits, obtains
*          teh valid search range such that the blk stays within pic boundaries,
*          where picture boundaries include padded portions of picture
*
*  @param[out] ps_range: updated with actual search range
*
*  @param[in] ps_pic_limit : picture boundaries
*
*  @param[in] ps_mv_limit: Search range limits for the mvs
*
*  @param[in] i4_x : x coordinate of the blk
*
*  @param[in] i4_y : y coordinate of the blk
*
*  @param[in] blk_wd : blk width
*
*  @param[in] blk_ht : blk height
*
*  @return void
********************************************************************************
*/
void hme_derive_worst_case_search_range(
    range_prms_t *ps_range,
    range_prms_t *ps_pic_limit,
    range_prms_t *ps_mv_limit,
    S32 i4_x,
    S32 i4_y,
    S32 blk_wd,
    S32 blk_ht)
{
    /* Taking max x of left block, min x of current block */
    ps_range->i2_max_x =
        MIN((ps_pic_limit->i2_max_x - (S16)blk_wd - (S16)(i4_x - 4)), ps_mv_limit->i2_max_x);
    ps_range->i2_min_x = MAX((ps_pic_limit->i2_min_x - (S16)i4_x), ps_mv_limit->i2_min_x);
    /* Taking max y of top block, min y of current block */
    ps_range->i2_max_y =
        MIN((ps_pic_limit->i2_max_y - (S16)blk_ht - (S16)(i4_y - 4)), ps_mv_limit->i2_max_y);
    ps_range->i2_min_y = MAX((ps_pic_limit->i2_min_y - (S16)i4_y), ps_mv_limit->i2_min_y);
}

/**
********************************************************************************
* @fn void hme_combine_4x4_sads_and_compute_cost(S08 i1_ref_idx,
*                                           range_prms_t *ps_mv_range,
*                                           range_prms_t *ps_mv_limit,
*                                           hme_mv_t *ps_best_mv_4x8,
*                                           hme_mv_t *ps_best_mv_8x4,
*                                           pred_ctxt_t *ps_pred_ctxt,
*                                           PF_MV_COST_FXN pf_mv_cost_compute,
*                                           ME_QUALITY_PRESETS_T e_me_quality_preset,
*                                           S16 *pi2_sads_4x4_current,
*                                           S16 *pi2_sads_4x4_east,
*                                           S16 *pi2_sads_4x4_south,
*                                           FILE *fp_dump_sad)
*
*  @brief  Does a full search on entire srch window with a given step size in coarse layer
*
*  @param[in] i1_ref_idx : Cur ref idx
*
*  @param[in] ps_layer_ctxt: All info about this layer
*
*  @param[out] ps_best_mv  : type hme_mv_t contains best mv x and y
*
*  @param[in] ps_pred_ctxt : Prediction ctxt for cost computation
*
*  @param[in] pf_mv_cost_compute : mv cost computation function
*
*  @return void
********************************************************************************
*/
void hme_combine_4x4_sads_and_compute_cost_high_quality(
    S08 i1_ref_idx,
    range_prms_t *ps_mv_range,
    range_prms_t *ps_mv_limit,
    hme_mv_t *ps_best_mv_4x8,
    hme_mv_t *ps_best_mv_8x4,
    pred_ctxt_t *ps_pred_ctxt,
    PF_MV_COST_FXN pf_mv_cost_compute,
    S16 *pi2_sads_4x4_current,
    S16 *pi2_sads_4x4_east,
    S16 *pi2_sads_4x4_south)
{
    /* These control number of parts and number of pts in grid to search */
    S32 stepy, stepx, best_mv_y_4x8, best_mv_x_4x8, best_mv_y_8x4, best_mv_x_8x4;
    S32 step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    S32 min_cost_4x8 = MAX_32BIT_VAL;
    S32 min_cost_8x4 = MAX_32BIT_VAL;

    search_node_t s_search_node;
    s_search_node.i1_ref_idx = i1_ref_idx;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_QUALITY */
    step_shift_x = step_shift_y = 1;

    mv_x_offset = (-ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = (-ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    /* Run 2loops to sweep over the reference area */
    for(mvy = ps_mv_range->i2_min_y; mvy < ps_mv_range->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_mv_range->i2_min_x; mvx < ps_mv_range->i2_max_x; mvx += stepx)
        {
            S32 sad_4x8, cost_4x8, sad_8x4, cost_8x4;
            S32 sad_pos = ((mvx >> step_shift_x) + mv_x_offset) +
                          ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range;

            /* Get SAD by adding SAD for current and neighbour S  */
            sad_4x8 = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_south[sad_pos];
            sad_8x4 = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_east[sad_pos];

            //          fprintf(fp_dump_sad,"%d\t",sad);
            s_search_node.s_mv.i2_mvx = mvx;
            s_search_node.s_mv.i2_mvy = mvy;

            cost_4x8 = cost_8x4 =
                pf_mv_cost_compute(&s_search_node, ps_pred_ctxt, PART_ID_2Nx2N, MV_RES_FPEL);

            cost_4x8 += sad_4x8;
            cost_8x4 += sad_8x4;

            if(cost_4x8 < min_cost_4x8)
            {
                best_mv_x_4x8 = mvx;
                best_mv_y_4x8 = mvy;
                min_cost_4x8 = cost_4x8;
            }
            if(cost_8x4 < min_cost_8x4)
            {
                best_mv_x_8x4 = mvx;
                best_mv_y_8x4 = mvy;
                min_cost_8x4 = cost_8x4;
            }
        }
    }

    ps_best_mv_4x8->i2_mv_x = best_mv_x_4x8;
    ps_best_mv_4x8->i2_mv_y = best_mv_y_4x8;

    ps_best_mv_8x4->i2_mv_x = best_mv_x_8x4;
    ps_best_mv_8x4->i2_mv_y = best_mv_y_8x4;
}

void hme_combine_4x4_sads_and_compute_cost_high_speed(
    S08 i1_ref_idx,
    range_prms_t *ps_mv_range,
    range_prms_t *ps_mv_limit,
    hme_mv_t *ps_best_mv_4x8,
    hme_mv_t *ps_best_mv_8x4,
    pred_ctxt_t *ps_pred_ctxt,
    PF_MV_COST_FXN pf_mv_cost_compute,
    S16 *pi2_sads_4x4_current,
    S16 *pi2_sads_4x4_east,
    S16 *pi2_sads_4x4_south)
{
    /* These control number of parts and number of pts in grid to search */
    S32 stepy, stepx, best_mv_y_4x8, best_mv_x_4x8, best_mv_y_8x4, best_mv_x_8x4;
    S32 step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    S32 rnd, lambda, lambda_q_shift;

    S32 min_cost_4x8 = MAX_32BIT_VAL;
    S32 min_cost_8x4 = MAX_32BIT_VAL;

    (void)pf_mv_cost_compute;
    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_SPEED */
    step_shift_x = step_shift_y = 2;

    mv_x_offset = (-ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = (-ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    lambda = ps_pred_ctxt->lambda;
    lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
    rnd = 1 << (lambda_q_shift - 1);

    ASSERT(MAX_MVX_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_x));
    ASSERT(MAX_MVY_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_y));

    /* Run 2loops to sweep over the reference area */
    for(mvy = ps_mv_range->i2_min_y; mvy < ps_mv_range->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_mv_range->i2_min_x; mvx < ps_mv_range->i2_max_x; mvx += stepx)
        {
            S32 sad_4x8, cost_4x8, sad_8x4, cost_8x4;

            S32 sad_pos = ((mvx >> step_shift_x) + mv_x_offset) +
                          ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range;

            /* Get SAD by adding SAD for current and neighbour S  */
            sad_4x8 = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_south[sad_pos];
            sad_8x4 = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_east[sad_pos];

            //          fprintf(fp_dump_sad,"%d\t",sad);

            cost_4x8 = cost_8x4 =
                (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i1_ref_idx;

            cost_4x8 += (mvx != 0) ? 1 : 0;
            cost_4x8 += (mvy != 0) ? 1 : 0;
            cost_4x8 = (cost_4x8 * lambda + rnd) >> lambda_q_shift;

            cost_8x4 += (mvx != 0) ? 1 : 0;
            cost_8x4 += (mvy != 0) ? 1 : 0;
            cost_8x4 = (cost_8x4 * lambda + rnd) >> lambda_q_shift;

            cost_4x8 += sad_4x8;
            cost_8x4 += sad_8x4;

            if(cost_4x8 < min_cost_4x8)
            {
                best_mv_x_4x8 = mvx;
                best_mv_y_4x8 = mvy;
                min_cost_4x8 = cost_4x8;
            }
            if(cost_8x4 < min_cost_8x4)
            {
                best_mv_x_8x4 = mvx;
                best_mv_y_8x4 = mvy;
                min_cost_8x4 = cost_8x4;
            }
        }
    }

    ps_best_mv_4x8->i2_mv_x = best_mv_x_4x8;
    ps_best_mv_4x8->i2_mv_y = best_mv_y_4x8;

    ps_best_mv_8x4->i2_mv_x = best_mv_x_8x4;
    ps_best_mv_8x4->i2_mv_y = best_mv_y_8x4;
}

/**
********************************************************************************
*  @fn     hme_store_4x4_sads(hme_search_prms_t *ps_search_prms,
*                               layer_ctxt_t *ps_layer_ctxt)
*
*  @brief  Does a 4x4 sad computation on a given range and stores it in memory
*
*  @param[in] ps_search_prms : Search prms structure containing info like
*               blk dimensions, search range etc
*
*  @param[in] ps_layer_ctxt: All info about this layer
*
*  @param[in] ps_wt_inp_prms: All info about weighted input
*
*  @param[in] e_me_quality_preset: motion estimation quality preset
*
*  @param[in] pi2_sads_4x4: Memory to store all 4x4 SADs for given range
*
*  @return void
********************************************************************************
*/

void hme_store_4x4_sads_high_quality(
    hme_search_prms_t *ps_search_prms,
    layer_ctxt_t *ps_layer_ctxt,
    range_prms_t *ps_mv_limit,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S16 *pi2_sads_4x4)
{
    S32 sad, i, j;

    /* Input and reference attributes */
    U08 *pu1_inp, *pu1_inp_orig, *pu1_ref;
    S32 i4_inp_stride, i4_ref_stride, i4_ref_offset;

    /* The reference is actually an array of ptrs since there are several    */
    /* reference id. So an array gets passed form calling function           */
    U08 **ppu1_ref, *pu1_ref_coloc;

    S32 stepy, stepx, step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    /* Points to the range limits for mv */
    range_prms_t *ps_range_prms;

    /* Reference index to be searched */
    S32 i4_search_idx = ps_search_prms->i1_ref_idx;
    /* Using the member 0 to store for all ref. idx. */
    ps_range_prms = ps_search_prms->aps_mv_range[0];
    pu1_inp_orig = ps_wt_inp_prms->apu1_wt_inp[i4_search_idx];
    i4_inp_stride = ps_search_prms->i4_inp_stride;

    /* Move to the location of the search blk in inp buffer */
    pu1_inp_orig += ps_search_prms->i4_cu_x_off;
    pu1_inp_orig += ps_search_prms->i4_cu_y_off * i4_inp_stride;

    /*************************************************************************/
    /* we use either input of previously encoded pictures as reference       */
    /* in coarse layer                                                       */
    /*************************************************************************/
    i4_ref_stride = ps_layer_ctxt->i4_inp_stride;
    ppu1_ref = ps_layer_ctxt->ppu1_list_inp;

    /* colocated position in reference picture */
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    pu1_ref_coloc = ppu1_ref[i4_search_idx] + i4_ref_offset;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_QUALITY */
    step_shift_x = step_shift_y = 1;

    mv_x_offset = -(ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = -(ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    /* Run 2loops to sweep over the reference area */
    for(mvy = ps_range_prms->i2_min_y; mvy < ps_range_prms->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_range_prms->i2_min_x; mvx < ps_range_prms->i2_max_x; mvx += stepx)
        {
            /* Set up the reference and inp ptr */
            pu1_ref = pu1_ref_coloc + mvx + (mvy * i4_ref_stride);
            pu1_inp = pu1_inp_orig;
            /* SAD computation */
            {
                sad = 0;
                for(i = 0; i < 4; i++)
                {
                    for(j = 0; j < 4; j++)
                    {
                        sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
                    }
                    pu1_inp += i4_inp_stride;
                    pu1_ref += i4_ref_stride;
                }
            }

            pi2_sads_4x4
                [((mvx >> step_shift_x) + mv_x_offset) +
                 ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range] = sad;
        }
    }
}

void hme_store_4x4_sads_high_speed(
    hme_search_prms_t *ps_search_prms,
    layer_ctxt_t *ps_layer_ctxt,
    range_prms_t *ps_mv_limit,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S16 *pi2_sads_4x4)
{
    S32 sad, i, j;

    /* Input and reference attributes */
    U08 *pu1_inp, *pu1_inp_orig, *pu1_ref;
    S32 i4_inp_stride, i4_ref_stride, i4_ref_offset;

    /* The reference is actually an array of ptrs since there are several    */
    /* reference id. So an array gets passed form calling function           */
    U08 **ppu1_ref, *pu1_ref_coloc;

    S32 stepy, stepx, step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    /* Points to the range limits for mv */
    range_prms_t *ps_range_prms;

    /* Reference index to be searched */
    S32 i4_search_idx = ps_search_prms->i1_ref_idx;

    /* Using the member 0 for all ref. idx */
    ps_range_prms = ps_search_prms->aps_mv_range[0];
    pu1_inp_orig = ps_wt_inp_prms->apu1_wt_inp[i4_search_idx];
    i4_inp_stride = ps_search_prms->i4_inp_stride;

    /* Move to the location of the search blk in inp buffer */
    pu1_inp_orig += ps_search_prms->i4_cu_x_off;
    pu1_inp_orig += ps_search_prms->i4_cu_y_off * i4_inp_stride;

    /*************************************************************************/
    /* we use either input of previously encoded pictures as reference       */
    /* in coarse layer                                                       */
    /*************************************************************************/
    i4_ref_stride = ps_layer_ctxt->i4_inp_stride;
    ppu1_ref = ps_layer_ctxt->ppu1_list_inp;

    /* colocated position in reference picture */
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    pu1_ref_coloc = ppu1_ref[i4_search_idx] + i4_ref_offset;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_SPEED */
    step_shift_x = step_shift_y = 2;

    mv_x_offset = -(ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = -(ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    /* Run 2loops to sweep over the reference area */
    for(mvy = ps_range_prms->i2_min_y; mvy < ps_range_prms->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_range_prms->i2_min_x; mvx < ps_range_prms->i2_max_x; mvx += stepx)
        {
            /* Set up the reference and inp ptr */
            pu1_ref = pu1_ref_coloc + mvx + (mvy * i4_ref_stride);
            pu1_inp = pu1_inp_orig;
            /* SAD computation */
            {
                sad = 0;
                for(i = 0; i < 4; i++)
                {
                    for(j = 0; j < 4; j++)
                    {
                        sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
                    }
                    pu1_inp += i4_inp_stride;
                    pu1_ref += i4_ref_stride;
                }
            }

            pi2_sads_4x4
                [((mvx >> step_shift_x) + mv_x_offset) +
                 ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range] = sad;
        }
    }
}
/**
********************************************************************************
*  @fn     void hme_coarsest(me_ctxt_t *ps_ctxt, coarse_prms_t *ps_coarse_prms)
*
*  @brief  Top level entry point for Coarse ME. Runs across blks and searches
*          at a 4x4 blk granularity by using 4x8 and 8x4 patterns.
*
*  @param[in,out]  ps_ctxt: ME Handle
*
*  @param[in]  ps_coarse_prms : Coarse layer config params
*
*  @param[in]  ps_multi_thrd_ctxt : Multi thread context
*
*  @return None
********************************************************************************
*/
void hme_coarsest(
    coarse_me_ctxt_t *ps_ctxt,
    coarse_prms_t *ps_coarse_prms,
    multi_thrd_ctxt_t *ps_multi_thrd_ctxt,
    WORD32 i4_ping_pong,
    void **ppv_dep_mngr_hme_sync)
{
    S16 *pi2_cur_ref_sads_4x4;
    S32 ai4_sad_4x4_block_size[MAX_NUM_REF], ai4_sad_4x4_block_stride[MAX_NUM_REF];
    S32 num_rows_coarse;
    S32 sad_top_offset, sad_current_offset;
    S32 search_node_top_offset, search_node_left_offset;

    ME_QUALITY_PRESETS_T e_me_quality_preset =
        ps_ctxt->s_init_prms.s_me_coding_tools.e_me_quality_presets;

    search_results_t *ps_search_results;
    mvbank_update_prms_t s_mv_update_prms;
    BLK_SIZE_T e_search_blk_size = BLK_4x4;
    hme_search_prms_t s_search_prms_4x8, s_search_prms_8x4, s_search_prms_4x4;

    S32 global_id_8x4, global_id_4x8;

    /*************************************************************************/
    /* These directly point to the best search result nodes that will be     */
    /* updated by the search algorithm, rather than have to go through an    */
    /* elaborate structure                                                   */
    /*************************************************************************/
    search_node_t *aps_best_search_node_8x4[MAX_NUM_REF];
    search_node_t *aps_best_search_node_4x8[MAX_NUM_REF];

    /* These point to various spatial candts */
    search_node_t *ps_candt_8x4_l, *ps_candt_8x4_t, *ps_candt_8x4_tl;
    search_node_t *ps_candt_4x8_l, *ps_candt_4x8_t, *ps_candt_4x8_tl;
    search_node_t *ps_candt_zeromv_8x4, *ps_candt_zeromv_4x8;
    search_node_t *ps_candt_fs_8x4, *ps_candt_fs_4x8;
    search_node_t as_top_neighbours[4], as_left_neighbours[3];

    /* Holds the global mv for a given ref index */
    search_node_t s_candt_global[MAX_NUM_REF];

    /* All the search candidates */
    search_candt_t as_search_candts_8x4[MAX_INIT_CANDTS];
    search_candt_t as_search_candts_4x8[MAX_INIT_CANDTS];
    search_candt_t *ps_search_candts_8x4, *ps_search_candts_4x8;

    /* Actual range per blk and the pic level boundaries */
    range_prms_t s_range_prms, s_pic_limit, as_mv_limit[MAX_NUM_REF];

    /* Current and prev pic layer ctxt at the coarsest layer */
    layer_ctxt_t *ps_curr_layer, *ps_prev_layer;

    /* best mv of full search */
    hme_mv_t best_mv_4x8, best_mv_8x4;

    /* Book keeping at blk level */
    S32 blk_x, num_blks_in_pic, num_blks_in_row, num_4x4_blks_in_row;

    S32 blk_y;

    /* Block dimensions */
    S32 blk_size_shift = 2, blk_wd = 4, blk_ht = 4;

    S32 lambda = ps_coarse_prms->lambda;

    /* Number of references to search */
    S32 i4_num_ref;

    S32 i4_i, id, i;
    S08 i1_ref_idx;

    S32 i4_pic_wd, i4_pic_ht;
    S32 i4_layer_id;

    S32 end_of_frame;

    pf_get_wt_inp fp_get_wt_inp;

    /* Maximum search iterations around any candidate */
    S32 i4_max_iters = ps_coarse_prms->i4_max_iters;

    ps_curr_layer = ps_ctxt->ps_curr_descr->aps_layers[ps_coarse_prms->i4_layer_id];
    ps_prev_layer = hme_coarse_get_past_layer_ctxt(ps_ctxt, ps_coarse_prms->i4_layer_id);

    /* We need only one instance of search results structure */
    ps_search_results = &ps_ctxt->s_search_results_8x8;

    ps_search_candts_8x4 = &as_search_candts_8x4[0];
    ps_search_candts_4x8 = &as_search_candts_4x8[0];

    end_of_frame = 0;

    i4_pic_wd = ps_curr_layer->i4_wd;
    i4_pic_ht = ps_curr_layer->i4_ht;

    fp_get_wt_inp = ((ihevce_me_optimised_function_list_t *)ps_ctxt->pv_me_optimised_function_list)
                        ->pf_get_wt_inp_8x8;

    num_rows_coarse = ps_ctxt->i4_num_row_bufs;

    /*************************************************************************/
    /* Coarse Layer always does explicit search. Number of reference frames  */
    /* to search is a configurable parameter supplied by the application     */
    /*************************************************************************/
    i4_num_ref = ps_coarse_prms->i4_num_ref;
    i4_layer_id = ps_coarse_prms->i4_layer_id;

    /*************************************************************************/
    /*  The search algorithm goes as follows:                                */
    /*                                                                       */
    /*          ___                                                          */
    /*         | e |                                                         */
    /*      ___|___|___                                                      */
    /*     | c | a | b |                                                     */
    /*     |___|___|___|                                                     */
    /*         | d |                                                         */
    /*         |___|                                                         */
    /*                                                                       */
    /* For the target block a, we collect best results from 2 8x4 blks       */
    /* These are c-a and a-b. The 4x8 blks are e-a and a-d                   */
    /* c-a result is already available from results of blk c. a-b is         */
    /* evaluated in this blk. Likewise e-a result is stored in a row buffer  */
    /* a-d is evaluated this blk                                             */
    /* So we store a row buffer which stores best 4x8 results of all top blk */
    /*************************************************************************/

    /************************************************************************/
    /* Initialize the pointers to the best node.                            */
    /************************************************************************/
    for(i4_i = 0; i4_i < i4_num_ref; i4_i++)
    {
        aps_best_search_node_8x4[i4_i] = ps_search_results->aps_part_results[i4_i][PART_ID_2NxN_B];
        aps_best_search_node_4x8[i4_i] = ps_search_results->aps_part_results[i4_i][PART_ID_Nx2N_R];
    }

    /************************************************************************/
    /* Initialize the "searchresults" structure. This will set up the number*/
    /* of search types, result updates etc                                  */
    /************************************************************************/
    {
        S32 num_results_per_part;
        /* We evaluate 4 types of results per 4x4 blk. 8x4L and 8x4R and     */
        /* 4x8 T and 4x8B. So if we are to give 4 results, then we need to   */
        /* only evaluate 1 result per part. In the coarse layer, we are      */
        /* limited to 2 results max per part, and max of 8 results.          */
        num_results_per_part = (ps_coarse_prms->num_results + 3) >> 2;
        hme_init_search_results(
            ps_search_results,
            i4_num_ref,
            ps_coarse_prms->num_results,
            num_results_per_part,
            BLK_8x8,
            0,
            0,
            ps_ctxt->au1_is_past);
    }

    /* Macro updates num_blks_in_pic and num_blks_in_row*/
    GET_NUM_BLKS_IN_PIC(i4_pic_wd, i4_pic_ht, blk_size_shift, num_blks_in_row, num_blks_in_pic);

    num_4x4_blks_in_row = num_blks_in_row + 1;

    s_mv_update_prms.e_search_blk_size = e_search_blk_size;
    s_mv_update_prms.i4_num_ref = i4_num_ref;
    s_mv_update_prms.i4_shift = 0;

    /* For full search, support 2 or 4 step size */
    if(ps_coarse_prms->do_full_search)
    {
        ASSERT((ps_coarse_prms->full_search_step == 2) || (ps_coarse_prms->full_search_step == 4));
    }

    for(i4_i = 0; i4_i < i4_num_ref; i4_i++)
    {
        S32 blk, delta_poc;
        S32 mv_x_clip, mv_y_clip;
        /* Initialize only the first row */
        for(blk = 0; blk < num_blks_in_row; blk++)
        {
            INIT_SEARCH_NODE(&ps_ctxt->aps_best_search_nodes_4x8_n_rows[i4_i][blk], i4_i);
        }

        delta_poc = ABS(ps_curr_layer->i4_poc - ps_curr_layer->ai4_ref_id_to_poc_lc[i4_i]);

        /* Setting search range for different references based on the delta poc */
        /*************************************************************************/
        /* set the MV limit per ref. pic.                                        */
        /*    - P pic. : Based on the config params.                             */
        /*    - B/b pic: Based on the Max/Min MV from prev. P and config. param. */
        /*************************************************************************/
        {
            /* TO DO : Remove hard coding of P-P dist. of 4 */
            mv_x_clip = (ps_curr_layer->i2_max_mv_x * delta_poc) / 4;

            /* Only for B/b pic. */
            if(1 == ps_ctxt->s_frm_prms.bidir_enabled)
            {
                WORD16 i2_mv_y_per_poc;

                /* Get abs MAX for symmetric search */
                i2_mv_y_per_poc =
                    MAX(ps_ctxt->s_coarse_dyn_range_prms.i2_dyn_max_y_per_poc[i4_layer_id],
                        (ABS(ps_ctxt->s_coarse_dyn_range_prms.i2_dyn_min_y_per_poc[i4_layer_id])));

                mv_y_clip = i2_mv_y_per_poc * delta_poc;
            }
            /* Set the Config. File Params for P pic. */
            else
            {
                /* TO DO : Remove hard coding of P-P dist. of 4 */
                mv_y_clip = (ps_curr_layer->i2_max_mv_y * delta_poc) / 4;
            }

            /* Making mv_x and mv_y range multiple of 4 */
            mv_x_clip = (((mv_x_clip + 3) >> 2) << 2);
            mv_y_clip = (((mv_y_clip + 3) >> 2) << 2);
            /* Clipping the range of mv_x and mv_y */
            mv_x_clip = CLIP3(mv_x_clip, 4, MAX_MVX_SUPPORTED_IN_COARSE_LAYER);
            mv_y_clip = CLIP3(mv_y_clip, 4, MAX_MVY_SUPPORTED_IN_COARSE_LAYER);

            as_mv_limit[i4_i].i2_min_x = -mv_x_clip;
            as_mv_limit[i4_i].i2_min_y = -mv_y_clip;
            as_mv_limit[i4_i].i2_max_x = mv_x_clip;
            as_mv_limit[i4_i].i2_max_y = mv_y_clip;
        }
        /*Populating SAD block size based on search range */
        ai4_sad_4x4_block_size[i4_i] = ((2 * mv_x_clip) / ps_coarse_prms->full_search_step) *
                                       ((2 * mv_y_clip) / ps_coarse_prms->full_search_step);
        ai4_sad_4x4_block_stride[i4_i] = (num_blks_in_row + 1) * ai4_sad_4x4_block_size[i4_i];
    }

    for(i = 0; i < 2 * MAX_INIT_CANDTS; i++)
    {
        search_node_t *ps_search_node;
        ps_search_node = &ps_ctxt->s_init_search_node[i];
        INIT_SEARCH_NODE(ps_search_node, 0);
    }
    for(i = 0; i < 3; i++)
    {
        search_node_t *ps_search_node;
        ps_search_node = &as_left_neighbours[i];
        INIT_SEARCH_NODE(ps_search_node, 0);
        ps_search_node = &as_top_neighbours[i];
        INIT_SEARCH_NODE(ps_search_node, 0);
    }
    INIT_SEARCH_NODE(&as_top_neighbours[3], 0);
    /* Set up place holders to hold the search nodes of each initial candt */
    for(i = 0; i < MAX_INIT_CANDTS; i++)
    {
        ps_search_candts_8x4[i].ps_search_node = &ps_ctxt->s_init_search_node[i];

        ps_search_candts_4x8[i].ps_search_node = &ps_ctxt->s_init_search_node[MAX_INIT_CANDTS + i];

        ps_search_candts_8x4[i].u1_num_steps_refine = (U08)i4_max_iters;
        ps_search_candts_4x8[i].u1_num_steps_refine = (U08)i4_max_iters;
    }

    /* For Top,TopLeft and Left cand., no need for refinement */
    id = 0;
    if((ps_coarse_prms->do_full_search) && (ME_XTREME_SPEED_25 == e_me_quality_preset))
    {
        /* This search candt has the full search result */
        ps_candt_fs_8x4 = ps_search_candts_8x4[id].ps_search_node;
        id++;
    }

    ps_candt_8x4_l = ps_search_candts_8x4[id].ps_search_node;
    ps_search_candts_8x4[id].u1_num_steps_refine = 0;
    id++;
    ps_candt_8x4_t = ps_search_candts_8x4[id].ps_search_node;
    ps_search_candts_8x4[id].u1_num_steps_refine = 0;
    id++;
    ps_candt_8x4_tl = ps_search_candts_8x4[id].ps_search_node;
    ps_search_candts_8x4[id].u1_num_steps_refine = 0;
    id++;
    /* This search candt stores the global candt */
    global_id_8x4 = id;
    id++;

    if((ps_coarse_prms->do_full_search) && (ME_XTREME_SPEED_25 != e_me_quality_preset))
    {
        /* This search candt has the full search result */
        ps_candt_fs_8x4 = ps_search_candts_8x4[id].ps_search_node;
        id++;
    }
    /* Don't increment id as (0,0) is removed from cand. list. Initializing */
    /* the pointer for hme_init_pred_ctxt_no_encode()                       */
    ps_candt_zeromv_8x4 = ps_search_candts_8x4[id].ps_search_node;

    /* For Top,TopLeft and Left cand., no need for refinement */
    id = 0;
    if((ps_coarse_prms->do_full_search) && (ME_XTREME_SPEED_25 == e_me_quality_preset))
    {
        /* This search candt has the full search result */
        ps_candt_fs_4x8 = ps_search_candts_4x8[id].ps_search_node;
        id++;
    }

    ps_candt_4x8_l = ps_search_candts_4x8[id].ps_search_node;
    ps_search_candts_4x8[id].u1_num_steps_refine = 0;
    id++;
    ps_candt_4x8_t = ps_search_candts_4x8[id].ps_search_node;
    ps_search_candts_4x8[id].u1_num_steps_refine = 0;
    id++;
    ps_candt_4x8_tl = ps_search_candts_4x8[id].ps_search_node;
    ps_search_candts_4x8[id].u1_num_steps_refine = 0;
    id++;
    /* This search candt stores the global candt */
    global_id_4x8 = id;
    id++;
    if((ps_coarse_prms->do_full_search) && (ME_XTREME_SPEED_25 != e_me_quality_preset))
    {
        /* This search candt has the full search result */
        ps_candt_fs_4x8 = ps_search_candts_4x8[id].ps_search_node;
        id++;
    }
    /* Don't increment id4as (0,0) is removed from cand. list. Initializing */
    /* the pointer for hme_init_pred_ctxt_no_encode()                       */
    ps_candt_zeromv_4x8 = ps_search_candts_4x8[id].ps_search_node;

    /* Zero mv always has 0 mvx and y componnent, ref idx initialized inside */
    ps_candt_zeromv_8x4->s_mv.i2_mvx = 0;
    ps_candt_zeromv_8x4->s_mv.i2_mvy = 0;
    ps_candt_zeromv_4x8->s_mv.i2_mvx = 0;
    ps_candt_zeromv_4x8->s_mv.i2_mvy = 0;

    /* SET UP THE PRED CTXT FOR L0 AND L1 */
    {
        S32 pred_lx;

        /* Bottom left always not available */
        as_left_neighbours[2].u1_is_avail = 0;

        for(pred_lx = 0; pred_lx < 2; pred_lx++)
        {
            pred_ctxt_t *ps_pred_ctxt;

            ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
            hme_init_pred_ctxt_no_encode(
                ps_pred_ctxt,
                ps_search_results,
                as_top_neighbours,
                as_left_neighbours,
                NULL,
                ps_candt_zeromv_8x4,
                ps_candt_zeromv_8x4,
                pred_lx,
                lambda,
                ps_coarse_prms->lambda_q_shift,
                ps_ctxt->apu1_ref_bits_tlu_lc,
                ps_ctxt->ai2_ref_scf);
        }
    }

    /*************************************************************************/
    /* Initialize the search parameters for search algo with the following   */
    /* parameters: No SATD, calculated number of initial candidates,         */
    /* No post refinement, initial step size and number of iterations as     */
    /* passed by the calling function.                                       */
    /* Also, we use input for this layer search, and not recon.              */
    /*************************************************************************/
    if(e_me_quality_preset == ME_XTREME_SPEED_25)
        s_search_prms_8x4.i4_num_init_candts = 1;
    else
        s_search_prms_8x4.i4_num_init_candts = id;
    s_search_prms_8x4.i4_use_satd = 0;
    s_search_prms_8x4.i4_start_step = ps_coarse_prms->i4_start_step;
    s_search_prms_8x4.i4_num_steps_post_refine = 0;
    s_search_prms_8x4.i4_use_rec = 0;
    s_search_prms_8x4.ps_search_candts = ps_search_candts_8x4;
    s_search_prms_8x4.e_blk_size = BLK_8x4;
    s_search_prms_8x4.i4_max_iters = ps_coarse_prms->i4_max_iters;
    /* Coarse layer is always explicit */
    if(ME_MEDIUM_SPEED > e_me_quality_preset)
    {
        s_search_prms_8x4.pf_mv_cost_compute = compute_mv_cost_coarse;
    }
    else
    {
        s_search_prms_8x4.pf_mv_cost_compute = compute_mv_cost_coarse_high_speed;
    }

    s_search_prms_8x4.i4_inp_stride = 8;
    s_search_prms_8x4.i4_cu_x_off = s_search_prms_8x4.i4_cu_y_off = 0;
    if(ps_coarse_prms->do_full_search)
        s_search_prms_8x4.i4_max_iters = 1;
    s_search_prms_8x4.i4_part_mask = (1 << PART_ID_2NxN_B);
    /* Using the member 0 to store for all ref. idx. */
    s_search_prms_8x4.aps_mv_range[0] = &s_range_prms;
    s_search_prms_8x4.ps_search_results = ps_search_results;
    s_search_prms_8x4.full_search_step = ps_coarse_prms->full_search_step;

    s_search_prms_4x8 = s_search_prms_8x4;
    s_search_prms_4x8.ps_search_candts = ps_search_candts_4x8;
    s_search_prms_4x8.e_blk_size = BLK_4x8;
    s_search_prms_4x8.i4_part_mask = (1 << PART_ID_Nx2N_R);

    s_search_prms_4x4 = s_search_prms_8x4;
    /* Since s_search_prms_4x4 is used only to computer sad at 4x4 level, search candidate is not used */
    s_search_prms_4x4.ps_search_candts = ps_search_candts_4x8;
    s_search_prms_4x4.e_blk_size = BLK_4x4;
    s_search_prms_4x4.i4_part_mask = (1 << PART_ID_2Nx2N);
    /*************************************************************************/
    /* Picture limit on all 4 sides. This will be used to set mv limits for  */
    /* every block given its coordinate.                                     */
    /*************************************************************************/
    SET_PIC_LIMIT(
        s_pic_limit,
        ps_curr_layer->i4_pad_x_inp,
        ps_curr_layer->i4_pad_y_inp,
        ps_curr_layer->i4_wd,
        ps_curr_layer->i4_ht,
        s_search_prms_4x4.i4_num_steps_post_refine);

    /* Pick the global mv from previous reference */
    for(i1_ref_idx = 0; i1_ref_idx < i4_num_ref; i1_ref_idx++)
    {
        if(ME_XTREME_SPEED_25 != e_me_quality_preset)
        {
            /* Distance of current pic from reference */
            S32 i4_delta_poc;

            hme_mv_t s_mv;
            i4_delta_poc = ps_curr_layer->i4_poc - ps_curr_layer->ai4_ref_id_to_poc_lc[i1_ref_idx];

            hme_get_global_mv(ps_prev_layer, &s_mv, i4_delta_poc);

            s_candt_global[i1_ref_idx].s_mv.i2_mvx = s_mv.i2_mv_x;
            s_candt_global[i1_ref_idx].s_mv.i2_mvy = s_mv.i2_mv_y;
            s_candt_global[i1_ref_idx].i1_ref_idx = i1_ref_idx;

            /*********************************************************************/
            /* Initialize the histogram for each reference index in current      */
            /* layer ctxt                                                        */
            /*********************************************************************/
            hme_init_histogram(
                ps_ctxt->aps_mv_hist[i1_ref_idx],
                (S32)as_mv_limit[i1_ref_idx].i2_max_x,
                (S32)as_mv_limit[i1_ref_idx].i2_max_y);
        }

        /*********************************************************************/
        /* Initialize the dyn. search range params. for each reference index */
        /* in current layer ctxt                                             */
        /*********************************************************************/
        /* Only for P pic. For P, both are 0, I&B has them mut. exclusive */
        if(ps_ctxt->s_frm_prms.is_i_pic == ps_ctxt->s_frm_prms.bidir_enabled)
        {
            INIT_DYN_SEARCH_PRMS(
                &ps_ctxt->s_coarse_dyn_range_prms.as_dyn_range_prms[i4_layer_id][i1_ref_idx],
                ps_curr_layer->ai4_ref_id_to_poc_lc[i1_ref_idx]);
        }
    }

    /*************************************************************************/
    /* if exhaustive algorithmm then we use only 1 candt 0, 0                */
    /* else we use a lot of causal and non causal candts                     */
    /* finally set number to the configured number of candts                 */
    /*************************************************************************/

    /* Loop in raster order over each 4x4 blk in a given row till end of frame */
    while(0 == end_of_frame)
    {
        job_queue_t *ps_job;
        void *pv_hme_dep_mngr;
        WORD32 offset_val, check_dep_pos, set_dep_pos;

        /* Get the current layer HME Dep Mngr       */
        /* Note : Use layer_id - 1 in HME layers    */
        pv_hme_dep_mngr = ppv_dep_mngr_hme_sync[ps_coarse_prms->i4_layer_id - 1];

        /* Get the current row from the job queue */
        ps_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
            ps_multi_thrd_ctxt, ps_multi_thrd_ctxt->i4_me_coarsest_lyr_type, 1, i4_ping_pong);

        /* If all rows are done, set the end of process flag to 1, */
        /* and the current row to -1 */
        if(NULL == ps_job)
        {
            blk_y = -1;
            end_of_frame = 1;
        }
        else
        {
            ASSERT(ps_multi_thrd_ctxt->i4_me_coarsest_lyr_type == ps_job->i4_pre_enc_task_type);

            /* Obtain the current row's details from the job */
            blk_y = ps_job->s_job_info.s_me_job_info.i4_vert_unit_row_no;

            if(1 == ps_ctxt->s_frm_prms.is_i_pic)
            {
                /* set the output dependency of current row */
                ihevce_pre_enc_grp_job_set_out_dep(ps_multi_thrd_ctxt, ps_job, i4_ping_pong);
                continue;
            }

            /* Set Variables for Dep. Checking and Setting */
            set_dep_pos = blk_y + 1;
            if(blk_y > 0)
            {
                offset_val = 2;
                check_dep_pos = blk_y - 1;
            }
            else
            {
                /* First row should run without waiting */
                offset_val = -1;
                check_dep_pos = 0;
            }

            /* Loop over all the blocks in current row */
            /* One block extra, since the last block in a row needs East block */
            for(blk_x = 0; blk_x < (num_blks_in_row + 1); blk_x++)
            {
                /* Wait till top row block is processed   */
                /* Currently checking till top right block*/
                if(blk_x < (num_blks_in_row))
                {
                    ihevce_dmgr_chk_row_row_sync(
                        pv_hme_dep_mngr,
                        blk_x,
                        offset_val,
                        check_dep_pos,
                        0, /* Col Tile No. : Not supported in PreEnc*/
                        ps_ctxt->thrd_id);
                }

                /***************************************************************/
                /* Get Weighted input for all references                       */
                /***************************************************************/
                fp_get_wt_inp(
                    ps_curr_layer,
                    &ps_ctxt->s_wt_pred,
                    1 << (blk_size_shift + 1),
                    blk_x << blk_size_shift,
                    (blk_y - 1) << blk_size_shift,
                    1 << (blk_size_shift + 1),
                    i4_num_ref,
                    ps_ctxt->i4_wt_pred_enable_flag);

                /* RESET ALL SEARCH RESULTS FOR THE NEW BLK */
                hme_reset_search_results(
                    ps_search_results,
                    s_search_prms_8x4.i4_part_mask | s_search_prms_4x8.i4_part_mask,
                    MV_RES_FPEL);

                /* Compute the search node offsets */
                /* MAX is used to clip when left and top neighbours are not availbale at coarse boundaries  */
                search_node_top_offset =
                    blk_x + ps_ctxt->ai4_row_index[MAX((blk_y - 2), 0)] * num_blks_in_row;
                search_node_left_offset =
                    MAX((blk_x - 1), 0) +
                    ps_ctxt->ai4_row_index[MAX((blk_y - 1), 0)] * num_blks_in_row;

                /* Input offset: wrt CU start. Offset for South block */
                s_search_prms_4x4.i4_cu_x_off = 0;
                s_search_prms_4x4.i4_cu_y_off = 4;
                s_search_prms_4x4.i4_inp_stride = 8;
                s_search_prms_4x4.i4_x_off = blk_x << blk_size_shift;
                s_search_prms_4x4.i4_y_off = blk_y << blk_size_shift;

                s_search_prms_4x8.i4_x_off = s_search_prms_8x4.i4_x_off = blk_x << blk_size_shift;
                s_search_prms_4x8.i4_y_off = s_search_prms_8x4.i4_y_off = (blk_y - 1)
                                                                          << blk_size_shift;

                /* This layer will always use explicit ME */
                /* Loop across different Ref IDx */
                for(i1_ref_idx = 0; i1_ref_idx < i4_num_ref; i1_ref_idx++)
                {
                    sad_top_offset = (blk_x * ai4_sad_4x4_block_size[i1_ref_idx]) +
                                     ps_ctxt->ai4_row_index[MAX((blk_y - 1), 0)] *
                                         ai4_sad_4x4_block_stride[i1_ref_idx];
                    sad_current_offset =
                        (blk_x * ai4_sad_4x4_block_size[i1_ref_idx]) +
                        ps_ctxt->ai4_row_index[blk_y] * ai4_sad_4x4_block_stride[i1_ref_idx];

                    /* Initialize search node if blk_x == 0, as it doesn't have left neighbours */
                    if(0 == blk_x)
                        INIT_SEARCH_NODE(
                            &ps_ctxt->aps_best_search_nodes_8x4_n_rows[i1_ref_idx][blk_x],
                            i1_ref_idx);

                    pi2_cur_ref_sads_4x4 = ps_ctxt->api2_sads_4x4_n_rows[i1_ref_idx];

                    /* Initialize changing params here */
                    s_search_prms_8x4.i1_ref_idx = i1_ref_idx;
                    s_search_prms_4x8.i1_ref_idx = i1_ref_idx;
                    s_search_prms_4x4.i1_ref_idx = i1_ref_idx;

                    if(num_blks_in_row == blk_x)
                    {
                        S16 *pi2_sads_4x4_current;
                        /* Since the current 4x4 block will be a padded region, which may not match with any of the reference  */
                        pi2_sads_4x4_current = pi2_cur_ref_sads_4x4 + sad_current_offset;

                        memset(pi2_sads_4x4_current, 0, ai4_sad_4x4_block_size[i1_ref_idx]);
                    }

                    /* SAD to be computed and stored for the 4x4 block in 1st row and the last block of all rows*/
                    if((0 == blk_y) || (num_blks_in_row == blk_x))
                    {
                        S16 *pi2_sads_4x4_current;
                        /* Computer 4x4 SADs for current block */
                        /* Pointer to store SADs */
                        pi2_sads_4x4_current = pi2_cur_ref_sads_4x4 + sad_current_offset;

                        hme_derive_worst_case_search_range(
                            &s_range_prms,
                            &s_pic_limit,
                            &as_mv_limit[i1_ref_idx],
                            blk_x << blk_size_shift,
                            blk_y << blk_size_shift,
                            blk_wd,
                            blk_ht);

                        if(ME_PRISTINE_QUALITY >= e_me_quality_preset)
                        {
                            ((ihevce_me_optimised_function_list_t *)
                                 ps_ctxt->pv_me_optimised_function_list)
                                ->pf_store_4x4_sads_high_quality(
                                    &s_search_prms_4x4,
                                    ps_curr_layer,
                                    &as_mv_limit[i1_ref_idx],
                                    &ps_ctxt->s_wt_pred,
                                    pi2_sads_4x4_current);
                        }
                        else
                        {
                            ((ihevce_me_optimised_function_list_t *)
                                 ps_ctxt->pv_me_optimised_function_list)
                                ->pf_store_4x4_sads_high_speed(
                                    &s_search_prms_4x4,
                                    ps_curr_layer,
                                    &as_mv_limit[i1_ref_idx],
                                    &ps_ctxt->s_wt_pred,
                                    pi2_sads_4x4_current);
                        }
                    }
                    else
                    {
                        /* For the zero mv candt, the ref idx to be modified */
                        ps_candt_zeromv_8x4->i1_ref_idx = i1_ref_idx;
                        ps_candt_zeromv_4x8->i1_ref_idx = i1_ref_idx;

                        if(ME_XTREME_SPEED_25 != e_me_quality_preset)
                        {
                            /* For the global mvs alone, the search node points to a local variable */
                            ps_search_candts_8x4[global_id_8x4].ps_search_node =
                                &s_candt_global[i1_ref_idx];
                            ps_search_candts_4x8[global_id_4x8].ps_search_node =
                                &s_candt_global[i1_ref_idx];
                        }

                        hme_get_spatial_candt(
                            ps_curr_layer,
                            BLK_4x4,
                            blk_x,
                            blk_y - 1,
                            i1_ref_idx,
                            as_top_neighbours,
                            as_left_neighbours,
                            0,
                            1,
                            0,
                            0);
                        /* set up the various candts */
                        *ps_candt_4x8_l = as_left_neighbours[0];
                        *ps_candt_4x8_t = as_top_neighbours[1];
                        *ps_candt_4x8_tl = as_top_neighbours[0];
                        *ps_candt_8x4_l = *ps_candt_4x8_l;
                        *ps_candt_8x4_tl = *ps_candt_4x8_tl;
                        *ps_candt_8x4_t = *ps_candt_4x8_t;

                        {
                            S32 pred_lx;
                            S16 *pi2_sads_4x4_current, *pi2_sads_4x4_top;
                            pred_ctxt_t *ps_pred_ctxt;
                            PF_MV_COST_FXN pf_mv_cost_compute;

                            /* Computer 4x4 SADs for current block */
                            /* Pointer to store SADs */
                            pi2_sads_4x4_current = pi2_cur_ref_sads_4x4 + sad_current_offset;

                            hme_derive_worst_case_search_range(
                                &s_range_prms,
                                &s_pic_limit,
                                &as_mv_limit[i1_ref_idx],
                                blk_x << blk_size_shift,
                                blk_y << blk_size_shift,
                                blk_wd,
                                blk_ht);
                            if(i4_pic_ht == blk_y)
                            {
                                memset(pi2_sads_4x4_current, 0, ai4_sad_4x4_block_size[i1_ref_idx]);
                            }
                            else
                            {
                                if(ME_PRISTINE_QUALITY >= e_me_quality_preset)
                                {
                                    ((ihevce_me_optimised_function_list_t *)
                                         ps_ctxt->pv_me_optimised_function_list)
                                        ->pf_store_4x4_sads_high_quality(
                                            &s_search_prms_4x4,
                                            ps_curr_layer,
                                            &as_mv_limit[i1_ref_idx],
                                            &ps_ctxt->s_wt_pred,
                                            pi2_sads_4x4_current);
                                }
                                else
                                {
                                    ((ihevce_me_optimised_function_list_t *)
                                         ps_ctxt->pv_me_optimised_function_list)
                                        ->pf_store_4x4_sads_high_speed(
                                            &s_search_prms_4x4,
                                            ps_curr_layer,
                                            &as_mv_limit[i1_ref_idx],
                                            &ps_ctxt->s_wt_pred,
                                            pi2_sads_4x4_current);
                                }
                            }
                            /* Set pred direction to L0 or L1 */
                            pred_lx = 1 - ps_search_results->pu1_is_past[i1_ref_idx];

                            /* Suitable context (L0 or L1) */
                            ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];

                            /* Coarse layer is always explicit */
                            if(ME_PRISTINE_QUALITY > e_me_quality_preset)
                            {
                                pf_mv_cost_compute = compute_mv_cost_coarse;
                            }
                            else
                            {
                                /* Cost function is not called in high speed case. Below one is just a dummy function */
                                pf_mv_cost_compute = compute_mv_cost_coarse_high_speed;
                            }

                            /*********************************************************************/
                            /* Now, compute the mv for the top block                             */
                            /*********************************************************************/
                            pi2_sads_4x4_top = pi2_cur_ref_sads_4x4 + sad_top_offset;

                            /*********************************************************************/
                            /* For every blk in the picture, the search range needs to be derived*/
                            /* Any blk can have any mv, but practical search constraints are     */
                            /* imposed by the picture boundary and amt of padding.               */
                            /*********************************************************************/
                            hme_derive_search_range(
                                &s_range_prms,
                                &s_pic_limit,
                                &as_mv_limit[i1_ref_idx],
                                blk_x << blk_size_shift,
                                (blk_y - 1) << blk_size_shift,
                                blk_wd,
                                blk_ht);

                            /* Computer the mv for the top block */
                            if(ME_PRISTINE_QUALITY >= e_me_quality_preset)
                            {
                                ((ihevce_me_optimised_function_list_t *)
                                     ps_ctxt->pv_me_optimised_function_list)
                                    ->pf_combine_4x4_sads_and_compute_cost_high_quality(
                                        i1_ref_idx,
                                        &s_range_prms, /* Both 4x8 and 8x4 has same search range */
                                        &as_mv_limit[i1_ref_idx],
                                        &best_mv_4x8,
                                        &best_mv_8x4,
                                        ps_pred_ctxt,
                                        pf_mv_cost_compute,
                                        pi2_sads_4x4_top, /* Current SAD block */
                                        (pi2_sads_4x4_top +
                                         ai4_sad_4x4_block_size[i1_ref_idx]), /* East SAD block */
                                        pi2_sads_4x4_current); /* South SAD block */
                            }
                            else
                            {
                                ((ihevce_me_optimised_function_list_t *)
                                     ps_ctxt->pv_me_optimised_function_list)
                                    ->pf_combine_4x4_sads_and_compute_cost_high_speed(
                                        i1_ref_idx,
                                        &s_range_prms, /* Both 4x8 and 8x4 has same search range */
                                        &as_mv_limit[i1_ref_idx],
                                        &best_mv_4x8,
                                        &best_mv_8x4,
                                        ps_pred_ctxt,
                                        pf_mv_cost_compute,
                                        pi2_sads_4x4_top, /* Current SAD block */
                                        (pi2_sads_4x4_top +
                                         ai4_sad_4x4_block_size[i1_ref_idx]), /* East SAD block */
                                        pi2_sads_4x4_current); /* South SAD block */
                            }

                            ps_candt_fs_4x8->s_mv.i2_mvx = best_mv_4x8.i2_mv_x;
                            ps_candt_fs_4x8->s_mv.i2_mvy = best_mv_4x8.i2_mv_y;
                            ps_candt_fs_4x8->i1_ref_idx = i1_ref_idx;

                            ps_candt_fs_8x4->s_mv.i2_mvx = best_mv_8x4.i2_mv_x;
                            ps_candt_fs_8x4->s_mv.i2_mvy = best_mv_8x4.i2_mv_y;
                            ps_candt_fs_8x4->i1_ref_idx = i1_ref_idx;
                        }

                        /* call the appropriate Search Algo for 4x8S. The 4x8N would  */
                        /* have already been called by top block */
                        hme_pred_search_square_stepn(
                            &s_search_prms_8x4,
                            ps_curr_layer,
                            &ps_ctxt->s_wt_pred,
                            e_me_quality_preset,
                            (ihevce_me_optimised_function_list_t *)
                                ps_ctxt->pv_me_optimised_function_list

                        );

                        /* Call the appropriate search algo for 8x4E */
                        hme_pred_search_square_stepn(
                            &s_search_prms_4x8,
                            ps_curr_layer,
                            &ps_ctxt->s_wt_pred,
                            e_me_quality_preset,
                            (ihevce_me_optimised_function_list_t *)
                                ps_ctxt->pv_me_optimised_function_list);

                        if(ME_XTREME_SPEED_25 != e_me_quality_preset)
                        {
                            /* Histogram updates across different Ref ID for global MV */
                            hme_update_histogram(
                                ps_ctxt->aps_mv_hist[i1_ref_idx],
                                aps_best_search_node_8x4[i1_ref_idx]->s_mv.i2_mvx,
                                aps_best_search_node_8x4[i1_ref_idx]->s_mv.i2_mvy);
                            hme_update_histogram(
                                ps_ctxt->aps_mv_hist[i1_ref_idx],
                                aps_best_search_node_4x8[i1_ref_idx]->s_mv.i2_mvx,
                                aps_best_search_node_4x8[i1_ref_idx]->s_mv.i2_mvy);
                        }

                        /* update the best results to the mv bank */
                        hme_update_mv_bank_coarse(
                            ps_search_results,
                            ps_curr_layer->ps_layer_mvbank,
                            blk_x,
                            (blk_y - 1),
                            ps_ctxt->aps_best_search_nodes_4x8_n_rows[i1_ref_idx] +
                                search_node_top_offset, /* Top Candidate */
                            ps_ctxt->aps_best_search_nodes_8x4_n_rows[i1_ref_idx] +
                                search_node_left_offset, /* Left candidate */
                            i1_ref_idx,
                            &s_mv_update_prms);

                        /* Copy the best search result to 5 row array for future use */
                        *(ps_ctxt->aps_best_search_nodes_4x8_n_rows[i1_ref_idx] + blk_x +
                          ps_ctxt->ai4_row_index[blk_y - 1] * num_blks_in_row) =
                            *(aps_best_search_node_4x8[i1_ref_idx]);

                        *(ps_ctxt->aps_best_search_nodes_8x4_n_rows[i1_ref_idx] + blk_x +
                          ps_ctxt->ai4_row_index[blk_y - 1] * num_blks_in_row) =
                            *(aps_best_search_node_8x4[i1_ref_idx]);

                        /* UPDATE the MIN and MAX MVs for Dynamical Search Range for each ref. pic. */
                        /* Only for P pic. For P, both are 0, I&B has them mut. exclusive */
                        if(ps_ctxt->s_frm_prms.is_i_pic == ps_ctxt->s_frm_prms.bidir_enabled)
                        {
                            WORD32 num_mvs, i, j;
                            search_node_t *aps_search_nodes[4];
                            /* Best results for 8x4R and 4x8B blocks */
                            search_node_t *ps_search_node_8x4_r, *ps_search_node_4x8_b;

                            num_mvs = ps_curr_layer->ps_layer_mvbank->i4_num_mvs_per_ref;

                            /*************************************************************************/
                            /* We have atleast 4 distinct results: the 4x8 top (coming from top blk) */
                            /* 8x4 left (coming from left blk), 8x4 and 4x8 right and bot resp.      */
                            /* If number of results to be stored is 4, then we store all these 4     */
                            /* results, else we pick best ones                                       */
                            /*************************************************************************/
                            ps_search_node_8x4_r =
                                ps_search_results->aps_part_results[i1_ref_idx][PART_ID_2NxN_B];
                            ps_search_node_4x8_b =
                                ps_search_results->aps_part_results[i1_ref_idx][PART_ID_Nx2N_R];

                            ASSERT(num_mvs <= 4);

                            /* Doing this to sort best results */
                            aps_search_nodes[0] = ps_search_node_8x4_r;
                            aps_search_nodes[1] = ps_search_node_4x8_b;
                            aps_search_nodes[2] =
                                ps_ctxt->aps_best_search_nodes_8x4_n_rows[i1_ref_idx] +
                                search_node_left_offset; /* Left candidate */
                            aps_search_nodes[3] =
                                ps_ctxt->aps_best_search_nodes_4x8_n_rows[i1_ref_idx] +
                                search_node_top_offset; /* Top Candidate */

                            /* Note : Need to be resolved!!! */
                            /* Added this to match with "hme_update_mv_bank_coarse" */
                            if(num_mvs != 4)
                            {
                                /* Run through the results, store them in best to worst order */
                                for(i = 0; i < num_mvs; i++)
                                {
                                    for(j = i + 1; j < 4; j++)
                                    {
                                        if(aps_search_nodes[j]->i4_tot_cost <
                                           aps_search_nodes[i]->i4_tot_cost)
                                        {
                                            SWAP_HME(
                                                aps_search_nodes[j],
                                                aps_search_nodes[i],
                                                search_node_t *);
                                        }
                                    }
                                }
                            }

                            /* UPDATE the MIN and MAX MVs for Dynamical Search Range for each ref. pic. */
                            for(i = 0; i < num_mvs; i++)
                            {
                                hme_update_dynamic_search_params(
                                    &ps_ctxt->s_coarse_dyn_range_prms
                                         .as_dyn_range_prms[i4_layer_id][i1_ref_idx],
                                    aps_search_nodes[i]->s_mv.i2_mvy);
                            }
                        }
                    }
                }

                /* Update the number of blocks processed in the current row */
                ihevce_dmgr_set_row_row_sync(
                    pv_hme_dep_mngr,
                    (blk_x + 1),
                    blk_y,
                    0 /* Col Tile No. : Not supported in PreEnc*/);
            }

            /* set the output dependency after completion of row */
            ihevce_pre_enc_grp_job_set_out_dep(ps_multi_thrd_ctxt, ps_job, i4_ping_pong);
        }
    }

    return;
}