//****************************************************************************** //* //* Copyright (C) 2015 The Android Open Source Project //* //* Licensed under the Apache License, Version 2.0 (the "License"); //* you may not use this file except in compliance with the License. //* You may obtain a copy of the License at: //* //* http://www.apache.org/licenses/LICENSE-2.0 //* //* Unless required by applicable law or agreed to in writing, software //* distributed under the License is distributed on an "AS IS" BASIS, //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //* See the License for the specific language governing permissions and //* limitations under the License. //* //***************************************************************************** //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore //*/ ///** //****************************************************************************** //* //* @brief :Evaluate best intra chroma mode (among VERT, HORZ and DC) //* and do the prediction. //* //* @par Description //* This function evaluates the first three intra chroma modes, computes the corresponding SAD for each, //* and returns the buffer predicted with the best mode. 
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] pu1_ngbr_pels
//*  UWORD8 pointer to neighbouring pels
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] u4_n_avblty
//*  availability of neighbouring pixels
//*
//* @param[out] u4_intra_mode
//*  Pointer to the variable in which best mode is returned
//*
//* @param[out] pu4_sadmin
//*  Pointer to the variable in which minimum sad is returned
//*
//* @param[in] u4_valid_intra_modes
//*  Says what all modes are valid
//*
//*
//* @return      none
//*
//******************************************************************************
//*/
//
//void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_ngbr_pels_i16,
//                                        UWORD8 *pu1_dst,
//                                        UWORD32 src_strd,
//                                        UWORD32 dst_strd,
//                                        WORD32 u4_n_avblty,
//                                        UWORD32 *u4_intra_mode,
//                                        WORD32 *pu4_sadmin,
//                                        UWORD32 u4_valid_intra_modes)
//
.text
.p2align 2
.include "ih264_neon_macros.s"

    .global ih264e_evaluate_intra_chroma_modes_av8

ih264e_evaluate_intra_chroma_modes_av8:

// Register map on entry:
//x0 = pu1_src,
//x1 = pu1_ngbr_pels_i16,
//x2 = pu1_dst,
//w3 = src_strd,
//w4 = dst_strd,
//w5 = u4_n_avblty,
//x6 = u4_intra_mode,
//x7 = pu4_sadmin
//[sp] = u4_valid_intra_modes (ninth argument, passed on the stack)
//
// Values written through u4_intra_mode: 0 = DC, 1 = HORZ, 2 = VERT.
// Bits tested in u4_valid_intra_modes:  1 = DC, 2 = HORZ, 4 = VERT.
//
// NOTE: x18 is deliberately left untouched. It is the AAPCS64 platform
// register and is reserved on several targets; the copy of u4_n_avblty that
// used to be placed in w18 was never read, so it has been dropped.

    // STMFD sp!, {x4-x12, x14}          //store register values to stack
    push_v_regs
    sxtw      x3, w3                    // strides are used as 64-bit post-increments
    sxtw      x4, w4
    stp       x19, x20, [sp, #-16]!     // x19/x20 are callee-saved scratch here
    //-----------------------
    // Offset 80 = 16 bytes (x19/x20 above) + the bytes pushed by push_v_regs.
    // NOTE(review): presumably push_v_regs pushes 64 bytes - confirm in
    // ih264_neon_macros.s.
    ldr       w16, [sp, #80]            // w16 = u4_valid_intra_modes
    mov       x17, x4                   // keep dst_strd
    mov       x14, x6                   // keep u4_intra_mode pointer
    mov       x15, x7                   // keep pu4_sadmin pointer
    mov       w19, #5
    ands      w6, w5, w19               // only bits 0 and 2 of u4_n_avblty matter
    beq       none_available            // 0 -> no neighbours
    cmp       w6, #1
    beq       left_only_available       // 1 -> left only
    cmp       w6, #4
    beq       top_only_available        // 4 -> top only
                                        // 5 -> fall through, both available

// DC predictors are left in v28/v29 (used for rows 0-3) and v30/v31 (rows 4-7).
all_available:
    ld1       {v0.8b, v1.8b}, [x1]      // 16 neighbour bytes at pu1_ngbr_pels
    add       x6, x1, #18
    ld1       {v2.8b, v3.8b}, [x6]      // 16 neighbour bytes at pu1_ngbr_pels + 18
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    rshrn     v5.8b, v0.8h, #2          // rounded sum >> 2 (one group)
    dup       v21.8h, v5.h[0]
    rshrn     v6.8b, v3.8h, #2
    dup       v20.8h, v6.h[0]
    add       v1.8h, v1.8h, v2.8h
    rshrn     v1.8b, v1.8h, #3          // rounded sum >> 3 (two groups combined)
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]
    add       v0.8h, v0.8h, v3.8h
    rshrn     v0.8b, v0.8h, #3
    dup       v23.8h, v0.h[0]
    mov       v31.d[0], v23.d[0]
    mov       v28.d[0], v20.d[0]
    mov       v29.d[0], v20.d[1]
    mov       v30.d[0], v21.d[0]
    b         sad_comp

left_only_available:
    ld1       {v0.8b, v1.8b}, [x1]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v28.8h , v1.h[0]
    dup       v29.8h , v1.h[0]
    dup       v30.8h, v0.h[0]
    dup       v31.8h, v0.h[0]
    b         sad_comp

top_only_available:
    add       x6, x1, #18
    ld1       {v0.8b, v1.8b}, [x6]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v28.8h , v0.h[0]
    dup       v30.8h, v1.h[0]
    mov       v29.d[0], v30.d[1]
    mov       v30.d[0], v28.d[0]
    mov       v31.d[0], v30.d[1]
    b         sad_comp

none_available:
    mov       w20, #128                 // every DC predictor byte is 128
    dup       v28.16b, w20
    dup       v29.16b, w20
    dup       v30.16b, w20
    dup       v31.16b, w20

// SAD accumulators: v16/v18 = vertical, v26/v14 = horizontal, v22/v24 = DC.
// The horizontal predictor for row r is halfword lane (7 - r) of v27.
sad_comp:
    add       x6, x1, #18
    ld1       {v10.8b, v11.8b}, [x6]    // vertical values
    ld1       {v27.8h}, [x1]            // horizontal values, one halfword per row
    dup       v20.8h, v27.h[7]          ///HORIZONTAL VALUE ROW=0//
    dup       v21.8h, v27.h[7]
    ld1       { v0.8b, v1.8b}, [x0], x3 // source row 0
    ///vertical row 0@
    uabdl     v16.8h, v0.8b, v10.8b
    uabdl     v18.8h, v1.8b, v11.8b
    ///HORZ row 0@
    uabdl     v26.8h, v0.8b, v20.8b
    uabdl     v14.8h, v1.8b, v21.8b
    ld1       {v2.8b, v3.8b}, [x0], x3  // source row 1
    ///dc row 0@
    uabdl     v22.8h, v0.8b, v28.8b
    uabdl     v24.8h, v1.8b, v29.8b
    dup       v20.8h, v27.h[6]
    dup       v21.8h, v27.h[6]          ///HORIZONTAL VALUE ROW=1//
    ///vertical row 1@
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b
    ld1       { v4.8b, v5.8b}, [x0], x3 // source row 2
    ///HORZ row 1@
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b
    ///dc row 1@
    uabal     v22.8h, v2.8b, v28.8b
    uabal     v24.8h, v3.8b, v29.8b
    dup       v20.8h, v27.h[5]
    dup       v21.8h, v27.h[5]          ///HORIZONTAL VALUE ROW=2//
    ///vertical row 2@
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b
    ld1       { v6.8b, v7.8b}, [x0], x3 // source row 3
    ///HORZ row 2@
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b
    ///dc row 2@
    uabal     v22.8h, v4.8b, v28.8b
    uabal     v24.8h, v5.8b, v29.8b
    dup       v20.8h, v27.h[4]
    dup       v21.8h, v27.h[4]          ///HORIZONTAL VALUE ROW=3//
    ///vertical row 3@
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b
    ///HORZ row 3@
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b
    ///dc row 3@
    uabal     v22.8h, v6.8b, v28.8b
    uabal     v24.8h, v7.8b, v29.8b
    //----------------------------------------------------------------------------------------------
    // Second group of four rows: DC comparison switches to v30/v31.
    ld1       { v0.8b, v1.8b}, [x0], x3 // source row 4
    dup       v20.8h, v27.h[3]
    dup       v21.8h, v27.h[3]          ///HORIZONTAL VALUE ROW=0//
    ///vertical row 0@
    uabal     v16.8h, v0.8b, v10.8b
    uabal     v18.8h, v1.8b, v11.8b
    ///HORZ row 0@
    uabal     v26.8h, v0.8b, v20.8b
    uabal     v14.8h, v1.8b, v21.8b
    ld1       { v2.8b, v3.8b}, [x0], x3 // source row 5
    ///dc row 0@
    uabal     v22.8h, v0.8b, v30.8b
    uabal     v24.8h, v1.8b, v31.8b
    dup       v20.8h, v27.h[2]
    dup       v21.8h, v27.h[2]          ///HORIZONTAL VALUE ROW=1//
    ///vertical row 1@
    uabal     v16.8h, v2.8b, v10.8b
    uabal     v18.8h, v3.8b, v11.8b
    ///HORZ row 1@
    uabal     v26.8h, v2.8b, v20.8b
    uabal     v14.8h, v3.8b, v21.8b
    ld1       { v4.8b, v5.8b}, [x0], x3 // source row 6
    ///dc row 1@
    uabal     v22.8h, v2.8b, v30.8b
    uabal     v24.8h, v3.8b, v31.8b
    dup       v20.8h, v27.h[1]
    dup       v21.8h, v27.h[1]          ///HORIZONTAL VALUE ROW=2//
    ///vertical row 2@
    uabal     v16.8h, v4.8b, v10.8b
    uabal     v18.8h, v5.8b, v11.8b
    ///HORZ row 2@
    uabal     v26.8h, v4.8b, v20.8b
    uabal     v14.8h, v5.8b, v21.8b
    ld1       {v6.8b, v7.8b}, [x0], x3  // source row 7
    ///dc row 2@
    uabal     v22.8h, v4.8b, v30.8b
    uabal     v24.8h, v5.8b, v31.8b
    dup       v20.8h, v27.h[0]
    dup       v21.8h, v27.h[0]          ///HORIZONTAL VALUE ROW=3//
    ///vertical row 3@
    uabal     v16.8h, v6.8b, v10.8b
    uabal     v18.8h, v7.8b, v11.8b
    ///HORZ row 3@
    uabal     v26.8h, v6.8b, v20.8b
    uabal     v14.8h, v7.8b, v21.8b
    ///dc row 3@
    uabal     v22.8h, v6.8b, v30.8b
    uabal     v24.8h, v7.8b, v31.8b
    //-------------------------------------------

    // Reduce each pair of accumulators to a scalar: x8 = vert, x9 = horz, x10 = dc.
    //vert sum
    add       v16.8h, v16.8h , v18.8h
    mov       v18.d[0], v16.d[1]
    add       v16.4h, v16.4h , v18.4h
    uaddlp    v16.2s, v16.4h
    addp      v16.2s, v16.2s, v16.2s
    smov      x8, v16.s[0]

    //horz sum
    add       v26.8h, v26.8h , v14.8h
    mov       v14.d[0], v26.d[1]
    add       v26.4h, v26.4h , v14.4h
    uaddlp    v26.2s, v26.4h
    addp      v26.2s, v26.2s, v26.2s
    smov      x9, v26.s[0]

    //dc sum
    add       v24.8h, v22.8h , v24.8h   ///DC
    mov       v25.d[0], v24.d[1]
    add       v24.4h, v24.4h , v25.4h   ///DC
    uaddlp    v24.2s, v24.4h            ///DC
    addp      v24.2s, v24.2s, v24.2s    ///DC
    smov      x10, v24.s[0]             //dc

    mov       x11, #1
    //-----------------------
    mov       w0, w16                   // u4_valid_intra_modes
    //--------------------------------------------
    // A mode whose valid bit is clear gets its SAD replaced by 1 << 30 so
    // the comparisons below cannot pick it over a valid mode.
    lsl       x11, x11, #30
    ands      w7, w0, #04               // vert mode valid?
    csel      x8, x11, x8, eq
    ands      w6, w0, #02               // horz mode valid?
    csel      x9, x11, x9, eq
    ands      w6, w0, #01               // dc mode valid?
    csel      x10, x11, x10, eq
    //---------------------------
    // w4/w6/w7 were reused as scratch above; restore the saved arguments.
    mov       x4, x17                   // dst_strd
    mov       x6, x14                   // u4_intra_mode
    mov       x7, x15                   // pu4_sadmin
    //--------------------------
    // Pick the smallest SAD. On ties DC wins over HORZ and VERT, and HORZ
    // wins over VERT (each branch is taken only on strictly greater).
    cmp       x10, x9
    bgt       not_dc
    cmp       x10, x8
    bgt       do_vert
    ///----------------------
    //DO DC PREDICTION
    str       w10 , [x7]                //MIN SAD
    mov       w10, #0
    str       w10 , [x6]                // MODE
    b         do_dc_vert
    //-----------------------------
not_dc:
    cmp       x9, x8
    bgt       do_vert
    ///----------------------
    //DO HORIZONTAL
    str       w9 , [x7]                 //MIN SAD
    mov       w10, #1
    str       w10 , [x6]                // MODE
    // Each output row is one neighbour halfword replicated eight times;
    // row 0 uses lane 7, row 7 uses lane 0.
    ld1       {v0.8h}, [x1]
    dup       v10.8h, v0.h[7]
    dup       v11.8h, v0.h[6]
    dup       v12.8h, v0.h[5]
    dup       v13.8h, v0.h[4]
    st1       {v10.8h}, [x2], x4
    dup       v14.8h, v0.h[3]
    st1       {v11.8h}, [x2], x4
    dup       v15.8h, v0.h[2]
    st1       {v12.8h}, [x2], x4
    dup       v16.8h, v0.h[1]
    st1       {v13.8h}, [x2], x4
    dup       v17.8h, v0.h[0]
    st1       {v14.8h}, [x2], x4
    st1       {v15.8h}, [x2], x4
    st1       {v16.8h}, [x2], x4
    st1       {v17.8h}, [x2], x4
    b         end_func

do_vert:
    //DO VERTICAL PREDICTION
    str       w8 , [x7]                 //MIN SAD
    mov       w8, #2
    str       w8 , [x6]                 // MODE
    // Load the same top row into both register pairs so the shared store
    // sequence below writes it to all eight rows.
    add       x6, x1, #18
    ld1       {v28.8b, v29.8b}, [x6]    // vertical values
    ld1       {v30.8b, v31.8b}, [x6]    // vertical values

// Shared by DC and VERT: rows 0-3 come from v28/v29, rows 4-7 from v30/v31.
do_dc_vert:
    st1       {v28.2s, v29.2s} , [x2], x4 //0
    st1       {v28.2s, v29.2s} , [x2], x4 //1
    st1       {v28.2s, v29.2s} , [x2], x4 //2
    st1       {v28.2s, v29.2s} , [x2], x4 //3
    st1       {v30.2s, v31.2s} , [x2], x4 //4
    st1       {v30.2s, v31.2s} , [x2], x4 //5
    st1       {v30.2s, v31.2s} , [x2], x4 //6
    st1       {v30.2s, v31.2s} , [x2], x4 //7

end_func:
    // LDMFD sp!,{x4-x12,PC}         //Restoring registers from stack
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret