@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/

@/**
@******************************************************************************
@*
@* @brief :Evaluate best intr chroma mode (among VERT, HORZ and DC )
@*                and do the prediction.
@*
@* @par Description
@*   This function evaluates  first three intra chroma modes and compute corresponding sad
@*   and return the buffer predicted with best mode.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@** @param[in] pu1_ngbr_pels
@*  UWORD8 pointer to neighbouring pels
@*
@* @param[out] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] u4_n_avblty
@* availability of neighbouring pixels
@*
@* @param[in] u4_intra_mode
@* Pointer to the variable in which best mode is returned
@*
@* @param[in] pu4_sadmin
@* Pointer to the variable in which minimum sad is returned
@*
@* @param[in] u4_valid_intra_modes
@* Says what all modes are valid
@*
@*
@* @return      none
@*
@******************************************************************************
@*/
@
@void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
@                                      UWORD8 *pu1_ngbr_pels_i16,
@                                      UWORD8 *pu1_dst,
@                                      UWORD32 src_strd,
@                                      UWORD32 dst_strd,
@                                      WORD32 u4_n_avblty,
@                                      UWORD32 *u4_intra_mode,
@                                      WORD32 *pu4_sadmin,
@                                      UWORD32 u4_valid_intra_modes)
@
.text
.p2align 2

    .global ih264e_evaluate_intra_chroma_modes_a9q

ih264e_evaluate_intra_chroma_modes_a9q:

@r0 = pu1_src,
@r1 = pu1_ngbr_pels_i16,
@r2 = pu1_dst,
@r3 = src_strd,
@r4 = dst_strd,
@r5 = u4_n_avblty,
@r6 = u4_intra_mode,
@r7 = pu4_sadmin



    stmfd         sp!, {r4-r12, r14}    @store register values to stack
    @-----------------------
    ldr           r5, [sp, #44]         @r5 = u4_n_avblty,
    @-------------------------
    mov           r12, r1               @
    vpush         {d8-d15}
    vld1.32       {q4}, [r1]!
    add           r1, r1, #2
    vld1.32       {q5}, [r1]!

    vuzp.u8       q4, q5                @

    vpaddl.u8     d8, d8
    vpadd.u16     d8, d8

    vpaddl.u8     d9, d9
    vpadd.u16     d9, d9

    vpaddl.u8     d10, d10
    vpadd.u16     d10, d10

    vpaddl.u8     d11, d11

    and           r7, r5, #5
    vpadd.u16     d11, d11
    subs          r8, r7, #5
    beq           all_available
    subs          r8, r7, #4
    beq           top_available
    subs          r8, r7, #1
    beq           left_available
    mov           r10, #128
    vdup.8        q14, r10
    vdup.8        q15, r10
    b             sad

all_available:
    vzip.u16      q4, q5
    vext.16       q6, q4, q4, #2
    vadd.u16      q7, q5, q6
    vqrshrn.u16   d14, q7, #3
    vqrshrn.u16   d15, q4, #2
    vqrshrn.u16   d16, q5, #2
    vdup.16       d28, d14[0]
    vdup.16       d29, d16[1]
    vdup.16       d30, d15[0]
    vdup.16       d31, d14[1]
    b             sad
top_available:
    vzip.u16      q4, q5
    vqrshrn.u16   d16, q5, #2
    vdup.16       d28, d16[0]
    vdup.16       d29, d16[1]
    vdup.16       d30, d16[0]
    vdup.16       d31, d16[1]
    b             sad
left_available:
    vzip.u16      q4, q5
    vqrshrn.u16   d16, q4, #2
    vdup.16       d28, d16[3]
    vdup.16       d29, d16[3]
    vdup.16       d30, d16[2]
    vdup.16       d31, d16[2]


sad:
    vld1.32       {q4}, [r12]!
    sub           r8, r12, #2
    add           r12, r12, #2
    vld1.32       {q5}, [r12]!
    add           r12, r0, r3, lsl  #2
    sub           r10, r8, #8
    vld1.32       {q0}, [r0], r3
    ldrh          r9, [r8]
    vdup.16       q10, r9               @ row 0

    @/vertical row 0;
    vabdl.u8      q8, d0, d10
    vabdl.u8      q9, d1, d11
    sub           r8, r8, #2
    vld1.32       {q1}, [r12], r3

    @/HORZ row 0;
    vabdl.u8      q13, d0, d20
    vabdl.u8      q7, d1, d21
    ldrh          r9, [r10]
    @/dc row 0;
    vabdl.u8      q11, d0, d28
    vabdl.u8      q12, d1, d29


    vdup.16       q10, r9               @ row 4
    @/vertical row 4;
    vabal.u8      q8, d2, d10
    vabal.u8      q9, d3, d11
    sub           r10, r10, #2

    @/HORZ row 4;
    vabal.u8      q13, d2, d20
    vabal.u8      q7, d3, d21
    @/dc row 4;
    vabal.u8      q11, d2, d30
    vabal.u8      q12, d3, d31

    mov           r11, #3

loop:
    vld1.32       {q0}, [r0], r3
    ldrh          r9, [r8]


    @/vertical row i;
    vabal.u8      q8, d0, d10
    vabal.u8      q9, d1, d11

    vdup.16       q10, r9               @ row i
    vld1.32       {q1}, [r12], r3
    sub           r8, r8, #2
    @/HORZ row i;
    vabal.u8      q13, d0, d20
    vabal.u8      q7, d1, d21
    ldrh          r9, [r10]
    @/dc row i;
    vabal.u8      q11, d0, d28
    vabal.u8      q12, d1, d29
    sub           r10, r10, #2

    vdup.16       q10, r9               @ row i+4
    @/vertical row 4;
    vabal.u8      q8, d2, d10
    vabal.u8      q9, d3, d11
    subs          r11, r11, #1

    @/HORZ row i+4;
    vabal.u8      q13, d2, d20
    vabal.u8      q7, d3, d21
    @/dc row i+4;
    vabal.u8      q11, d2, d30
    vabal.u8      q12, d3, d31
    bne           loop



@-------------------------------------------

    vadd.i16      q9, q9, q8            @/VERT
    vadd.i16      q7, q13, q7           @/HORZ
    vadd.i16      q12, q11, q12         @/DC
    vadd.i16      d18, d19, d18         @/VERT
    vadd.i16      d14, d15, d14         @/HORZ
    vadd.i16      d24, d24, d25         @/DC
    vpaddl.u16    d18, d18              @/VERT
    vpaddl.u16    d14, d14              @/HORZ
    vpaddl.u16    d24, d24              @/DC
    vpaddl.u32    d18, d18              @/VERT
    vpaddl.u32    d14, d14              @/HORZ
    vpaddl.u32    d24, d24              @/DC



    vmov.u32      r8, d18[0]            @ vert
    vmov.u32      r9, d14[0]            @horz
    vmov.u32      r10, d24[0]           @dc

    mov           r11, #1
@-----------------------
    ldr           r0, [sp, #120]        @ u4_valid_intra_modes
@--------------------------------------------


    lsl           r11 , #30

    ands          r7, r0, #04           @ vert mode valid????????????
    moveq         r8, r11

    ands          r6, r0, #02           @ horz mode valid????????????
    moveq         r9, r11

    ands          r6, r0, #01           @ dc mode valid????????????
    moveq         r10, r11


    @---------------------------
    ldr           r4, [sp, #104]        @r4 = dst_strd,
    ldr           r6, [sp, #112]        @ R6 =MODE
    ldr           r7, [sp, #116]        @r7 = pu4_sadmin

    @--------------------------

    cmp           r10, r9
    bgt           not_dc
    cmp           r10, r8
    bgt           do_vert

    @/----------------------
    @DO DC PREDICTION
    str           r10 , [r7]            @MIN SAD
    mov           r10, #0
    str           r10 , [r6]            @ MODE
    b             do_dc_vert
    @-----------------------------

not_dc:
    cmp           r9, r8
    bgt           do_vert
    @/----------------------
    @DO HORIZONTAL

    vdup.16       q10, d9[3]            @/HORIZONTAL VALUE ROW=0;
    str           r9 , [r7]             @MIN SAD
    mov           r9, #1
    vdup.16       q11, d9[2]            @/HORIZONTAL VALUE ROW=1;
    str           r9 , [r6]             @ MODE
    vdup.16       q12, d9[1]            @/HORIZONTAL VALUE ROW=2;
    vst1.32       {d20, d21} , [r2], r4 @0
    vdup.16       q13, d9[0]            @/HORIZONTAL VALUE ROW=3;
    vst1.32       {d22, d23} , [r2], r4 @1
    vdup.16       q14, d8[3]            @/HORIZONTAL VALUE ROW=4;
    vst1.32       {d24, d25} , [r2], r4 @2
    vdup.16       q15, d8[2]            @/HORIZONTAL VALUE ROW=5;
    vst1.32       {d26, d27} , [r2], r4 @3
    vdup.16       q1, d8[1]             @/HORIZONTAL VALUE ROW=6;
    vst1.32       {d28, d29} , [r2], r4 @4
    vdup.16       q2, d8[0]             @/HORIZONTAL VALUE ROW=7;
    vst1.32       {d30, d31} , [r2], r4 @5
    vst1.32       {d2, d3} , [r2], r4   @6
    vst1.32       {d4, d5} , [r2], r4   @7
    b             end_func

do_vert:
    @DO VERTICAL PREDICTION
    str           r8 , [r7]             @MIN SAD
    mov           r8, #2
    str           r8 , [r6]             @ MODE
    vmov          q15, q5
    vmov          q14, q5

do_dc_vert:
    vst1.32       {d28, d29} , [r2], r4 @0
    vst1.32       {d28, d29} , [r2], r4 @1
    vst1.32       {d28, d29} , [r2], r4 @2
    vst1.32       {d28, d29} , [r2], r4 @3
    vst1.32       {d30, d31} , [r2], r4 @4
    vst1.32       {d30, d31} , [r2], r4 @5
    vst1.32       {d30, d31} , [r2], r4 @6
    vst1.32       {d30, d31} , [r2], r4 @7


end_func:
    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, pc}     @Restoring registers from stack