@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_mem_fns_neon.s
@ *
@ * @brief
@ *  Contains function definitions for memory manipulation
@ *
@ * @author
@ *  Naveen SR
@ *
@ * @par List of Functions:
@ *  - ih264_memcpy_mul_8_a9q()
@ *  - ih264_memcpy_a9q()
@ *  - ih264_memset_mul_8_a9q()
@ *  - ih264_memset_a9q()
@ *  - ih264_memset_16bit_mul_8_a9q()
@ *  - ih264_memset_a9q()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@*

@**
@*******************************************************************************
@*
@* @brief
@*   memcpy of a 1d array
@*
@* @par Description:
@*   Does memcpy of 8bit data from source to destination for 8,16 or 32 number of bytes
@*
@* @param[in] pu1_dst
@*  UWORD8 pointer to the destination
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source
@*
@* @param[in] num_bytes
@*  number of bytes to copy
@* @returns
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
@                    UWORD8 *pu1_src,
@                   UWORD32 num_bytes)
@**************Variables Vs Registers*************************
@   r0 => *pu1_dst
@   r1 => *pu1_src
@   r2 => num_bytes

.text
.p2align 2


    .global ih264_memcpy_mul_8_a9q

ih264_memcpy_mul_8_a9q:

loop_neon_memcpy_mul_8:
    @ Memcpy 8 bytes
    vld1.8        d0, [r1]!
    vst1.8        d0, [r0]!

    subs          r2, r2, #8
    bne           loop_neon_memcpy_mul_8
    bx            lr



@*******************************************************************************
@*
@void ih264_memcpy(UWORD8 *pu1_dst,
@                  UWORD8 *pu1_src,
@                  UWORD32 num_bytes)
@**************Variables Vs Registers*************************
@   r0 => *pu1_dst
@   r1 => *pu1_src
@   r2 => num_bytes



    .global ih264_memcpy_a9q

ih264_memcpy_a9q:
    subs          r2, #8
    blt           memcpy
loop_neon_memcpy:
    @ Memcpy 8 bytes
    vld1.8        d0, [r1]!
    vst1.8        d0, [r0]!

    subs          r2, #8
    bge           loop_neon_memcpy
    cmp           r2, #-8
    bxeq          lr

memcpy:
    add           r2, #8

loop_memcpy:
    ldrb          r3, [r1], #1
    strb          r3, [r0], #1
    subs          r2, #1
    bne           loop_memcpy
    bx            lr




@void ih264_memset_mul_8(UWORD8 *pu1_dst,
@                       UWORD8 value,
@                       UWORD32 num_bytes)
@**************Variables Vs Registers*************************
@   r0 => *pu1_dst
@   r1 => value
@   r2 => num_bytes





    .global ih264_memset_mul_8_a9q

ih264_memset_mul_8_a9q:

@ Assumptions: numbytes is either 8, 16 or 32
    vdup.8        d0, r1
loop_memset_mul_8:
    @ Memset 8 bytes
    vst1.8        d0, [r0]!

    subs          r2, r2, #8
    bne           loop_memset_mul_8

    bx            lr




@void ih264_memset(UWORD8 *pu1_dst,
@                       UWORD8 value,
@                       UWORD8 num_bytes)
@**************Variables Vs Registers*************************
@   r0 => *pu1_dst
@   r1 => value
@   r2 => num_bytes



    .global ih264_memset_a9q

ih264_memset_a9q:
    subs          r2, #8
    blt           memset
    vdup.8        d0, r1
loop_neon_memset:
    @ Memcpy 8 bytes
    vst1.8        d0, [r0]!

    subs          r2, #8
    bge           loop_neon_memset
    cmp           r2, #-8
    bxeq          lr

memset:
    add           r2, #8

loop_memset:
    strb          r1, [r0], #1
    subs          r2, #1
    bne           loop_memset
    bx            lr




@void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
@                                   UWORD16 value,
@                                   UWORD32 num_words)
@**************Variables Vs Registers*************************
@   r0 => *pu2_dst
@   r1 => value
@   r2 => num_words





    .global ih264_memset_16bit_mul_8_a9q

ih264_memset_16bit_mul_8_a9q:

@ Assumptions: num_words is either 8, 16 or 32

    @ Memset 8 words
    vdup.16       d0, r1
loop_memset_16bit_mul_8:
    vst1.16       d0, [r0]!
    vst1.16       d0, [r0]!

    subs          r2, r2, #8
    bne           loop_memset_16bit_mul_8

    bx            lr




@void ih264_memset_16bit(UWORD16 *pu2_dst,
@                       UWORD16 value,
@                       UWORD32 num_words)
@**************Variables Vs Registers*************************
@   r0 => *pu2_dst
@   r1 => value
@   r2 => num_words



    .global ih264_memset_16bit_a9q

ih264_memset_16bit_a9q:
    subs          r2, #8
    blt           memset_16bit
    vdup.16       d0, r1
loop_neon_memset_16bit:
    @ Memset 8 words
    vst1.16       d0, [r0]!
    vst1.16       d0, [r0]!

    subs          r2, #8
    bge           loop_neon_memset_16bit
    cmp           r2, #-8
    bxeq          lr

memset_16bit:
    add           r2, #8

loop_memset_16bit:
    strh          r1, [r0], #2
    subs          r2, #1
    bne           loop_memset_16bit
    bx            lr