//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_intra_pred_chroma.s
//*
//* @brief
//*  Contains function definitions for intra chroma prediction.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_chroma_8x8_mode_vert_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_horz_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_dc_av8()
//*  - ih264_intra_pred_chroma_8x8_mode_plane_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
//

///**
///**
///**
//

.text
.p2align 2
.include "ih264_neon_macros.s"

.extern ih264_gai1_intrapred_chroma_plane_coeffs1
.extern ih264_gai1_intrapred_chroma_plane_coeffs2

///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_dc
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:DC
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  Only bits 0 and 2 of ui_neighboravailability are examined:
//*    (flags & 5) == 0 -> none_available       (every output byte is 128)
//*    (flags & 5) == 1 -> left_only_available  (reads pu1_src[0..15])
//*    (flags & 5) == 4 -> top_only_available   (reads pu1_src[18..33])
//*    (flags & 5) == 5 -> all_available        (reads both ranges)
//*  Eight rows of 16 bytes are written to pu1_dst, dst_strd bytes apart.
//*
//*******************************************************************************/
//void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
//                                         UWORD8 *pu1_dst,
//                                         WORD32 src_strd,
//                                         WORD32 dst_strd,
//                                         WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 => src_strd                 (not read by this routine)
//    x3 => dst_strd                 (WORD32 in w3; sign-extended on entry)
//    x4 => ui_neighboravailability  (only bits 0 and 2 are used)
//
//    Scratch: x6, x7, w15, v0-v6, v20-v23, flags



    .global ih264_intra_pred_chroma_8x8_mode_dc_av8

ih264_intra_pred_chroma_8x8_mode_dc_av8:

    // NOTE(review): this routine writes only v0-v6 and v20-v23. If
    // push_v_regs/pop_v_regs (defined in ih264_neon_macros.s, not visible
    // here) merely spill v8-v15, the pair could be dropped - confirm first.
    push_v_regs

    // dst_strd is a 32-bit argument; AAPCS64 leaves bits 63:32 of x3
    // unspecified, and x3 is used below as a 64-bit post-index increment.
    sxtw      x3, w3

    // x6 = ui_neighboravailability & 5. The value 5 is not encodable as a
    // logical immediate, so it goes through a caller-saved scratch register
    // (no callee-saved general-purpose register needs to be spilled).
    mov       x7, #5
    ands      x6, x4, x7
    beq       none_available            // neither bit 0 nor bit 2 set
    cmp       x6, #1
    beq       left_only_available       // only bit 0 set
    cmp       x6, #4
    beq       top_only_available        // only bit 2 set

all_available:
    ld1       {v0.8b, v1.8b}, [x0]      // v0 = src[0..7],   v1 = src[8..15]
    add       x6, x0, #18
    ld1       {v2.8b, v3.8b}, [x6]      // v2 = src[18..25], v3 = src[26..33]

    // Widen bytes to halfwords, then add as 32-bit lanes: every lane packs
    // two adjacent halfwords, so after two pairwise adds lane 0 holds
    //   low  16 bits = sum of the four even-indexed bytes
    //   high 16 bits = sum of the four odd-indexed bytes
    // (each sum is at most 4 * 255, so no carry crosses the halves).
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    uxtl      v2.8h, v2.8b
    uxtl      v3.8h, v3.8b
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s
    addp      v2.4s, v2.4s , v2.4s
    addp      v3.4s, v3.4s , v3.4s

    rshrn     v5.8b, v0.8h, #2          // rounded mean of src[0..7] per byte parity
    dup       v21.8h, v5.h[0]           // rows 4-7: that byte pair repeated
    rshrn     v6.8b, v3.8h, #2          // rounded mean of src[26..33]
    dup       v20.8h, v6.h[0]           // rows 0-3: that byte pair repeated

    add       v1.8h, v1.8h, v2.8h       // src[8..15] + src[18..25] sums
    rshrn     v1.8b, v1.8h, #3          // rounded mean over 8 samples
    dup       v23.8h, v1.h[0]
    mov       v20.d[0], v23.d[0]        // rows 0-3, bytes 0..7

    add       v0.8h, v0.8h, v3.8h       // src[0..7] + src[26..33] sums
    rshrn     v0.8b, v0.8h, #3          // rounded mean over 8 samples
    dup       v23.8h, v0.h[0]
    mov       v21.d[1], v23.d[0]        // rows 4-7, bytes 8..15
    b         store

left_only_available:
    ld1       {v0.8b, v1.8b}, [x0]      // v0 = src[0..7], v1 = src[8..15]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s      // packed even/odd byte sums, as above
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v1.h[0]          // rows 0-3 <- mean of src[8..15]
    dup       v21.8h, v0.h[0]           // rows 4-7 <- mean of src[0..7]
    b         store

top_only_available:
    add       x6, x0, #18
    ld1       {v0.8b, v1.8b}, [x6]      // v0 = src[18..25], v1 = src[26..33]
    uxtl      v0.8h, v0.8b
    uxtl      v1.8h, v1.8b
    addp      v0.4s, v0.4s , v0.4s      // packed even/odd byte sums, as above
    addp      v1.4s, v1.4s , v1.4s
    addp      v0.4s, v0.4s , v0.4s
    addp      v1.4s, v1.4s , v1.4s
    rshrn     v0.8b, v0.8h, #2
    rshrn     v1.8b, v1.8h, #2
    dup       v20.8h , v0.h[0]          // mean of src[18..25]
    dup       v21.8h, v1.h[0]           // mean of src[26..33]
    mov       v20.d[1], v21.d[1]        // v20 = [mean(18..25) | mean(26..33)]
    mov       v21.d[0], v20.d[0]        // v21 = same layout as v20
    b         store

none_available:
    mov       w15, #128
    dup       v20.16b, w15              // every output byte is 128
    dup       v21.16b, w15

store:
    // Rows 0-3 come from v20, rows 4-7 from v21; x1 advances by dst_strd.
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v20.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3
    st1       { v21.16b}, [x1], x3

end_func:
    pop_v_regs
    ret



///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_horz
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:Horizontal
//*
//* @par
//* Description:
//*  Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  Reads 16 bytes at pu1_src as eight 16-bit elements e[0..7]. Output row r
//*  (r = 0..7) is 16 bytes holding element e[7 - r] repeated eight times.
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 => src_strd                 (not read by this routine)
//    x3 => dst_strd                 (WORD32 in w3; sign-extended on entry)
//    x4 => ui_neighboravailability  (not read by this routine)
//
//    Scratch: v0, v10-v17



    .global ih264_intra_pred_chroma_8x8_mode_horz_av8

ih264_intra_pred_chroma_8x8_mode_horz_av8:

    // v10-v15 are written below. push_v_regs/pop_v_regs come from
    // ih264_neon_macros.s (not visible here); presumably they preserve the
    // callee-saved SIMD registers - confirm against the macro file.
    push_v_regs

    // dst_strd is a 32-bit argument; AAPCS64 leaves bits 63:32 of x3
    // unspecified, and x3 is used below as a 64-bit post-index increment.
    sxtw      x3, w3

    ld1       {v0.8h}, [x0]             // e[0..7] = eight halfwords at pu1_src

    // Broadcasts are interleaved with the stores; the first row written uses
    // e[7] and the last row uses e[0].
    dup       v10.8h, v0.h[7]
    dup       v11.8h, v0.h[6]
    dup       v12.8h, v0.h[5]
    dup       v13.8h, v0.h[4]
    st1       {v10.8h}, [x1], x3        // row 0 <- e[7]
    dup       v14.8h, v0.h[3]
    st1       {v11.8h}, [x1], x3        // row 1 <- e[6]
    dup       v15.8h, v0.h[2]
    st1       {v12.8h}, [x1], x3        // row 2 <- e[5]
    dup       v16.8h, v0.h[1]
    st1       {v13.8h}, [x1], x3        // row 3 <- e[4]
    dup       v17.8h, v0.h[0]
    st1       {v14.8h}, [x1], x3        // row 4 <- e[3]
    st1       {v15.8h}, [x1], x3        // row 5 <- e[2]
    st1       {v16.8h}, [x1], x3        // row 6 <- e[1]
    st1       {v17.8h}, [x1], x3        // row 7 <- e[0]

    pop_v_regs
    ret




///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_vert
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:vertical
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  Copies the 16 bytes at pu1_src + 18 into each of the eight output rows.
//*
//*******************************************************************************
//void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src                 (advanced by 18 inside the routine)
//    x1 => *pu1_dst
//    x2 => src_strd                 (not read by this routine)
//    x3 => dst_strd                 (WORD32 in w3; sign-extended on entry)
//    x4 => ui_neighboravailability  (not read by this routine)
//
//    Scratch: v0, v1


    .global ih264_intra_pred_chroma_8x8_mode_vert_av8

ih264_intra_pred_chroma_8x8_mode_vert_av8:

    // NOTE(review): only v0 and v1 are written in this routine. If
    // push_v_regs/pop_v_regs (defined in ih264_neon_macros.s, not visible
    // here) merely spill v8-v15, the pair could be dropped - confirm first.
    push_v_regs

    // dst_strd is a 32-bit argument; AAPCS64 leaves bits 63:32 of x3
    // unspecified, and x3 is used below as a 64-bit post-index increment.
    sxtw      x3, w3

    add       x0, x0, #18
    ld1       {v0.8b, v1.8b}, [x0]      // v0:v1 = src[18..33]

    // Same 16 bytes written to all eight rows, dst_strd bytes apart.
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3
    st1       {v0.8b, v1.8b}, [x1], x3

    pop_v_regs
    ret




///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_chroma_8x8_mode_plane
//*
//* @brief
//*  Perform Intra prediction for chroma_8x8 mode:PLANE
//*
//* @par Description:
//*  Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source containing alternate U and V samples
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination with alternate U and V samples
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void
// ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src                 (post-incremented by 2 inside the routine)
//    x1 => *pu1_dst
//    x2 => src_strd                 (not read by this routine)
//    x3 => dst_strd                 (WORD32 in w3; sign-extended on entry)
//    x4 => ui_neighboravailability  (not read by this routine)
//
//    Scratch: x6-x10, x12, v0-v18, v20, v22, v24, v26, v28-v30
//
//    Reads pu1_src[0..7], [10..33] and two coefficient tables reached through
//    the GOT; writes eight rows of 16 bytes to pu1_dst, dst_strd bytes apart.

    .global ih264_intra_pred_chroma_8x8_mode_plane_av8

ih264_intra_pred_chroma_8x8_mode_plane_av8:

    // v8-v15 are written below. push_v_regs/pop_v_regs come from
    // ih264_neon_macros.s (not visible here); presumably they preserve the
    // callee-saved SIMD registers - confirm against the macro file.
    // No callee-saved general-purpose register is touched, so none is spilled.
    push_v_regs

    // dst_strd is a 32-bit argument; AAPCS64 leaves bits 63:32 of x3
    // unspecified, and x3 is used below as a 64-bit post-index increment.
    sxtw      x3, w3

    // ---- Load the four 8-byte neighbour groups ----------------------------
    ld1       {v0.2s}, [x0]             // v0 = src[0..7]
    add       x10, x0, #10
    ld1       {v1.2s}, [x10]            // v1 = src[10..17]
    add       x10, x10, #6
    rev64     v5.4h, v0.4h              // v5 = v0 with its 4 halfwords reversed
    ld1       {v2.2s}, [x10], #8        // v2 = src[16..23]
    add       x10, x10, #2
    rev64     v7.4h, v2.4h              // v7 = v2 with its 4 halfwords reversed
    ld1       {v3.2s}, [x10]            // v3 = src[26..33]

    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs1
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs1]

    // ---- Weighted differences ---------------------------------------------
    usubl     v10.8h, v5.8b, v1.8b      // 16-bit differences: rev(src[0..7]) - src[10..17]
    ld1       {v8.8b, v9.8b}, [x12]     // multiplication factors (16 bytes of coeffs1) into v8/v9
    mov       v8.d[1], v9.d[0]          // v8 = all eight 16-bit factors
    usubl     v12.8h, v3.8b, v7.8b      // 16-bit differences: src[26..33] - rev(src[16..23])
    mul       v14.8h, v10.8h , v8.8h
    mul       v16.8h, v12.8h , v8.8h

    // De-interleave even/odd halfwords of the two products and give each
    // group of four its own register (low 64 bits):
    //   v14 = even lanes of first product,  v15 = even lanes of second
    //   v16 = odd  lanes of first product,  v17 = odd  lanes of second
    uzp1      v15.8h, v14.8h, v16.8h
    uzp2      v16.8h, v14.8h, v16.8h
    mov       v14.16b, v15.16b
    mov       v15.d[0], v14.d[1]
    mov       v17.d[0], v16.d[1]

    // Two pairwise adds leave the sum of each group's four lanes in lane 0.
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h
    addp      v14.4h, v14.4h, v14.4h
    addp      v15.4h, v15.4h, v15.4h
    addp      v16.4h, v16.4h, v16.4h
    addp      v17.4h, v17.4h, v17.4h

    // Scale every sum: (34 * sum + 32) >> 6, via widening multiply and a
    // rounding narrowing shift.
    mov       x6, #34
    dup       v18.8h, w6
    smull     v22.4s, v14.4h, v18.4h
    smull     v24.4s, v15.4h, v18.4h
    smull     v26.4s, v16.4h, v18.4h
    smull     v28.4s, v17.4h, v18.4h
    rshrn     v10.4h, v22.4s, #6        // from even lanes, first product
    rshrn     v12.4h, v24.4s, #6        // from even lanes, second product
    rshrn     v13.4h, v26.4s, #6        // from odd lanes,  first product
    rshrn     v14.4h, v28.4s, #6        // from odd lanes,  second product

    // ---- Constant term -----------------------------------------------------
    // ldrb zero-extends into the full x register, so no extension is needed.
    ldrb      w6, [x0], #1              // w6 = src[0]
    add       x10, x0, #31              // x0 already advanced by 1 -> x10 = src + 32
    ldrb      w8, [x0], #1              // w8 = src[1]
    ldrb      w7, [x10], #1             // w7 = src[32]
    ldrb      w9, [x10], #1             // w9 = src[33]
    add       x6, x6, x7
    add       x8, x8, x9
    lsl       x6, x6, #4                // x6 = (src[0] + src[32]) * 16
    lsl       x8, x8, #4                // x8 = (src[1] + src[33]) * 16

    // Broadcast, then interleave the even-derived and odd-derived values so
    // that adjacent halfword lanes alternate between the two.
    dup       v0.8h, w6
    dup       v2.8h, w8
    dup       v4.8h, v12.h[0]
    dup       v6.8h, v10.h[0]
    dup       v24.8h, v14.h[0]
    dup       v26.8h, v13.h[0]
    zip1      v5.8h, v4.8h, v24.8h
    zip2      v24.8h, v4.8h, v24.8h
    mov       v4.16b, v5.16b            // v4 = (v12[0], v14[0]) alternating
    zip1      v7.8h, v6.8h, v26.8h
    zip2      v26.8h, v6.8h, v26.8h
    mov       v6.16b, v7.16b            // v6 = (v10[0], v13[0]) alternating
    zip1      v1.8h, v0.8h, v2.8h
    zip2      v2.8h, v0.8h, v2.8h
    mov       v0.16b, v1.16b            // v0 = (x6, x8) alternating

    // ---- Per-column and per-row factors from coeffs2 -------------------------
    adrp      x12, :got:ih264_gai1_intrapred_chroma_plane_coeffs2
    ldr       x12, [x12, #:got_lo12:ih264_gai1_intrapred_chroma_plane_coeffs2]
    ld1       {v8.2s, v9.2s}, [x12]
    mov       v8.d[1], v9.d[0]          // v8 = eight 16-bit coefficients c[0..7]
    mov       v10.16b, v8.16b
    mov       v22.16b, v8.16b           // v22 keeps c[0..7] for the row loop
    zip1      v9.8h, v8.8h, v10.8h
    zip2      v10.8h, v8.8h, v10.8h     // v10 = c4,c4,c5,c5,c6,c6,c7,c7
    mov       v8.16b, v9.16b            // v8  = c0,c0,c1,c1,c2,c2,c3,c3

    mul       v12.8h, v4.8h , v8.8h
    mul       v16.8h, v4.8h , v10.8h
    add       v12.8h, v0.8h , v12.8h    // v12 = row-independent part, bytes 0..7
    add       v16.8h, v0.8h , v16.8h    // v16 = row-independent part, bytes 8..15

    // ---- Rows 0..7 -----------------------------------------------------------
    // Row r = sat_u8((base + v6 * c[r] + 16) >> 5), with base = v12 / v16.
    // Instructions for consecutive rows are interleaved; order is significant.
    dup       v20.8h, v22.h[0]
    mul       v4.8h, v6.8h , v20.8h
    dup       v30.8h, v22.h[1]
    mul       v18.8h, v6.8h , v20.8h
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    add       v2.8h, v12.8h , v14.8h
    sqrshrun  v28.8b, v24.8h, #5
    add       v26.8h, v16.8h , v8.8h
    sqrshrun  v29.8b, v0.8h, #5
    dup       v20.8h, v22.h[2]
    st1       {v28.8b, v29.8b}, [x1], x3        // row 0
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3        // row 1
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[3]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3        // row 2
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    dup       v20.8h, v22.h[4]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3        // row 3
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[5]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3        // row 4
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    dup       v20.8h, v22.h[6]
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    mul       v4.8h, v6.8h , v20.8h
    mul       v18.8h, v6.8h , v20.8h
    st1       {v28.8b, v29.8b}, [x1], x3        // row 5
    add       v24.8h, v12.8h , v4.8h
    add       v0.8h, v16.8h , v18.8h
    dup       v30.8h, v22.h[7]
    sqrshrun  v28.8b, v24.8h, #5
    sqrshrun  v29.8b, v0.8h, #5
    mul       v14.8h, v6.8h , v30.8h
    mul       v8.8h, v6.8h , v30.8h
    st1       {v28.8b, v29.8b}, [x1], x3        // row 6
    add       v2.8h, v12.8h , v14.8h
    add       v26.8h, v16.8h , v8.8h
    sqrshrun  v28.8b, v2.8h, #5
    sqrshrun  v29.8b, v26.8h, #5
    st1       {v28.8b, v29.8b}, [x1], x3        // row 7

end_func_plane:
    pop_v_regs
    ret