;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro PROCESS_16X2X8 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        movq            xmm2,       MMWORD PTR [rdi+16]
        punpcklqdq      xmm1,       xmm3
        punpcklqdq      xmm3,       xmm2

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm1,       xmm2
        paddw           xmm1,       xmm3
        paddw           xmm1,       xmm4
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        movq            xmm2,       MMWORD PTR [rdi+16]
        punpcklqdq      xmm5,       xmm3
        punpcklqdq      xmm3,       xmm2

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
        punpcklqdq      xmm5,       xmm3
        punpcklqdq      xmm3,       xmm2

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8
        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endmacro

%macro PROCESS_8X2X8 1
%if %1
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm1,       xmm3

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm1,       xmm2
%else
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm5,       xmm3

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endif
        movq            xmm0,       MMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        punpcklqdq      xmm5,       xmm3

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endmacro

%macro PROCESS_4X2X8 1
%if %1
        movd            xmm0,       [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm1,       xmm3

        mpsadbw         xmm1,       xmm0,  0x0
%else
        movd            xmm0,       [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm5,       xmm3

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endif
        movd            xmm0,       [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        punpcklqdq      xmm5,       xmm3

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endmacro


;void vp8_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
global sym(vp8_sad16x16x8_sse4)
sym(vp8_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)           ;src_ptr
        mov             rdi,        arg(2)           ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_16X2X8 1
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0

        mov             rdi,        arg(4)           ;Results
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad16x8x8_sse4)
sym(vp8_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)           ;src_ptr
        mov             rdi,        arg(2)           ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_16X2X8 1
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0
        PROCESS_16X2X8 0

        mov             rdi,        arg(4)           ;Results
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad8x8x8_sse4)
sym(vp8_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)           ;src_ptr
        mov             rdi,        arg(2)           ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_8X2X8 1
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0

        mov             rdi,        arg(4)           ;Results
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad8x16x8_sse4)
sym(vp8_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)           ;src_ptr
        mov             rdi,        arg(2)           ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_8X2X8 1
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        PROCESS_8X2X8 0
        mov             rdi,        arg(4)           ;Results
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_sad4x4x8_c(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
global sym(vp8_sad4x4x8_sse4)
sym(vp8_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

        mov             rsi,        arg(0)           ;src_ptr
        mov             rdi,        arg(2)           ;ref_ptr

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        PROCESS_4X2X8 1
        PROCESS_4X2X8 0

        mov             rdi,        arg(4)           ;Results
        movdqa          XMMWORD PTR [rdi],    xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret