; AesOpt.asm -- AES optimized code for Intel's AES (AES-NI) instructions.
; 2009-12-12 : Igor Pavlov : Public domain

include 7zAsm.asm
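; 7zAsm.asm provides the register aliases used below (r0..r6 for native-size
; registers, x0..x6 for their 32-bit forms), REG_SIZE, and the MY_ASM_START /
; MY_PROC / MY_ENDP macros.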

MY_ASM_START

ifndef x64
    .xmm
endif

ifdef x64
    num     equ r8
else
    num     equ [r4 + REG_SIZE * 4]
endif
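
; num: number of 16-byte blocks to process (3rd argument) -- passed in r8 on x64,
; read from the stack on x86 (after the return address and the three registers
; pushed in MY_PROLOG).
; rD: data pointer (2nd argument); rN: counter of remaining blocks.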

rD equ r2
rN equ r0

MY_PROLOG macro reg:req
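    ; Save the non-volatile xmm6/xmm7 (x64: in the caller's shadow space) and the
    ; callee-saved GPRs, load the block count into rN, scale the value at [r1 + 16]
    ; by 32 into x6 (the distance in bytes that the round loops step through the
    ; key schedule), load the IV / counter block from [r1] into 'reg', and advance
    ; r1 past the 32-byte header to the round keys.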
    ifdef x64
    movdqa  [r4 + 8], xmm6
    movdqa  [r4 + 8 + 16], xmm7
    endif

    push    r3
    push    r5
    push    r6

    mov     rN, num
    mov     x6, [r1 + 16]
    shl     x6, 5

    movdqa  reg, [r1]
    add     r1, 32
endm

MY_EPILOG macro
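    ; Restore the saved registers in reverse order; MY_ENDP closes the procedure.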
    pop     r6
    pop     r5
    pop     r3

    ifdef x64
    movdqa  xmm6, [r4 + 8]
    movdqa  xmm7, [r4 + 8 + 16]
    endif

    MY_ENDP
endm

ways equ 4
ways16 equ (ways * 16)

OP_W macro op, op2
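    ; Expand 'op' over xmm0..xmm(ways-1). 'op2' may reference the repeat counter i,
    ; e.g. OP_W movdqa, [rD + i * 16] loads 'ways' consecutive blocks.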
    i = 0
    rept ways
    op @CatStr(xmm,%i), op2
    i = i + 1
    endm
endm

LOAD_OP macro op:req, offs:req
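    ; Apply 'op' to xmm0 with the round key at [r1 + r3 + offs] ('offs' carries its own sign).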
    op      xmm0, [r1 + r3 offs]
endm
  
LOAD_OP_W macro op:req, offs:req
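    ; Load the round key into xmm7 and apply 'op' with it to all 'ways' block registers.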
    movdqa  xmm7, [r1 + r3 offs]
    OP_W    op, xmm7
endm


; ---------- AES-CBC Decode ----------
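; CBC decryption: P[i] = Decrypt(C[i]) xor C[i-1], with C[-1] = IV. Each block can be
; decrypted independently, so the main loop runs 'ways' blocks through the AES rounds
; in parallel. xmm6 carries the chaining value (the previous ciphertext block).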

CBC_DEC_UPDATE macro reg, offs
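    ; Xor the decrypted block with the chaining value, keep the original ciphertext
    ; (still at [rD + offs]) as the next chaining value, and store the plaintext in place.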
    pxor    reg, xmm6
    movdqa  xmm6, [rD + offs]
    movdqa  [rD + offs], reg
endm

DECODE macro op:req
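    ; Run the AES decryption rounds: aesdec with successive round keys, two per loop
    ; pass (the key index steps down by 32 bytes), finishing with aesdeclast.
    ; 'op' is LOAD_OP for a single block or LOAD_OP_W for 'ways' blocks.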
    op      aesdec, +16
  @@:
    op      aesdec, +0
    op      aesdec, -16
    sub     x3, 32
    jnz     @B
    op      aesdeclast, +0
endm

MY_PROC AesCbc_Decode_Intel, 3
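    ; r1 = AES context: IV in the first 16 bytes, a rounds value at offset 16,
    ; round keys from offset 32. The final chaining value is written back to the
    ; IV field ([r1 - 32] after the prolog has advanced r1).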
    MY_PROLOG xmm6

    sub     x6, 32

    jmp     check2

  align 16
  nextBlocks2:
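    ; Wide loop: decrypt 'ways' blocks per pass.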
    mov     x3, x6
    OP_W    movdqa, [rD + i * 16]
    LOAD_OP_W  pxor, +32
    DECODE  LOAD_OP_W
    OP_W    CBC_DEC_UPDATE, i * 16
    add     rD, ways16
  check2:
    sub     rN, ways
    jnc     nextBlocks2

    add     rN, ways
    jmp     check

  nextBlock:
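    ; Tail loop: decrypt the remaining blocks one at a time.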
    mov     x3, x6
    movdqa  xmm1, [rD]
    LOAD_OP movdqa, +32
    pxor    xmm0, xmm1
    DECODE  LOAD_OP
    pxor    xmm0, xmm6
    movdqa  [rD], xmm0
    movdqa  xmm6, xmm1
    add     rD, 16
  check:
    sub     rN, 1
    jnc     nextBlock

    movdqa  [r1 - 32], xmm6
    MY_EPILOG


; ---------- AES-CBC Encode ----------
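; CBC encryption: C[i] = Encrypt(P[i] xor C[i-1]), with C[-1] = IV. Each block depends
; on the previous ciphertext, so blocks are processed one at a time; xmm0 holds the
; chaining value.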

ENCODE macro op:req
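    ; Run the AES encryption rounds: aesenc with successive round keys, two per loop
    ; pass (the key index starts negative and steps up by 32 bytes toward zero),
    ; finishing with aesenclast.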
    op      aesenc, -16
  @@:
    op      aesenc, +0
    op      aesenc, +16
    add     r3, 32
    jnz     @B
    op      aesenclast, +0
endm

MY_PROC AesCbc_Encode_Intel, 3
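    ; r1 is advanced by x6 (to the upper end of the key schedule) and r6 becomes
    ; 32 - x6, so ENCODE's round index counts up to zero. The final chaining value
    ; (the next IV) is stored back into the context at the end.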
    MY_PROLOG xmm0

    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check_e

  align 16
  nextBlock_e:
    mov     r3, r6
    pxor    xmm0, [rD]
    pxor    xmm0, [r1 + r3 - 32]
    ENCODE  LOAD_OP
    movdqa  [rD], xmm0
    add     rD, 16
  check_e:
    sub     rN, 1
    jnc     nextBlock_e

    movdqa  [r1 + r6 - 64], xmm0
    MY_EPILOG


; ---------- AES-CTR ----------
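; CTR mode: out[i] = in[i] xor Encrypt(counter), where the counter in xmm6 is
; incremented by 1 (a 64-bit add on its low half) before each block. Encoding and
; decoding are the same operation. XOR_UPD_1 xors the encrypted counter (keystream)
; with the data block; XOR_UPD_2 stores the result back over the data.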

XOR_UPD_1 macro reg, offs
    pxor    reg, [rD + offs]
endm

XOR_UPD_2 macro reg, offs
    movdqa  [rD + offs], reg
endm

MY_PROC AesCtr_Code_Intel, 3
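    ; A 16-byte aligned scratch slot just below the stack pointer holds the 128-bit
    ; increment constant; the updated counter is written back to the context at the end.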
    MY_PROLOG xmm6

    mov     r5, r4
    shr     r5, 4
    dec     r5
    shl     r5, 4

    mov     DWORD PTR [r5], 1
    mov     DWORD PTR [r5 + 4], 0
    mov     DWORD PTR [r5 + 8], 0
    mov     DWORD PTR [r5 + 12], 0
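    ; [r5] now holds the 128-bit constant 1, used by paddq to step the counter.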
    
    add     r1, r6
    neg     r6
    add     r6, 32

    jmp     check2_c

  align 16
  nextBlocks2_c:
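    ; Wide loop: generate 'ways' incremented counters, encrypt them, and xor the
    ; keystream into the data.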
    movdqa  xmm7, [r5]

    i = 0
    rept ways
    paddq   xmm6, xmm7
    movdqa  @CatStr(xmm,%i), xmm6
    i = i + 1
    endm

    mov     r3, r6
    LOAD_OP_W  pxor, -32
    ENCODE  LOAD_OP_W
    OP_W    XOR_UPD_1, i * 16
    OP_W    XOR_UPD_2, i * 16
    add     rD, ways16
  check2_c:
    sub     rN, ways
    jnc     nextBlocks2_c

    add     rN, ways
    jmp     check_c

  nextBlock_c:
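    ; Tail loop: one block at a time.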
    paddq   xmm6, [r5]
    mov     r3, r6
    movdqa  xmm0, [r1 + r3 - 32]
    pxor    xmm0, xmm6
    ENCODE  LOAD_OP
    XOR_UPD_1 xmm0, 0
    XOR_UPD_2 xmm0, 0
    add     rD, 16
  check_c:
    sub     rN, 1
    jnc     nextBlock_c

    movdqa  [r1 + r6 - 64], xmm6
    MY_EPILOG

end