/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 */

#ifndef ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_
#define ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_

#include "asm_support_arm64.S"

/* Parameters and result.  */
#define src1        x0
#define src2        x1
#define limit       x2
#define result      x0
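
/* Roughly, in C terms (cf. art/runtime/memcmp16.h):
 *   int32_t __memcmp16(const uint16_t* s1, const uint16_t* s2, size_t count);
 * count is in half-words; the result is the difference of the first pair of
 * mismatching half-words, or zero if none differ within count.
 */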

/* Internal variables.  */
#define data1       x3
#define data1w      w3
#define data2       x4
#define data2w      w4
#define has_nul     x5
#define diff        x6
#define endloop     x7
#define tmp1        x8
#define tmp2        x9
#define tmp3        x10
#define limit_wd    x12
#define mask        x13

// WARNING: If you change this code to use x14 and x15, you must also change
//          art_quick_string_compareto, which relies on these temps being unused.

ENTRY __memcmp16
  cbz     limit, .Lret0
  lsl     limit, limit, #1  /* Half-words to bytes.  */
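  /* If the low three address bits differ, the pointers can never become
     mutually dword-aligned, so use the simple half-word loop instead.  */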
  eor     tmp1, src1, src2
  tst     tmp1, #7
  b.ne    .Lmisaligned8
  ands    tmp1, src1, #7
  b.ne    .Lmutual_align
  add     limit_wd, limit, #7
  lsr     limit_wd, limit_wd, #3
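  /* E.g. limit = 22 bytes (11 half-words) gives limit_wd = (22 + 7) >> 3 = 3,
     i.e. the dword count rounded up.  */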
  /* Start of performance-critical section  -- one 64B cache line.  */
.Lloop_aligned:
  ldr     data1, [src1], #8
  ldr     data2, [src2], #8
.Lstart_realigned:
  subs    limit_wd, limit_wd, #1
  eor     diff, data1, data2  /* Non-zero if differences found.  */
  csinv   endloop, diff, xzr, ne  /* Last Dword or differences.  */
  cbz     endloop, .Lloop_aligned
  /* End of performance-critical section  -- one 64B cache line.  */
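  /* The csinv above yields endloop = diff while dwords remain (ne) and
     all-ones on the final dword, so the loop runs only while the data still
     match and the limit has not been reached.  */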

  /* We have not reached the limit, so we must have found a diff.  */
  cbnz    limit_wd, .Lnot_limit

  /* Limit % 8 == 0 => all bytes significant.  */
  ands    limit, limit, #7
  b.eq    .Lnot_limit

  lsl     limit, limit, #3  /* Bytes -> bits.  */
  mov     mask, #~0
  lsl     mask, mask, limit
  bic     data1, data1, mask
  bic     data2, data2, mask
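  /* E.g. with limit = 2 significant bytes left, mask = ~0 << 16 =
     0xFFFFFFFFFFFF0000, and the BICs clear everything above the final
     half-word in both data words.  */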

.Lnot_limit:
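  /* Worked example (little-endian): if the lowest differing byte is byte 2
     or 3 (half-word 1), rev moves it to byte 5 or 4, clz returns a value in
     [16, 31], and clearing the low four bits rounds that down to 16, the
     right-shift that brings half-word 1 to the bottom.  */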

  // Byte-swap diff. An exact bit reversal is not needed; we only need to
  // locate the differing half-word.
  rev     diff, diff
  // After the swap, the most significant set bit of DIFF lies in the least
  // significant differing byte of DATA1/2, so CLZ yields its bit position.
  clz     diff, diff
  // Clear the low four bits to round down to a half-word boundary; this is the
  // shift amount. (AArch64 has no BIC with immediate, hence the BFI.)
  bfi     diff, xzr, #0, #4
  // Create a 16b mask
  mov     mask, #0xFFFF
  // Shift to the right half-word.
  lsr     data1, data1, diff
  lsr     data2, data2, diff
  // Mask the lowest half-word.
  and     data1, data1, mask
  and     data2, data2, mask
  // Compute difference.
  sub     result, data1, data2
  ret

.Lmutual_align:
  /* Sources are mutually aligned, but are not currently at an
     alignment boundary.  Round down the addresses and then mask off
     the bytes that precede the start point.  */
  bic     src1, src1, #7
  bic     src2, src2, #7
  add     limit, limit, tmp1  /* Adjust the limit for the extra leading bytes.  */
  lsl     tmp1, tmp1, #3    /* Bytes beyond alignment -> bits.  */
  ldr     data1, [src1], #8
  neg     tmp1, tmp1    /* tmp1 = -(bits past alignment); shifts take it mod 64, i.e. 64 - bits.  */
  ldr     data2, [src2], #8
  mov     tmp2, #~0
  /* Little-endian.  Early bytes are at LSB.  */
  lsr     tmp2, tmp2, tmp1  /* Shift (tmp1 & 63).  */
  add     limit_wd, limit, #7
  orr     data1, data1, tmp2
  orr     data2, data2, tmp2
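  /* E.g. with src1 & 7 == 2: tmp1 = -16, tmp2 = ~0 >> 48 = 0xFFFF, and the
     ORRs force the two bytes before the start point to all-ones in both
     words, so they cannot produce a spurious difference.  */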
  lsr     limit_wd, limit_wd, #3
  b       .Lstart_realigned

.Lret0:
  mov     result, #0
  ret

  .p2align 6
.Lmisaligned8:
  sub     limit, limit, #1
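  /* The bias above makes the subs below clear the carry flag exactly on the
     final half-word, so the ccmp forces Z clear and the loop exits; the
     trailing sub then yields the difference of that last pair (zero if they
     match).  */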
1:
  /* Perhaps we can do better than this.  */
  ldrh    data1w, [src1], #2
  ldrh    data2w, [src2], #2
  subs    limit, limit, #2
  ccmp    data1w, data2w, #0, cs  /* If cs (limit left), compare; else set NZCV = 0b0000, clearing Z to exit.  */
  b.eq    1b
  sub     result, data1, data2
  ret
END __memcmp16

#endif  // ART_RUNTIME_ARCH_ARM64_MEMCMP16_ARM64_S_