/* Tests e-vs-i or i-vs-m aspects for pcmp{e,i}str{i,m}.  Does not
   check the core arithmetic in any detail. This file checks the 16-bit
   character versions (w is for wide) */

#include <string.h>
#include <stdio.h>
#include <assert.h>

/* Short aliases for the primitive types used by this test. */
typedef unsigned char       UChar;
typedef unsigned int        UInt;
typedef signed int          Int;
typedef unsigned long long  ULong;

/* A 128-bit vector held as 16 raw bytes. */
typedef unsigned char       V128[16];

/* Boolean stored in a single byte. */
typedef UChar               Bool;
#define True  ((Bool)1)
#define False ((Bool)0)

/* Print the 16 bytes of *vec to stdout as 32 lower-case hex digits,
   starting with the byte at index 15 and ending with the byte at
   index 0.  No separator or newline is emitted. */
void show_V128 ( V128* vec )
{
   const unsigned char* bytes = &(*vec)[0];
   Int left = 16;

   /* 'left' is decremented before use, so it indexes 15 down to 0. */
   while (left-- > 0)
      printf("%02x", (UInt)bytes[left]);
}

/* Build a 16-byte vector from a 16-character string of hex digits.

   Each digit is duplicated into both nibbles of one byte, so 'a'
   yields 0xaa and '0' yields 0x00.  The string is read like a written
   hex number: its last character produces byte 0 of *dst and its first
   character produces byte 15.

   Asserts that the string is exactly 16 characters long and that every
   character is a hex digit (either case). */
void expand ( V128* dst, char* summary )
{
   Int pos;
   assert( strlen(summary) == 16 );

   /* Walk the string from its last character to its first so that the
      destination is filled from byte 0 upwards. */
   for (pos = 15; pos >= 0; pos--) {
      UChar ch     = summary[pos];
      UChar nibble = 0;

      if (ch >= 'a' && ch <= 'f')
         nibble = ch - 'a' + 10;
      else if (ch >= 'A' && ch <= 'F')
         nibble = ch - 'A' + 10;
      else if (ch >= '0' && ch <= '9')
         nibble = ch - '0';
      else
         assert(0);   /* not a hex digit */

      assert(nibble <= 0xF);
      (*dst)[15 - pos] = (UChar)((nibble << 4) | nibble);
   }
}

void one_test ( char* summL, ULong rdxIN, char* summR, ULong raxIN )
{
   V128 argL, argR;
   expand( &argL, summL );
   expand( &argR, summR );
   printf("\n");
   printf("rdx %016llx  argL ", rdxIN);
   show_V128(&argL);
   printf("  rax %016llx  argR ", raxIN);
   show_V128(&argR);
   printf("\n");

   ULong block[ 2/*in:argL*/          // 0  0
                + 2/*in:argR*/        // 2  16
                + 1/*in:rdx*/         // 4  32
                + 1/*in:rax*/         // 5  40
                + 2/*inout:xmm0*/     // 6  48
                + 1/*inout:rcx*/      // 8  64
                + 1/*out:rflags*/ ];  // 9  72
   assert(sizeof(block) == 80);

   UChar* blockC = (UChar*)&block[0];

   /* ---------------- ISTRI_4B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpistri $0x4B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  istri $0x4B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ISTRI_0B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpistri $0x0B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  istri $0x0B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ISTRM_4B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpistrm $0x4B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  istrm $0x4B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ISTRM_0B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpistrm $0x0B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  istrm $0x0B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ESTRI_4B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpestri $0x4B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  estri $0x4B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ESTRI_0B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpestri $0x0B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  estri $0x0B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ESTRM_4B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpestrm $0x4B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  estrm $0x4B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);

   /* ---------------- ESTRM_0B ---------------- */
   memset(blockC, 0x55, 80);
   memcpy(blockC + 0,  &argL,  16);
   memcpy(blockC + 16, &argR,  16);
   memcpy(blockC + 24, &rdxIN, 8);
   memcpy(blockC + 32, &raxIN, 8);
   memcpy(blockC + 40, &rdxIN, 8);
   __asm__ __volatile__(
      "movupd    0(%0), %%xmm2"           "\n\t"
      "movupd    16(%0), %%xmm13"         "\n\t"
      "movq      32(%0), %%rdx"           "\n\t"
      "movq      40(%0), %%rax"           "\n\t"
      "movupd    48(%0), %%xmm0"          "\n\t"
      "movw      64(%0), %%cx"            "\n\t"
      "pcmpestrm $0x0B, %%xmm2, %%xmm13"  "\n\t"
      "movupd    %%xmm0, 48(%0)"          "\n\t"
      "movw      %%cx, 64(%0)"            "\n\t"
      "pushfq"                            "\n\t"
      "popq      %%r15"                   "\n\t"
      "movq      %%r15, 72(%0)"           "\n\t"
      : /*out*/ 
      : /*in*/"r"(blockC) 
      : /*trash*/"memory","cc","xmm2","xmm13","xmm0","rdx","rax","rcx","r15"
   );
   printf("  estrm $0x0B:  ");
   printf("    xmm0 ");
   show_V128( (V128*)(blockC+48) );
   printf("  rcx %016llx  flags %08llx\n", block[8], block[9] & 0x8D5);




}

/* Drive one_test over a fixed table of inputs.  The first two rows vary
   the vector contents; the remaining rows keep both vectors at all-'a'
   and vary the rdx and rax inputs, including negative values and
   values around 16. */
int main ( void )
{
   static const struct {
      char*     summL;
      long long rdx;
      char*     summR;
      long long rax;
   } cases[] = {
      { "aaaaaaaaaaaaaaaa",   0, "aaaaaaaa00aaaaaa",   0 },
      { "0000000000000000",   0, "aaaaaaaa00aaaaaa",   0 },

      { "aaaaaaaaaaaaaaaa",   0, "aaaaaaaaaaaaaaaa",   0 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",   0 },
      { "aaaaaaaaaaaaaaaa",   0, "aaaaaaaaaaaaaaaa",   6 },

      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  15 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  16 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  17 },

      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",  -6 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa", -15 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa", -16 },
      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa", -17 },

      { "aaaaaaaaaaaaaaaa",   5, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa",  15, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa",  16, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa",  17, "aaaaaaaaaaaaaaaa",   6 },

      { "aaaaaaaaaaaaaaaa",  -5, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa", -15, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa", -16, "aaaaaaaaaaaaaaaa",   6 },
      { "aaaaaaaaaaaaaaaa", -17, "aaaaaaaaaaaaaaaa",   6 },
   };
   unsigned long k;

   /* Negative table entries wrap to large unsigned 64-bit values when
      converted, exactly as a negative int argument would. */
   for (k = 0; k < sizeof(cases) / sizeof(cases[0]); k++) {
      one_test( cases[k].summL, (unsigned long long)cases[k].rdx,
                cases[k].summR, (unsigned long long)cases[k].rax );
   }

   return 0;
}