/* C++ program  |  1432 lines  |  84.02 KB */

#include <stdio.h>
#include <string.h>

/* Number of elements in every test-vector array below. */
#define N 64

/*
 * Single-precision test vectors, stored as parallel arrays indexed by
 * test case.  The object is 32-byte aligned and each array occupies a
 * multiple of 32 bytes, so the aligned 128-bit and 256-bit vmovaps
 * accesses in test_fmaf (element index advancing by 4 or 8) stay aligned.
 */
struct float_test {
   float x[N];         /* first multiplicand */
   float y[N];         /* second multiplicand */
   float z[N];         /* addend; test_fmaf negates it in place between passes */
   float expected[N];  /* reference values the computed results are compared against */
   float res[N];       /* scratch area the packed instructions store their results to */
} ft __attribute__((aligned (32)));

/*
 * Double-precision test vectors with the same layout as struct float_test:
 * five parallel arrays of N elements in a 32-byte aligned object.
 * NOTE(review): the code using dt is outside the visible part of the file;
 * presumably the members play the same roles as in ft -- confirm.
 */
struct double_test {
   double x[N];
   double y[N];
   double z[N];
   double expected[N];
   double res[N];
} dt __attribute__((aligned (32)));

/*
 * Special single-precision values.  They are not assigned anywhere in the
 * visible part of the file; presumably they are set up elsewhere before
 * use -- TODO confirm.
 */
float plus_zero;
float plus_infty;
float minus_infty;
float nan_value;

/*
 * Compare two floats by their bit patterns.
 *
 * Returns 0 when the values are considered equal and 1 otherwise:
 *   - if x has all bits of 0x7fc00000 set (exponent all ones plus the top
 *     mantissa bit, i.e. a quiet NaN), y only needs to have those same
 *     bits set; sign and remaining mantissa bits are ignored;
 *   - otherwise x and y must be bit-for-bit identical, so +0.0 and -0.0
 *     are reported as different.
 */
static int testf( float x, float y )
{
   const unsigned int qnan_mask = 0x7fc00000U;
   unsigned int xbits, ybits;

   /* memcpy gives the raw representation without aliasing problems. */
   memcpy( &xbits, &x, sizeof (xbits) );
   memcpy( &ybits, &y, sizeof (ybits) );

   /* Ordinary case: demand an exact bit match. */
   if ((xbits & qnan_mask) != qnan_mask)
      return xbits != ybits;

   /* x is a quiet NaN: any quiet NaN in y is accepted. */
   return (ybits & qnan_mask) != qnan_mask;
}

static int test_fmaf( void )
{
   int res = 0, i, j;
   float w;
   for (i = 0; i < N; i++) {
      int thisres = 0;
      __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      if (thisres)
         printf( "Failure 1 %d %a %a\n", i, w, ft.expected[i] );
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      if (thisres)
         printf( "Failure 2 %d %a %a\n", i, w, ft.expected[i] );
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i++) {
      int thisres = 0;
      __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmsub132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmsub213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      __asm __volatile__ ("vfmsub231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( w, ft.expected[i] );
      if (thisres)
         printf( "Failure 3 %d %a %a\n", i, w, ft.expected[i] );
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmadd132ss %2, %3, %0" : "=x" (w) : "0" (ft.x[i]), "m" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "x" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmadd213ss %3, %2, %0" : "=x" (w) : "0" (ft.x[i]), "x" (ft.y[i]), "m" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "x" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      __asm __volatile__ ("vfnmadd231ss %2, %1, %0" : "=x" (w) : "x" (ft.x[i]), "m" (ft.y[i]), "0" (ft.z[i]));
      thisres |= testf( -w, ft.expected[i] );
      if (thisres)
         printf( "Failure 4 %d %a %a\n", i, w, ft.expected[i] );
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
                          "vfmadd132ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
                          "vfmadd132ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
                          "vfmadd213ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
                          "vfmadd213ps (%3), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
                          "vfmadd231ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
                          "vfmadd231ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 5 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
                          "vfnmsub132ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
                          "vfnmsub132ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
                          "vfnmsub213ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
                          "vfnmsub213ps (%3), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
                          "vfnmsub231ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
                          "vfnmsub231ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 6 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
                          "vfmsub132ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
                          "vfmsub132ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
                          "vfmsub213ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
                          "vfmsub213ps (%3), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
                          "vfmsub231ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
                          "vfmsub231ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 7 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
                          "vfnmadd132ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
                          "vfnmadd132ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
                          "vfnmadd213ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
                          "vfnmadd213ps (%3), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
                          "vfnmadd231ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
                          "vfnmadd231ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 8 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
                          "vfmaddsub132ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
                          "vfmaddsub132ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
                          "vfmaddsub213ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
                          "vfmaddsub213ps (%3), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
                          "vfmaddsub231ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
                          "vfmaddsub231ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 9 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%3), %%xmm8;"
                          "vfmsubadd132ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm8;"
                          "vfmsubadd132ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%3), %%xmm7; vmovaps (%2), %%xmm8;"
                          "vfmsubadd213ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%xmm9; vmovaps (%2), %%xmm8;"
                          "vfmsubadd213ps (%3), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%2), %%xmm7; vmovaps (%1), %%xmm8;"
                          "vfmsubadd231ps %%xmm7, %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%xmm9; vmovaps (%1), %%xmm8;"
                          "vfmsubadd231ps (%2), %%xmm8, %%xmm9;"
                          "vmovaps %%xmm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 10 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 8) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
                          "vfmadd132ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
                          "vfmadd132ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
                          "vfmadd213ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
                          "vfmadd213ps (%3), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
                          "vfmadd231ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
                          "vfmadd231ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 11 %d", i );
         for (j = 0; j < 8; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
                          "vfnmsub132ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
                          "vfnmsub132ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
                          "vfnmsub213ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
                          "vfnmsub213ps (%3), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
                          "vfnmsub231ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
                          "vfnmsub231ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 12 %d", i );
         for (j = 0; j < 8; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 8) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
                          "vfmsub132ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
                          "vfmsub132ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
                          "vfmsub213ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
                          "vfmsub213ps (%3), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
                          "vfmsub231ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
                          "vfmsub231ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 13 %d", i );
         for (j = 0; j < 8; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
                          "vfnmadd132ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
                          "vfnmadd132ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
                          "vfnmadd213ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
                          "vfnmadd213ps (%3), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
                          "vfnmadd231ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
                          "vfnmadd231ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( -ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 14 %d", i );
         for (j = 0; j < 8; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 8) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
                          "vfmaddsub132ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
                          "vfmaddsub132ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
                          "vfmaddsub213ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
                          "vfmaddsub213ps (%3), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
                          "vfmaddsub231ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
                          "vfmaddsub231ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 15 %d", i );
         for (j = 0; j < 8; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      ft.z[i] = -ft.z[i];
   for (i = 0; i < N; i += 8) {
      int thisres = 0;
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%3), %%ymm8;"
                          "vfmsubadd132ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm8;"
                          "vfmsubadd132ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%3), %%ymm7; vmovaps (%2), %%ymm8;"
                          "vfmsubadd213ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%1), %%ymm9; vmovaps (%2), %%ymm8;"
                          "vfmsubadd213ps (%3), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%2), %%ymm7; vmovaps (%1), %%ymm8;"
                          "vfmsubadd231ps %%ymm7, %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      __asm __volatile__ ("vmovaps (%3), %%ymm9; vmovaps (%1), %%ymm8;"
                          "vfmsubadd231ps (%2), %%ymm8, %%ymm9;"
                          "vmovaps %%ymm9, (%0)" : : "r" (&ft.res[i]), "r" (&ft.x[i]),
                                                     "r" (&ft.y[i]), "r" (&ft.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 8; j++)
         thisres |= testf( ft.res[i+j], ft.expected[i+j] );
      if (thisres) {
         printf( "Failure 16 %d", i );
         for (j = 0; j < 8; j++)
            printf( " %a %a", ft.res[i+j], ft.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      ft.z[i] = -ft.z[i];
   return res;
}

/*
 * Compare a computed double against its expected value by bit pattern.
 *
 * Returns 0 when the two values are considered equal, 1 otherwise.
 *
 * If x has every bit of 0x7ff8000000000000 set (all exponent bits plus the
 * top mantissa bit, i.e. the quiet-NaN pattern of IEEE 754 binary64), then
 * y only has to carry the same bits: the sign and the remaining mantissa
 * bits are ignored.  Every other x must match y bit for bit, so +0.0 and
 * -0.0 count as different.
 *
 * Assumes double and unsigned long long have the same size.
 */
static int test( double x, double y )
{
   const unsigned long long qnan_mask = 0x7ff8000000000000ULL;
   unsigned long long xbits, ybits;

   /* Copy out the raw representations; memcpy avoids aliasing problems. */
   memcpy( &xbits, &x, sizeof (xbits) );
   memcpy( &ybits, &y, sizeof (ybits) );

   /* x is not a quiet NaN: demand an exact bitwise match. */
   if ((xbits & qnan_mask) != qnan_mask)
      return xbits != ybits;

   /* x is a quiet NaN: any quiet NaN in y is acceptable. */
   return (ybits & qnan_mask) != qnan_mask;
}

static int test_fma( void )
{
   int res = 0, i, j;
   double w;
   for (i = 0; i < N; i++) {
      int thisres = 0;
      __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      if (thisres)
         printf( "Failure 1 %d %a %a\n", i, w, dt.expected[i] );
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      if (thisres)
         printf( "Failure 2 %d %a %a\n", i, w, dt.expected[i] );
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i++) {
      int thisres = 0;
      __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmsub132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmsub213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      __asm __volatile__ ("vfmsub231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( w, dt.expected[i] );
      if (thisres)
         printf( "Failure 3 %d %a %a\n", i, w, dt.expected[i] );
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmadd132sd %2, %3, %0" : "=x" (w) : "0" (dt.x[i]), "m" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "x" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmadd213sd %3, %2, %0" : "=x" (w) : "0" (dt.x[i]), "x" (dt.y[i]), "m" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "x" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      __asm __volatile__ ("vfnmadd231sd %2, %1, %0" : "=x" (w) : "x" (dt.x[i]), "m" (dt.y[i]), "0" (dt.z[i]));
      thisres |= test( -w, dt.expected[i] );
      if (thisres)
         printf( "Failure 4 %d %a %a\n", i, w, dt.expected[i] );
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 2) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
                          "vfmadd132pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
                          "vfmadd132pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
                          "vfmadd213pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
                          "vfmadd213pd (%3), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
                          "vfmadd231pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
                          "vfmadd231pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 5 %d", i );
         for (j = 0; j < 2; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
                          "vfnmsub132pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
                          "vfnmsub132pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
                          "vfnmsub213pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
                          "vfnmsub213pd (%3), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
                          "vfnmsub231pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
                          "vfnmsub231pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 6 %d", i );
         for (j = 0; j < 2; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 2) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
                          "vfmsub132pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
                          "vfmsub132pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
                          "vfmsub213pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
                          "vfmsub213pd (%3), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
                          "vfmsub231pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
                          "vfmsub231pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 7 %d", i );
         for (j = 0; j < 2; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
                          "vfnmadd132pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
                          "vfnmadd132pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
                          "vfnmadd213pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
                          "vfnmadd213pd (%3), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
                          "vfnmadd231pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
                          "vfnmadd231pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 8 %d", i );
         for (j = 0; j < 2; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 2) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
                          "vfmaddsub132pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
                          "vfmaddsub132pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
                          "vfmaddsub213pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
                          "vfmaddsub213pd (%3), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
                          "vfmaddsub231pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
                          "vfmaddsub231pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 9 %d", i );
         for (j = 0; j < 2; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 2) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%3), %%xmm8;"
                          "vfmsubadd132pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm8;"
                          "vfmsubadd132pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%3), %%xmm7; vmovapd (%2), %%xmm8;"
                          "vfmsubadd213pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%xmm9; vmovapd (%2), %%xmm8;"
                          "vfmsubadd213pd (%3), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%2), %%xmm7; vmovapd (%1), %%xmm8;"
                          "vfmsubadd231pd %%xmm7, %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%xmm9; vmovapd (%1), %%xmm8;"
                          "vfmsubadd231pd (%2), %%xmm8, %%xmm9;"
                          "vmovapd %%xmm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 2; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 10 %d", i );
         for (j = 0; j < 2; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
                          "vfmadd132pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
                          "vfmadd132pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
                          "vfmadd213pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
                          "vfmadd213pd (%3), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
                          "vfmadd231pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
                          "vfmadd231pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 11 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
                          "vfnmsub132pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
                          "vfnmsub132pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
                          "vfnmsub213pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
                          "vfnmsub213pd (%3), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
                          "vfnmsub231pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
                          "vfnmsub231pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 12 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
                          "vfmsub132pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
                          "vfmsub132pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
                          "vfmsub213pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
                          "vfmsub213pd (%3), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
                          "vfmsub231pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
                          "vfmsub231pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 13 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
      thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
                          "vfnmadd132pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
                          "vfnmadd132pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
                          "vfnmadd213pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
                          "vfnmadd213pd (%3), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
                          "vfnmadd231pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
                          "vfnmadd231pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( -dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 14 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
                          "vfmaddsub132pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
                          "vfmaddsub132pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
                          "vfmaddsub213pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
                          "vfmaddsub213pd (%3), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
                          "vfmaddsub231pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
                          "vfmaddsub231pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 15 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 0; i < N; i++)
      dt.z[i] = -dt.z[i];
   for (i = 0; i < N; i += 4) {
      int thisres = 0;
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%3), %%ymm8;"
                          "vfmsubadd132pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm8;"
                          "vfmsubadd132pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%3), %%ymm7; vmovapd (%2), %%ymm8;"
                          "vfmsubadd213pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%1), %%ymm9; vmovapd (%2), %%ymm8;"
                          "vfmsubadd213pd (%3), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%2), %%ymm7; vmovapd (%1), %%ymm8;"
                          "vfmsubadd231pd %%ymm7, %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      __asm __volatile__ ("vmovapd (%3), %%ymm9; vmovapd (%1), %%ymm8;"
                          "vfmsubadd231pd (%2), %%ymm8, %%ymm9;"
                          "vmovapd %%ymm9, (%0)" : : "r" (&dt.res[i]), "r" (&dt.x[i]),
                                                     "r" (&dt.y[i]), "r" (&dt.z[i]) : "xmm7", "xmm8", "xmm9");
      for (j = 0; j < 4; j++)
         thisres |= test( dt.res[i+j], dt.expected[i+j] );
      if (thisres) {
         printf( "Failure 16 %d", i );
         for (j = 0; j < 4; j++)
            printf( " %a %a", dt.res[i+j], dt.expected[i+j] );
         printf( "\n" );
      }
      res |= thisres;
   }
   for (i = 1; i < N; i += 2)
      dt.z[i] = -dt.z[i];
   return res;
}

/*
 * Test driver.
 *
 * Builds the special floating-point values used by the tables, fills the
 * global single-precision table `ft` and runs test_fmaf(), then fills the
 * global double-precision table `dt` and runs test_fma().
 *
 * Each table row is (x, y, z, expected) where expected is the fused result
 * x * y + z.  Rows that are not filled in here keep the zero value the
 * file-scope objects start with.
 *
 * Prints "Testing successful" when every check passed.
 * Returns 0 on success and 1 if any check failed, so that a failure is
 * visible in the process exit status as well as in the output.
 */
int main( void )
{
   int res = 0;
   int i = 0;
   plus_zero = 0.0;
   /* Compiler barrier: the "memory" clobber makes the compiler treat
      plus_zero as possibly modified, so the division below is presumably
      meant to be evaluated at run time instead of being constant-folded. */
   __asm __volatile__ ("" : : "r" (&plus_zero) : "memory");
   nan_value = plus_zero / plus_zero;
   /* Largest finite float scaled by 16: intended to overflow to +infinity. */
   plus_infty = 3.40282346638528859812e+38F * 16.0F;
   minus_infty = -plus_infty;
/* Store one single-precision test row in slot i of ft and advance i. */
#define TEST_F( a, b, c, d ) \
   do {                         \
      ft.x[i] = a;              \
      ft.y[i] = b;              \
      ft.z[i] = c;              \
      ft.expected[i] = d;       \
      i++;                      \
   } while (0)
   /* Plain value and NaN propagation. */
   TEST_F( 1.0, 2.0, 3.0, 5.0 );
   TEST_F( nan_value, 2.0, 3.0, nan_value );
   TEST_F( 1.0, nan_value, 3.0, nan_value );
   TEST_F( 1.0, 2.0, nan_value, nan_value );
   /* Invalid operations: infinity * 0 and infinity - infinity. */
   TEST_F( plus_infty, 0.0, nan_value, nan_value );
   TEST_F( minus_infty, 0.0, nan_value, nan_value );
   TEST_F( 0.0, plus_infty, nan_value, nan_value );
   TEST_F( 0.0, minus_infty, nan_value, nan_value );
   TEST_F( plus_infty, 0.0, 1.0, nan_value );
   TEST_F( minus_infty, 0.0, 1.0, nan_value );
   TEST_F( 0.0, plus_infty, 1.0, nan_value );
   TEST_F( 0.0, minus_infty, 1.0, nan_value );
   TEST_F( plus_infty, plus_infty, minus_infty, nan_value );
   TEST_F( minus_infty, plus_infty, plus_infty, nan_value );
   TEST_F( plus_infty, minus_infty, plus_infty, nan_value );
   TEST_F( minus_infty, minus_infty, minus_infty, nan_value );
   TEST_F( plus_infty, 3.5L, minus_infty, nan_value );
   TEST_F( minus_infty, -7.5L, minus_infty, nan_value );
   TEST_F( -13.5L, plus_infty, plus_infty, nan_value );
   TEST_F( minus_infty, 7.5L, plus_infty, nan_value );
   /* Exact result, then cases where an infinite addend decides the result. */
   TEST_F( 1.25L, 0.75L, 0.0625L, 1.0L );
   TEST_F( -3.40282346638528859812e+38F, -3.40282346638528859812e+38F, minus_infty, minus_infty );
   TEST_F( 3.40282346638528859812e+38F / 2, 3.40282346638528859812e+38F / 2, minus_infty, minus_infty );
   TEST_F( -3.40282346638528859812e+38F, 3.40282346638528859812e+38F, plus_infty, plus_infty );
   TEST_F( 3.40282346638528859812e+38F / 2, -3.40282346638528859812e+38F / 4, plus_infty, plus_infty );
   TEST_F( plus_infty, 4, plus_infty, plus_infty );
   TEST_F( 2, minus_infty, minus_infty, minus_infty );
   TEST_F( minus_infty, minus_infty, plus_infty, plus_infty );
   TEST_F( plus_infty, minus_infty, minus_infty, minus_infty );
   /* Finite operands given as hexadecimal float literals. */
   TEST_F( 0x1.7ff8p+13, 0x1.000002p+0, 0x1.ffffp-24, 0x1.7ff802p+13 );
   TEST_F( 0x1.fffp+0, 0x1.00001p+0, -0x1.fffp+0, 0x1.fffp-20 );
   TEST_F( 0x1.9abcdep+127, 0x0.9abcdep-126, -0x1.f08948p+0, 0x1.bb421p-25 );
   TEST_F( 0x1.9abcdep+100, 0x0.9abcdep-126, -0x1.f08948p-27, 0x1.bb421p-52 );
   TEST_F( 0x1.fffffep+127, 0x1.001p+0, -0x1.fffffep+127, 0x1.fffffep+115 );
   TEST_F( -0x1.fffffep+127, 0x1.fffffep+0, 0x1.fffffep+127, -0x1.fffffap+127 );
   TEST_F( 0x1.fffffep+127, 2.0, -0x1.fffffep+127, 0x1.fffffep+127 );

   res |= test_fmaf( );
   /* Restart at slot 0 for the double-precision table. */
   i = 0;
/* Store one double-precision test row in slot i of dt and advance i. */
#define TEST( a, b, c, d ) \
   do {                         \
      dt.x[i] = a;              \
      dt.y[i] = b;              \
      dt.z[i] = c;              \
      dt.expected[i] = d;       \
      i++;                      \
   } while (0)
   /* Plain value and NaN propagation. */
   TEST( 1.0, 2.0, 3.0, 5.0 );
   TEST( nan_value, 2.0, 3.0, nan_value );
   TEST( 1.0, nan_value, 3.0, nan_value );
   TEST( 1.0, 2.0, nan_value, nan_value );
   /* Invalid operations: infinity * 0 and infinity - infinity. */
   TEST( plus_infty, 0.0, nan_value, nan_value );
   TEST( minus_infty, 0.0, nan_value, nan_value );
   TEST( 0.0, plus_infty, nan_value, nan_value );
   TEST( 0.0, minus_infty, nan_value, nan_value );
   TEST( plus_infty, 0.0, 1.0, nan_value );
   TEST( minus_infty, 0.0, 1.0, nan_value );
   TEST( 0.0, plus_infty, 1.0, nan_value );
   TEST( 0.0, minus_infty, 1.0, nan_value );
   TEST( plus_infty, plus_infty, minus_infty, nan_value );
   TEST( minus_infty, plus_infty, plus_infty, nan_value );
   TEST( plus_infty, minus_infty, plus_infty, nan_value );
   TEST( minus_infty, minus_infty, minus_infty, nan_value );
   TEST( plus_infty, 3.5L, minus_infty, nan_value );
   TEST( minus_infty, -7.5L, minus_infty, nan_value );
   TEST( -13.5L, plus_infty, plus_infty, nan_value );
   TEST( minus_infty, 7.5L, plus_infty, nan_value );
   /* Exact result, then cases where an infinite addend decides the result. */
   TEST( 1.25L, 0.75L, 0.0625L, 1.0L );
   TEST( -1.79769313486231570815e+308L, -1.79769313486231570815e+308L, minus_infty, minus_infty );
   TEST( 1.79769313486231570815e+308L / 2, 1.79769313486231570815e+308L / 2, minus_infty, minus_infty );
   TEST( -1.79769313486231570815e+308L, 1.79769313486231570815e+308L, plus_infty, plus_infty );
   TEST( 1.79769313486231570815e+308L / 2, -1.79769313486231570815e+308L / 4, plus_infty, plus_infty );
   TEST( plus_infty, 4, plus_infty, plus_infty );
   TEST( 2, minus_infty, minus_infty, minus_infty );
   TEST( minus_infty, minus_infty, plus_infty, plus_infty );
   TEST( plus_infty, minus_infty, minus_infty, minus_infty );
   /* Finite operands given as hexadecimal float literals. */
   TEST( 0x1.7fp+13, 0x1.0000000000001p+0, 0x1.ffep-48, 0x1.7f00000000001p+13 );
   TEST( 0x1.fffp+0, 0x1.0000000000001p+0, -0x1.fffp+0, 0x1.fffp-52 );
   TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, 0x1p-300, 1.0 );
   TEST( 0x1.0000002p+0, 0x1.ffffffcp-1, -0x1p-300, 0x1.fffffffffffffp-1 );
   TEST( 0x1.deadbeef2feedp+1023, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp+1, 0x1.0989687bc9da4p-53 );
   TEST( 0x1.deadbeef2feedp+900, 0x0.deadbeef2feedp-1022, -0x1.a05f8c01a4bfbp-122, 0x1.0989687bc9da4p-176 );
   TEST( 0x1.fffffffffffffp+1023, 0x1.001p+0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1011 );
   TEST( -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+0, 0x1.fffffffffffffp+1023, -0x1.ffffffffffffdp+1023 );
   TEST( 0x1.fffffffffffffp+1023, 2.0, -0x1.fffffffffffffp+1023, 0x1.fffffffffffffp+1023 );
   /* Operands and/or expected results written in the 0x0.…p-1022 form. */
   TEST( 0x1.6a09e667f3bccp-538, 0x1.6a09e667f3bccp-538, 0.0, 0.0 );
   TEST( 0x1.deadbeef2feedp-495, 0x1.deadbeef2feedp-495, -0x1.bf86a5786a574p-989, 0x0.0000042625a1fp-1022 );
   TEST( 0x1.deadbeef2feedp-503, 0x1.deadbeef2feedp-503, -0x1.bf86a5786a574p-1005, 0x0.0000000004262p-1022 );
   TEST( 0x1p-537, 0x1p-538, 0x1p-1074, 0x0.0000000000002p-1022 );
   TEST( 0x1.7fffff8p-968, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000001p-1022 );
   TEST( 0x1.4000004p-967, 0x1p-106, 0x0.000001p-1022, 0x0.0000010000003p-1022 );
   TEST( 0x1.4p-967, -0x1p-106, -0x0.000001p-1022, -0x0.0000010000002p-1022 );
   TEST( -0x1.19cab66d73e17p-959, 0x1.c7108a8c5ff51p-107, -0x0.80b0ad65d9b64p-1022, -0x0.80b0ad65d9d59p-1022 );
   TEST( -0x1.d2eaed6e8e9d3p-979, -0x1.4e066c62ac9ddp-63, -0x0.9245e6b003454p-1022, -0x0.9245c09c5fb5dp-1022 );
   TEST( 0x1.153d650bb9f06p-907, 0x1.2d01230d48407p-125, -0x0.b278d5acfc3cp-1022, -0x0.b22757123bbe9p-1022 );
   TEST( -0x1.fffffffffffffp-711, 0x1.fffffffffffffp-275, 0x1.fffffe00007ffp-983, 0x1.7ffffe00007ffp-983 );

   res |= test_fma( );
   if (res == 0)
      printf( "Testing successful\n");
   /* Report failure through the exit status too; the original always
      returned 0, so a failing run looked successful to the caller. */
   return res != 0 ? 1 : 0;
}