/*---------------------------------------------------------------*/
/*--- begin                              host_generic_maddf.c ---*/
/*---------------------------------------------------------------*/

/* 
   Compute x * y + z as ternary operation.
   Copyright (C) 2010-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub@redhat.com>, 2010.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.
*/

/* Generic helper functions for doing FMA, i.e. compute x * y + z
   as ternary operation.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_maddf.h"
#include "main_util.h"

/* The MAddF32 helper relies on Double being more than twice as precise
   as Float; both helpers use rounding to odd in order to avoid problems
   with double rounding.
   See the paper by Boldo and Melquiond:
   http://www.lri.fr/~melquion/doc/08-tc.pdf  */
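
/* "Round to odd" here means: compute with the rounding mode set to
   truncate and, if the result was inexact, force its last mantissa bit
   to 1.  A halfway case of the narrower target format, written in this
   wider precision, always ends in a zero bit, so an odd last bit can
   never land exactly on such a boundary; the final round-to-nearest
   conversion therefore makes the same decision it would have made on
   the exact value.  (This needs the intermediate precision to exceed
   the target precision by at least two bits, which holds here.)  */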

#define FORCE_EVAL(X) __asm __volatile__ ("" : : "m" (X))
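/* FORCE_EVAL works because the volatile asm takes X as a memory input:
   X must be fully computed and stored (and any FP flags it raises must
   be set) before the asm, which keeps the operation from being sunk
   past the later MXCSR-based flag tests.  */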

#if defined(__x86_64__) && defined(__SSE2_MATH__)
# define ENV_TYPE unsigned int
/* Save the current MXCSR into ENV, mask all exceptions, clear the
   exception flags and set the rounding mode to round toward zero.  */
# define ROUNDTOZERO(env) \
   do {							\
      unsigned int mxcsr;				\
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
      (env) = mxcsr;					\
      mxcsr = (mxcsr | 0x7f80) & ~0x3f;			\
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
   } while (0)
/* Restore the MXCSR state saved in ENV and return whether the inexact
   exception has been raised since ROUNDTOZERO.  */
# define RESET_TESTINEXACT(env) \
   ({							\
      unsigned int mxcsr, ret;				\
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
      ret = (mxcsr >> 5) & 1;				\
      mxcsr = (mxcsr & 0x3d) | (env);			\
      __asm __volatile__ ("ldmxcsr %0" : : "m" (mxcsr));\
      ret;						\
   })
/* Return whether the inexact exception has been raised since ROUNDTOZERO,
   leaving the MXCSR unchanged.  */
# define TESTINEXACT() \
   ({							\
      unsigned int mxcsr;				\
      __asm __volatile__ ("stmxcsr %0" : "=m" (mxcsr));	\
      (mxcsr >> 5) & 1;					\
   })
#endif
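
/* Layout of the MXCSR bits used above: bits 0..5 are the exception
   flags (bit 5 = PE, the inexact flag), bits 7..12 are the exception
   masks and bits 13..14 select the rounding mode (11b = round toward
   zero).  Hence (mxcsr | 0x7f80) & ~0x3f masks every exception, clears
   the flags and forces round-toward-zero, while (mxcsr >> 5) & 1 reads
   the inexact flag.  */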

#define DBL_MANT_DIG 53
#define IEEE754_DOUBLE_BIAS 0x3ff

union vg_ieee754_double {
   Double d;

   /* This is the IEEE 754 double-precision format.  */
   struct {
#ifdef VKI_BIG_ENDIAN
      unsigned int negative:1;
      unsigned int exponent:11;
      unsigned int mantissa0:20;
      unsigned int mantissa1:32;
#else
      unsigned int mantissa1:32;
      unsigned int mantissa0:20;
      unsigned int exponent:11;
      unsigned int negative:1;
#endif
   } ieee;
};
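
/* Example: the Double 1.0 is stored with negative == 0, exponent == 0x3ff
   (the bias) and both mantissa fields zero; the smallest subnormal
   0x1p-1074 has exponent == 0 and only the lowest bit of mantissa1 set.
   mantissa0 holds the high 20 explicit significand bits and mantissa1
   the low 32.  */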

void VEX_REGPARM(3)
     h_generic_calc_MAddF32 ( /*OUT*/Float* res,
                               Float* argX, Float* argY, Float* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation: not guaranteed to be fused, so it
      may round twice.  */
   *res = *argX * *argY + *argZ;
#else
   ENV_TYPE env;
   /* The multiplication is exact: two 24-bit Float significands give a
      product of at most 48 bits, which fits in Double's 53-bit
      significand.  */
   Double temp = (Double) *argX * (Double) *argY;
   union vg_ieee754_double u;

   ROUNDTOZERO (env);

   /* Perform addition with round to odd.  */
   u.d = temp + (Double) *argZ;
   /* Ensure the addition is not scheduled after the inexact-flag test
      (RESET_TESTINEXACT) below.  */
   FORCE_EVAL (u.d);

   /* Reset rounding mode and test for inexact simultaneously.  */
   int j = RESET_TESTINEXACT (env);

   if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
      u.ieee.mantissa1 |= j;

   /* And finally the narrowing conversion to Float, performed in the
      restored rounding mode (normally round to nearest).  */
   *res = (Float) u.d;
#endif
}
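
/* Illustrative use only (the variable names here are invented):

      Float x = 1.5f, y = 2.0f, z = 0.25f, r;
      h_generic_calc_MAddF32(&r, &x, &y, &z);

   r now holds x * y + z computed with a single rounding, i.e. the
   fmaf(x, y, z) result.  */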


void VEX_REGPARM(3)
     h_generic_calc_MAddF64 ( /*OUT*/Double* res,
                               Double* argX, Double* argY, Double* argZ )
{
#ifndef ENV_TYPE
   /* Lame fallback implementation: not guaranteed to be fused, so it
      may round twice.  */
   *res = *argX * *argY + *argZ;
#else
   Double x = *argX, y = *argY, z = *argZ;
   union vg_ieee754_double u, v, w;
   int adjust = 0;
   u.d = x;
   v.d = y;
   w.d = z;
   if (UNLIKELY (u.ieee.exponent + v.ieee.exponent
                 >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
       || UNLIKELY (u.ieee.exponent + v.ieee.exponent
                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG)) {
      /* If z is Inf, but x and y are finite, the result should be
         z rather than NaN.  */
      if (w.ieee.exponent == 0x7ff
          && u.ieee.exponent != 0x7ff
          && v.ieee.exponent != 0x7ff) {
         *res = (z + x) + y;
         return;
      }
      /* If x or y or z is Inf/NaN, or if fma will certainly overflow,
         or if x * y is less than half of the smallest subnormal Double
         (0x1p-1074), compute as x * y + z.  */
      if (u.ieee.exponent == 0x7ff
          || v.ieee.exponent == 0x7ff
          || w.ieee.exponent == 0x7ff
          || u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS
          || u.ieee.exponent + v.ieee.exponent
             < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2) {
         *res = x * y + z;
         return;
      }
      if (u.ieee.exponent + v.ieee.exponent
          >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG) {
         /* Compute a 0x1p-53 times smaller result and scale it back up
            at the end.  */
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent -= DBL_MANT_DIG;
         else
            v.ieee.exponent -= DBL_MANT_DIG;
         /* If the x * y exponent is very large and the z exponent is
            very small, it does not matter if z is left unadjusted.  */
         if (w.ieee.exponent > DBL_MANT_DIG)
            w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         /* Similarly, if the z exponent is very large and the x and y
            exponents are very small, it does not matter if the product
            is left unadjusted.  */
         if (u.ieee.exponent > v.ieee.exponent) {
            if (u.ieee.exponent > DBL_MANT_DIG)
               u.ieee.exponent -= DBL_MANT_DIG;
         } else if (v.ieee.exponent > DBL_MANT_DIG)
            v.ieee.exponent -= DBL_MANT_DIG;
         w.ieee.exponent -= DBL_MANT_DIG;
         adjust = 1;
      } else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         u.ieee.exponent -= DBL_MANT_DIG;
         if (v.ieee.exponent)
            v.ieee.exponent += DBL_MANT_DIG;
         else
            v.d *= 0x1p53;
      } else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG) {
         v.ieee.exponent -= DBL_MANT_DIG;
         if (u.ieee.exponent)
            u.ieee.exponent += DBL_MANT_DIG;
         else
            u.d *= 0x1p53;
      } else /* if (u.ieee.exponent + v.ieee.exponent
                    <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */ {
         if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent += 2 * DBL_MANT_DIG;
         else
            v.ieee.exponent += 2 * DBL_MANT_DIG;
         if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 4) {
            if (w.ieee.exponent)
               w.ieee.exponent += 2 * DBL_MANT_DIG;
            else
               w.d *= 0x1p106;
            adjust = -1;
         }
         /* Otherwise x * y should only raise the inexact flag and
            affect nothing else.  */
      }
      x = u.d;
      y = v.d;
      z = w.d;
   }
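
   /* At this point x, y and z have (where necessary) been rescaled into a
      range where the exact product and exact sum computed below can
      neither overflow nor be damaged by underflow; `adjust' records how
      to undo the scaling at the end (+1: scale the result by 0x1p53,
      -1: by 0x1p-106, 0: no scaling).  */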
   /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
#  define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
   Double x1 = x * C;
   Double y1 = y * C;
   Double m1 = x * y;
   x1 = (x - x1) + x1;
   y1 = (y - y1) + y1;
   Double x2 = x - x1;
   Double y2 = y - y1;
   Double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
#  undef C
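   /* Here C = 2^27 + 1.  Multiplying by C and cancelling (Veltkamp
      splitting) breaks each 53-bit significand into a high and a low
      half of about 26 bits each, so every partial product above is
      computed exactly and m1 + m2 == x * y with no rounding error (the
      earlier rescaling guarantees no intermediate over/underflow).  */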

   /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
   Double a1 = z + m1;
   Double t1 = a1 - z;
   Double t2 = a1 - t1;
   t1 = m1 - t1;
   t2 = z - t2;
   Double a2 = t1 + t2;
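   /* Knuth's 2Sum: a1 is the rounded sum z + m1 and a2 is exactly its
      rounding error, so a1 + a2 == z + m1 with no error, regardless of
      the operands' relative magnitudes.  */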

   ENV_TYPE env;
   ROUNDTOZERO (env);

   /* Perform m2 + a2 addition with round to odd.  */
   u.d = a2 + m2;

   if (UNLIKELY (adjust < 0)) {
      if ((u.ieee.mantissa1 & 1) == 0)
         u.ieee.mantissa1 |= TESTINEXACT ();
      v.d = a1 + u.d;
      /* Ensure the addition is not scheduled after the inexact-flag test
         (RESET_TESTINEXACT) below.  */
      FORCE_EVAL (v.d);
   }

   /* Reset rounding mode and test for inexact simultaneously.  */
   int j = RESET_TESTINEXACT (env) != 0;

   if (LIKELY (adjust == 0)) {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d.  */
      *res = a1 + u.d;
   } else if (LIKELY (adjust > 0)) {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
         u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d, scaled up.  */
      *res = (a1 + u.d) * 0x1p53;
   } else {
      /* If a1 + u.d is exact, the only rounding happens during
         scaling down.  */
      if (j == 0) {
         *res = v.d * 0x1p-106;
         return;
      }
      /* If the result rounded toward zero is not subnormal, no double
         rounding will occur.  */
      if (v.ieee.exponent > 106) {
         *res = (a1 + u.d) * 0x1p-106;
         return;
      }
      /* If v.d * 0x1p-106 with round to zero is a subnormal above
         or equal to DBL_MIN / 2, then v.d * 0x1p-106 shifts mantissa
         down just by 1 bit, which means v.ieee.mantissa1 |= j would
         change the round bit, not sticky or guard bit.
         v.d * 0x1p-106 never normalizes by shifting up,
         so round bit plus sticky bit should be already enough
         for proper rounding.  */
      if (v.ieee.exponent == 106) {
         /* v.ieee.mantissa1 & 2 is the LSB of the result before rounding,
            v.ieee.mantissa1 & 1 is the round bit and j is our sticky
            bit.  In round-to-nearest 001 rounds down like 00,
            011 rounds up, even though 01 rounds down (thus we need
            to adjust), 101 rounds down like 10 and 111 rounds up
            like 11.  */
         if ((v.ieee.mantissa1 & 3) == 1) {
            v.d *= 0x1p-106;
            if (v.ieee.negative)
               *res = v.d - 0x1p-1074;
            else
               *res = v.d + 0x1p-1074;
         } else
            *res = v.d * 0x1p-106;
         return;
      }
      v.ieee.mantissa1 |= j;
      *res = v.d * 0x1p-106;
      return;
   }
#endif
}

/*---------------------------------------------------------------*/
/*--- end                                 host_generic_maddf.c --*/
/*---------------------------------------------------------------*/