/*
 * e_rem_pio2.c
 *
 * Copyright (c) 1999-2018, Arm Limited.
 * SPDX-License-Identifier: MIT
 */

#include <math.h>
#include "math_private.h"

int __ieee754_rem_pio2(double x, double *y) {
    int q;

    y[1] = 0.0;                        /* default */

    /*
     * Simple cases: all nicked from the fdlibm version for speed.
     */
    {
        static const double invpio2 =  0x1.45f306dc9c883p-1;
        static const double pio2s[] = {
            0x1.921fb544p+0, /* 1.57079632673412561417e+00 */
            0x1.0b4611a626331p-34, /* 6.07710050650619224932e-11 */
            0x1.0b4611a6p-34, /* 6.07710050630396597660e-11 */
            0x1.3198a2e037073p-69, /* 2.02226624879595063154e-21 */
            0x1.3198a2ep-69, /* 2.02226624871116645580e-21 */
            0x1.b839a252049c1p-104, /* 8.47842766036889956997e-32 */
        };

        double z,w,t,r,fn;
        int i,j,idx,n,ix,hx;

        hx = __HI(x);           /* high word of x */
        ix = hx&0x7fffffff;
        if(ix<=0x3fe921fb)   /* |x| ~<= pi/4 , no need for reduction */
            {y[0] = x; y[1] = 0; return 0;}
        if(ix<0x4002d97c) {  /* |x| < 3pi/4, special case with n=+-1 */
            if(hx>0) {
                z = x - pio2s[0];
                if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
                    y[0] = z - pio2s[1];
#ifdef bottomhalf
                    y[1] = (z-y[0])-pio2s[1];
#endif
                } else {                /* near pi/2, use 33+33+53 bit pi */
                    z -= pio2s[2];
                    y[0] = z - pio2s[3];
#ifdef bottomhalf
                    y[1] = (z-y[0])-pio2s[3];
#endif
                }
                return 1;
            } else {    /* negative x */
                z = x + pio2s[0];
                if(ix!=0x3ff921fb) {    /* 33+53 bit pi is good enough */
                    y[0] = z + pio2s[1];
#ifdef bottomhalf
                    y[1] = (z-y[0])+pio2s[1];
#endif
                } else {                /* near pi/2, use 33+33+53 bit pi */
                    z += pio2s[2];
                    y[0] = z + pio2s[3];
#ifdef bottomhalf
                    y[1] = (z-y[0])+pio2s[3];
#endif
                }
                return -1;
            }
        }
        if(ix<=0x413921fb) { /* |x| ~<= 2^19*(pi/2), medium size */
            t  = fabs(x);
            n  = (int) (t*invpio2+0.5);
            fn = (double)n;
            r  = t-fn*pio2s[0];
            w  = fn*pio2s[1];    /* 1st round good to 85 bit */
            j  = ix>>20;
            idx = 1;
            while (1) {
                y[0] = r-w;
                if (idx == 3)
                    break;
                i = j-(((__HI(y[0]))>>20)&0x7ff);
                if(i <= 33*idx-17)
                    break;
                t  = r;
                w  = fn*pio2s[2*idx];
                r  = t-w;
                w  = fn*pio2s[2*idx+1]-((t-r)-w);
                idx++;
            }
#ifdef bottomhalf
            y[1] = (r-y[0])-w;
#endif
            if(hx<0) {
                y[0] = -y[0];
#ifdef bottomhalf
                y[1] = -y[1];
#endif
                return -n;
            } else {
                return n;
            }
        }
    }

    {
        static const unsigned twooverpi[] = {
            /* We start with two zero words, because they take up less
             * space than the array bounds checking and special-case
             * handling that would have to occur in their absence. */
            0, 0,
            /* 2/pi in hex is 0.a2f9836e... */
            0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599,
            0x3c439041, 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0,
            0x06492eea, 0x09d1921c, 0xfe1deb1c, 0xb129a73e, 0xe88235f5,
            0x2ebb4484, 0xe99c7026, 0xb45f7e41, 0x3991d639, 0x835339f4,
            0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f, 0xef2f118b,
            0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
            0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1,
            0x1f8d5d08,
        };

        /*
         * Multiprecision multiplication of this nature is more
         * readily done in integers than in VFP, since we can use
         * UMULL (on CPUs that support it) to multiply 32 by 32 bits
         * at a time whereas the VFP would only be able to do 12x12
         * without losing accuracy.
         *
         * So extract the mantissa of the input number as two 32-bit
         * integers.
         */
        unsigned mant1 = 0x00100000 | (__HI(x) & 0xFFFFF);
        unsigned mant2 = __LO(x);
        /*
         * Now work out which part of our stored value of 2/pi we're
         * supposed to be multiplying by.
         *
         * Let the IEEE exponent field of x be e. With its bias
         * removed, (e-1023) is the index of the set bit in bit 20
         * of 'mant1' (i.e. that set bit has real place value
         * 2^(e-1023)). So the lowest set bit in 'mant1', 52 bits
         * further down, must have place value 2^(e-1075).
         *
         * We begin taking an interest in the value of 2/pi at the
         * bit which multiplies by _that_ to give something with
         * place value at most 2. In other words, the highest bit of
         * 2/pi we're interested in is the one with place value
         * 2/(2^(e-1075)) = 2^(1076-e).
         *
         * The bit at the top of the first zero word of the above
         * array has place value 2^63. Hence, the bit we want to put
         * at the top of the first word we extract from that array
         * is the one at bit index n, where 63-n = 1076-e and hence
         * n=e-1013.
         */
        int topbitindex = ((__HI(x) >> 20) & 0x7FF) - 1013;
        int wordindex = topbitindex >> 5;
        int shiftup = topbitindex & 31;
        int shiftdown = 32 - shiftup;
        unsigned scaled[8];
        int i;

        scaled[7] = scaled[6] = 0;

        for (i = 6; i-- > 0 ;) {
            /*
             * Extract a word from our representation of 2/pi.
             */
            unsigned word;
            if (shiftup)
                word = (twooverpi[wordindex + i] << shiftup) | (twooverpi[wordindex + i + 1] >> shiftdown);
            else
                word = twooverpi[wordindex + i];
            /*
             * Multiply it by both words of our mantissa. (Should
             * generate UMULLs where available.)
             */
            unsigned long long mult1 = (unsigned long long)word * mant1;
            unsigned long long mult2 = (unsigned long long)word * mant2;
            /*
             * Split those up into 32-bit chunks.
             */
            unsigned bottom1 = (unsigned)mult1, top1 = (unsigned)(mult1 >> 32);
            unsigned bottom2 = (unsigned)mult2, top2 = (unsigned)(mult2 >> 32);
            /*
             * Add those two chunks together.
             */
            unsigned out1, out2, out3;

            out3 = bottom2;
            out2 = top2 + bottom1;
            out1 = top1 + (out2 < top2);
            /*
             * And finally add them to our 'scaled' array.
             */
            unsigned s3 = scaled[i+2], s2 = scaled[i+1], s1;
            unsigned carry;
            s3 = out3 + s3; carry = (s3 < out3);
            s2 = out2 + s2 + carry; carry = carry ? (s2 <= out2) : (s2 < out2);
            s1 = out1 + carry;

            scaled[i+2] = s3;
            scaled[i+1] = s2;
            scaled[i] = s1;
        }


        /*
         * The topmost set bit in mant1 is bit 20, and that has
         * place value 2^(e-1023). The topmost bit (bit 31) of the
         * most significant word we extracted from our twooverpi
         * array had place value 2^(1076-e). So the product of those
         * two bits must have place value 2^53; and that bit will
         * have ended up as bit 19 of scaled[0]. Hence, the integer
         * quadrant value we want to output, consisting of the bits
         * with place values 2^1 and 2^0, must be 52 and 53 bits
         * below that, i.e. precisely the topmost two bits of
         * scaled[2].
         *
         * Or, at least, it will be once we add 1/2, to round to the
         * _nearest_ multiple of pi/2 rather than the next one down.
         */
        q = (scaled[2] + (1<<29)) >> 30;

        /*
         * Now we construct the output fraction, which is most
         * simply done in the VFP. We just extract four consecutive
         * bit strings from our chunk of binary data, convert them
         * to doubles, equip each with an appropriate FP exponent,
         * add them together, and (don't forget) multiply back up by
         * pi/2. That way we don't have to work out ourselves where
         * the highest fraction bit ended up.
         *
         * Since our displacement from the nearest multiple of pi/2
         * can be positive or negative, the topmost of these four
         * values must be arranged with its 2^-1 bit at the very top
         * of the word, and then treated as a _signed_ integer.
         */
        int i1 = (scaled[2] << 2);
        unsigned i2 = scaled[3];
        unsigned i3 = scaled[4];
        unsigned i4 = scaled[5];
        double f1 = i1, f2 = i2 * (0x1.0p-30), f3 = i3 * (0x1.0p-62), f4 = i4 * (0x1.0p-94);

        /*
         * Now f1+f2+f3+f4 is a representation, potentially to
         * twice double precision, of 2^32 times ((2/pi)*x minus
         * some integer). So our remaining job is to multiply
         * back down by (pi/2)*2^-32, and convert back to one
         * single-precision output number.
         */
        double ftop = f1 + (f2 + (f3 + f4));
        ftop = __SET_LO(ftop, 0);
        double fbot = f4 - (((ftop-f1)-f2)-f3);

        /* ... and multiply by a prec-and-a-half value of (pi/2)*2^-32. */
        double ret = (ftop * 0x1.921fb54p-32) + (ftop * 0x1.10b4611a62633p-62 + fbot * 0x1.921fb54442d18p-32);

        /* Just before we return, take the input sign into account. */
        if (__HI(x) & 0x80000000) {
            q = -q;
            ret = -ret;
        }
        y[0] = ret;
        return q;
    }
}