/* * e_powf.c - single-precision power function * * Copyright (c) 2009-2018, Arm Limited. * SPDX-License-Identifier: MIT */ #include <math.h> #include <errno.h> #include "math_private.h" float powf(float x, float y) { float logh, logl; float rlogh, rlogl; float sign = 1.0f; int expadjust = 0; unsigned ix, iy; ix = fai(x); iy = fai(y); if (__builtin_expect((ix - 0x00800000) >= (0x7f800000 - 0x00800000) || ((iy << 1) + 0x02000000) < 0x40000000, 0)) { /* * The above test rules out, as quickly as I can see how to, * all possible inputs except for a normalised positive x * being raised to the power of a normalised (and not * excessively small) y. That's the fast-path case: if * that's what the user wants, we can skip all of the * difficult special-case handling. * * Now we must identify, as efficiently as we can, cases * which will return to the fast path with a little tidying * up. These are, in order of likelihood and hence of * processing: * * - a normalised _negative_ x raised to the power of a * non-zero finite y. Having identified this case, we * must categorise y into one of the three categories * 'odd integer', 'even integer' and 'non-integer'; for * the last of these we return an error, while for the * other two we rejoin the main code path having rendered * x positive and stored an appropriate sign to append to * the eventual result. * * - a _denormal_ x raised to the power of a non-zero * finite y, in which case we multiply it up by a power * of two to renormalise it, store an appropriate * adjustment for its base-2 logarithm, and depending on * the sign of y either return straight to the main code * path or go via the categorisation of y above. * * - any of the above kinds of x raised to the power of a * zero, denormal, nearly-denormal or nearly-infinite y, * in which case we must do the checks on x as above but * otherwise the algorithm goes through basically * unchanged. Denormal and very tiny y values get scaled * up to something not in range of accidental underflow * when split into prec-and-a-half format; very large y * values get scaled down by a factor of two to prevent * CLEARBOTTOMHALF's round-up from overflowing them to * infinity. (Of course the _output_ will overflow either * way - the largest y value that can possibly yield a * finite result is well below this range anyway - so * this is a safe change.) */ if (__builtin_expect(((iy << 1) + 0x02000000) >= 0x40000000, 1)) { /* normalised and sensible y */ y_ok_check_x: if (__builtin_expect((ix - 0x80800000) < (0xff800000 - 0x80800000), 1)) { /* normal but negative x */ y_ok_x_negative: x = fabsf(x); ix = fai(x); /* * Determine the parity of y, if it's an integer at * all. */ { int yexp, yunitsbit; /* * Find the exponent of y. */ yexp = (iy >> 23) & 0xFF; /* * Numbers with an exponent smaller than 0x7F * are strictly smaller than 1, and hence must * be fractional. */ if (yexp < 0x7F) return MATHERR_POWF_NEGFRAC(x,y); /* * Numbers with an exponent at least 0x97 are by * definition even integers. */ if (yexp >= 0x97) goto mainpath; /* rejoin main code, giving positive result */ /* * In between, we must check the mantissa. * * Note that this case includes yexp==0x7f, * which means 1 point something. In this case, * the 'units bit' we're testing is semantically * the lowest bit of the exponent field, not the * leading 1 on the mantissa - but fortunately, * that bit position will just happen to contain * the 1 that we would wish it to, because the * exponent describing that particular case just * happens to be odd. */ yunitsbit = 0x96 - yexp; if (iy & ((1 << yunitsbit)-1)) return MATHERR_POWF_NEGFRAC(x,y); else if (iy & (1 << yunitsbit)) sign = -1.0f; /* y is odd; result should be negative */ goto mainpath; /* now we can rejoin the main code */ } } else if (__builtin_expect((ix << 1) != 0 && (ix << 1) < 0x01000000, 0)) { /* denormal x */ /* * Renormalise x. */ x *= 0x1p+27F; ix = fai(x); /* * Set expadjust to compensate for that. */ expadjust = -27; /* Now we need to handle negative x as above. */ if (ix & 0x80000000) goto y_ok_x_negative; else goto mainpath; } else if ((ix - 0x00800000) < (0x7f800000 - 0x00800000)) { /* normal positive x, back here from denormal-y case below */ goto mainpath; } } else if (((iy << 1) + 0x02000000) >= 0x02000000) { /* denormal, nearly-denormal or zero y */ if (y == 0.0F) { /* * y == 0. Any finite x returns 1 here. (Quiet NaNs * do too, but we handle that below since we don't * mind doing them more slowly.) */ if ((ix << 1) != 0 && (ix << 1) < 0xFF000000) return 1.0f; } else { /* * Denormal or very very small y. In this situation * we have to be a bit careful, because when we * break up y into precision-and-a-half later on we * risk working with denormals and triggering * underflow exceptions within this function that * aren't related to the smallness of the output. So * here we convert all such y values into a standard * small-but-not-too-small value which will give the * same output. * * What value should that be? Well, we work in * 16*log2(x) below (equivalently, log to the base * 2^{1/16}). So the maximum magnitude of that for * any finite x is about 2416 (= 16 * (128+23), for * log of the smallest denormal x), i.e. certainly * less than 2^12. If multiplying that by y gives * anything of magnitude less than 2^-32 (and even * that's being generous), the final output will be * indistinguishable from 1. So any value of y with * magnitude less than 2^-(32+12) = 2^-44 is * completely indistinguishable from any other such * value. Hence we got here in the first place by * checking the exponent of y against 64 (i.e. -63, * counting the exponent bias), so we might as well * normalise all tiny y values to the same threshold * of 2^-64. */ iy = 0x1f800000 | (iy & 0x80000000); /* keep the sign; that's important */ y = fhex(iy); } goto y_ok_check_x; } else if (((iy << 1) + 0x02000000) < 0x01000000) { /* y in top finite exponent bracket */ y = fhex(fai(y) - 0x00800000); /* scale down by a factor of 2 */ goto y_ok_check_x; } /* * Having dealt with the above cases, we now know that * either x is zero, infinite or NaN, or y is infinite or * NaN, or both. We can deal with all of those cases without * ever rejoining the main code path. */ if ((unsigned)(((ix & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001) || (unsigned)(((iy & 0x7FFFFFFF) - 0x7f800001) < 0x7fc00000 - 0x7f800001)) { /* * At least one signalling NaN. Do a token arithmetic * operation on the two operands to provoke an exception * and return the appropriate QNaN. */ return FLOAT_INFNAN2(x,y); } else if (ix==0x3f800000 || (iy << 1)==0) { /* * C99 says that 1^anything and anything^0 should both * return 1, _even for a NaN_. I modify that slightly to * apply only to QNaNs (which doesn't violate C99, since * C99 doesn't specify anything about SNaNs at all), * because I can't bring myself not to throw an * exception on an SNaN since its _entire purpose_ is to * throw an exception whenever touched. */ return 1.0f; } else if (((ix & 0x7FFFFFFF) > 0x7f800000) || ((iy & 0x7FFFFFFF) > 0x7f800000)) { /* * At least one QNaN. Do a token arithmetic operation on * the two operands to get the right one to propagate to * the output. */ return FLOAT_INFNAN2(x,y); } else if (ix == 0x7f800000) { /* * x = +infinity. Return +infinity for positive y, +0 * for negative y, and 1 for zero y. */ if (!(iy << 1)) return MATHERR_POWF_INF0(x,y); else if (iy & 0x80000000) return 0.0f; else return INFINITY; } else { /* * Repeat the parity analysis of y above, returning 1 * (odd), 2 (even) or 0 (fraction). */ int ypar, yexp, yunitsbit; yexp = (iy >> 23) & 0xFF; if (yexp < 0x7F) ypar = 0; else if (yexp >= 0x97) ypar = 2; else { yunitsbit = 0x96 - yexp; if (iy & ((1 << yunitsbit)-1)) ypar = 0; else if (iy & (1 << yunitsbit)) ypar = 1; else ypar = 2; } if (ix == 0xff800000) { /* * x = -infinity. We return infinity or zero * depending on whether y is positive or negative, * and the sign is negative iff y is an odd integer. * (SGT: I don't like this, but it's what C99 * mandates.) */ if (!(iy & 0x80000000)) { if (ypar == 1) return -INFINITY; else return INFINITY; } else { if (ypar == 1) return -0.0f; else return +0.0f; } } else if (ix == 0) { /* * x = +0. We return +0 for all positive y including * infinity; a divide-by-zero-like error for all * negative y including infinity; and an 0^0 error * for zero y. */ if ((iy << 1) == 0) return MATHERR_POWF_00(x,y); else if (iy & 0x80000000) return MATHERR_POWF_0NEGEVEN(x,y); else return +0.0f; } else if (ix == 0x80000000) { /* * x = -0. We return errors in almost all cases (the * exception being positive integer y, in which case * we return a zero of the appropriate sign), but * the errors are almost all different. Gah. */ if ((iy << 1) == 0) return MATHERR_POWF_00(x,y); else if (iy == 0x7f800000) return MATHERR_POWF_NEG0FRAC(x,y); else if (iy == 0xff800000) return MATHERR_POWF_0NEG(x,y); else if (iy & 0x80000000) return (ypar == 0 ? MATHERR_POWF_0NEG(x,y) : ypar == 1 ? MATHERR_POWF_0NEGODD(x,y) : /* ypar == 2 ? */ MATHERR_POWF_0NEGEVEN(x,y)); else return (ypar == 0 ? MATHERR_POWF_NEG0FRAC(x,y) : ypar == 1 ? -0.0f : /* ypar == 2 ? */ +0.0f); } else { /* * Now we know y is an infinity of one sign or the * other and x is finite and nonzero. If x == -1 (+1 * is already ruled out), we return +1; otherwise * C99 mandates that we return either +0 or +inf, * the former iff exactly one of |x| < 1 and y<0 is * true. */ if (ix == 0xbf800000) { return +1.0f; } else if (!((ix << 1) < 0x7f000000) ^ !(iy & 0x80000000)) { return +0.0f; } else { return INFINITY; } } } } mainpath: #define PHMULTIPLY(rh,rl, xh,xl, yh,yl) do { \ float tmph, tmpl; \ tmph = (xh) * (yh); \ tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \ /* printf("PHMULTIPLY: tmp=%08x+%08x\n", fai(tmph), fai(tmpl)); */ \ (rh) = CLEARBOTTOMHALF(tmph + tmpl); \ (rl) = tmpl + (tmph - (rh)); \ } while (0) /* * Same as the PHMULTIPLY macro above, but bounds the absolute value * of rh+rl. In multiplications uncontrolled enough that rh can go * infinite, we can get an IVO exception from the subtraction tmph - * rh, so we should spot that case in advance and avoid it. */ #define PHMULTIPLY_SATURATE(rh,rl, xh,xl, yh,yl, bound) do { \ float tmph, tmpl; \ tmph = (xh) * (yh); \ if (fabsf(tmph) > (bound)) { \ (rh) = copysignf((bound),(tmph)); \ (rl) = 0.0f; \ } else { \ tmpl = (xh) * (yl) + (xl) * ((yh)+(yl)); \ (rh) = CLEARBOTTOMHALF(tmph + tmpl); \ (rl) = tmpl + (tmph - (rh)); \ } \ } while (0) /* * Determine log2 of x to relative prec-and-a-half, as logh + * logl. * * Well, we're actually computing 16*log2(x), so that it's the * right size for the subsequently fiddly messing with powers of * 2^(1/16) in the exp step at the end. */ if (__builtin_expect((ix - 0x3f7ff000) <= (0x3f801000 - 0x3f7ff000), 0)) { /* * For x this close to 1, we write x = 1 + t and then * compute t - t^2/2 + t^3/3 - t^4/4; and the neat bit is * that t itself, being the bottom half of an input * mantissa, is in half-precision already, so the output is * naturally in canonical prec-and-a-half form. */ float t = x - 1.0; float lnh, lnl; /* * Compute natural log of x in prec-and-a-half. */ lnh = t; lnl = - (t * t) * ((1.0f/2.0f) - t * ((1.0f/3.0f) - t * (1.0f/4.0f))); /* * Now we must scale by 16/log(2), still in prec-and-a-half, * to turn this from natural log(x) into 16*log2(x). */ PHMULTIPLY(logh, logl, lnh, lnl, 0x1.716p+4F, -0x1.7135a8p-9F); } else { /* * For all other x, we start by normalising to [1,2), and * then dividing that further into subintervals. For each * subinterval we pick a number a in that interval, compute * s = (x-a)/(x+a) in precision-and-a-half, and then find * the log base 2 of (1+s)/(1-s), still in precision-and-a- * half. * * Why would we do anything so silly? For two main reasons. * * Firstly, if s = (x-a)/(x+a), then a bit of algebra tells * us that x = a * (1+s)/(1-s); so once we've got * log2((1+s)/(1-s)), we need only add on log2(a) and then * we've got log2(x). So this lets us treat all our * subintervals in essentially the same way, rather than * requiring a separate approximation for each one; the only * correction factor we need is to store a table of the * base-2 logs of all our values of a. * * Secondly, log2((1+s)/(1-s)) is a nice thing to compute, * once we've got s. Switching to natural logarithms for the * moment (it's only a scaling factor to sort that out at * the end), we write it as the difference of two logs: * * log((1+s)/(1-s)) = log(1+s) - log(1-s) * * Now recall that Taylor series expansion gives us * * log(1+s) = s - s^2/2 + s^3/3 - ... * * and therefore we also have * * log(1-s) = -s - s^2/2 - s^3/3 - ... * * These series are exactly the same except that the odd * terms (s, s^3 etc) have flipped signs; so subtracting the * latter from the former gives us * * log(1+s) - log(1-s) = 2s + 2s^3/3 + 2s^5/5 + ... * * which requires only half as many terms to be computed * before the powers of s get too small to see. Then, of * course, we have to scale the result by 1/log(2) to * convert natural logs into logs base 2. * * To compute the above series in precision-and-a-half, we * first extract a factor of 2s (which we can multiply back * in later) so that we're computing 1 + s^2/3 + s^4/5 + ... * and then observe that if s starts off small enough to * make s^2/3 at most 2^-12, we need only compute the first * couple of terms in laborious prec-and-a-half, and can * delegate everything after that to a simple polynomial * approximation whose error will end up at the bottom of * the low word of the result. * * How many subintervals does that mean we need? * * To go back to s = (x-a)/(x+a). Let x = a + e, for some * positive e. Then |s| = |e| / |2a+e| <= |e/2a|. So suppose * we have n subintervals of equal width covering the space * from 1 to 2. If a is at the centre of each interval, then * we have e at most 1/2n and a can equal any of 1, 1+1/n, * 1+2/n, ... 1+(n-1)/n. In that case, clearly the largest * value of |e/2a| is given by the largest e (i.e. 1/2n) and * the smallest a (i.e. 1); so |s| <= 1/4n. Hence, when we * know how big we're prepared to let s be, we simply make * sure 1/4n is at most that. * * And if we want s^2/3 to be at most 2^-12, then that means * s^2 is at most 3*2^-12, so that s is at most sqrt(3)*2^-6 * = 0.02706. To get 1/4n smaller than that, we need to have * n>=9.23; so we'll set n=16 (for ease of bit-twiddling), * and then s is at most 1/64. */ int n, i; float a, ax, sh, sl, lsh, lsl; /* * Let ax be x normalised to a single exponent range. * However, the exponent range in question is not a simple * one like [1,2). What we do is to round up the top four * bits of the mantissa, so that the top 1/32 of each * natural exponent range rounds up to the next one and is * treated as a displacement from the lowest a in that * range. * * So this piece of bit-twiddling gets us our input exponent * and our subinterval index. */ n = (ix + 0x00040000) >> 19; i = n & 15; n = ((n >> 4) & 0xFF) - 0x7F; ax = fhex(ix - (n << 23)); n += expadjust; /* * Compute the subinterval centre a. */ a = 1.0f + i * (1.0f/16.0f); /* * Compute s = (ax-a)/(ax+a), in precision-and-a-half. */ { float u, vh, vl, vapprox, rvapprox; u = ax - a; /* exact numerator */ vapprox = ax + a; /* approximate denominator */ vh = CLEARBOTTOMHALF(vapprox); vl = (a - vh) + ax; /* vh+vl is exact denominator */ rvapprox = 1.0f/vapprox; /* approximate reciprocal of denominator */ sh = CLEARBOTTOMHALF(u * rvapprox); sl = ((u - sh*vh) - sh*vl) * rvapprox; } /* * Now compute log2(1+s) - log2(1-s). We do this in several * steps. * * By polynomial approximation, we compute * * log(1+s) - log(1-s) * p = ------------------- - 1 * 2s * * in single precision only, using a single-precision * approximation to s. This polynomial has s^2 as its * lowest-order term, so we expect the result to be in * [0,2^-12). * * Then we form a prec-and-a-half number out of 1 and p, * which is therefore equal to (log(1+s) - log(1-s))/(2s). * * Finally, we do two prec-and-a-half multiplications: one * by s itself, and one by the constant 32/log(2). */ { float s = sh + sl; float s2 = s*s; /* * p is actually a polynomial in s^2, with the first * term constrained to zero. In other words, treated on * its own terms, we're computing p(s^2) such that p(x) * is an approximation to the sum of the series 1/3 + * x/5 + x^2/7 + ..., valid on the range [0, 1/40^2]. */ float p = s2 * (0.33333332920177422f + s2 * 0.20008275183621479f); float th, tl; PHMULTIPLY(th,tl, 1.0f,p, sh,sl); PHMULTIPLY(lsh,lsl, th,tl, 0x1.716p+5F,-0x1.7135a8p-8F); } /* * And our final answer for 16*log2(x) is equal to 16n (from * the exponent), plus lsh+lsl (the result of the above * computation), plus 16*log2(a) which we must look up in a * table. */ { struct f2 { float h, l; }; static const struct f2 table[16] = { /* * When constructing this table, we have to be sure * that we produce the same values of a which will * be produced by the computation above. Ideally, I * would tell Perl to actually do its _arithmetic_ * in single precision here; but I don't know a way * to do that, so instead I just scrupulously * convert every intermediate value to and from SP. */ // perl -e 'for ($i=0; $i<16; $i++) { $v = unpack "f", pack "f", 1/16.0; $a = unpack "f", pack "f", $i * $v; $a = unpack "f", pack "f", $a+1.0; $l = 16*log($a)/log(2); $top = unpack "f", pack "f", int($l*256.0+0.5)/256.0; $bot = unpack "f", pack "f", $l - $top; printf "{0f_%08X,0f_%08X}, ", unpack "VV", pack "ff", $top, $bot; } print "\n"' | fold -s -w56 | sed 's/^/ /' {0x0p+0F,0x0p+0F}, {0x1.66p+0F,0x1.fb7d64p-11F}, {0x1.5cp+1F,0x1.a39fbep-15F}, {0x1.fcp+1F,-0x1.f4a37ep-10F}, {0x1.49cp+2F,-0x1.87b432p-10F}, {0x1.91cp+2F,-0x1.15db84p-12F}, {0x1.d68p+2F,-0x1.583f9ap-11F}, {0x1.0c2p+3F,-0x1.f5fe54p-10F}, {0x1.2b8p+3F,0x1.a39fbep-16F}, {0x1.49ap+3F,0x1.e12f34p-11F}, {0x1.66ap+3F,0x1.1c8f12p-18F}, {0x1.828p+3F,0x1.3ab7cep-14F}, {0x1.9d6p+3F,-0x1.30158p-12F}, {0x1.b74p+3F,0x1.291eaap-10F}, {0x1.d06p+3F,-0x1.8125b4p-10F}, {0x1.e88p+3F,0x1.8d66c4p-10F}, }; float lah = table[i].h, lal = table[i].l; float fn = 16*n; logh = CLEARBOTTOMHALF(lsl + lal + lsh + lah + fn); logl = lsl - ((((logh - fn) - lah) - lsh) - lal); } } /* * Now we have 16*log2(x), multiply it by y in prec-and-a-half. */ { float yh, yl; int savedexcepts; yh = CLEARBOTTOMHALF(y); yl = y - yh; /* This multiplication could become infinite, so to avoid IVO * in PHMULTIPLY we bound the output at 4096, which is big * enough to allow any non-overflowing case through * unmodified. Also, we must mask out the OVF exception, which * we won't want left in the FP status word in the case where * rlogh becomes huge and _negative_ (since that will be an * underflow from the perspective of powf's return value, not * an overflow). */ savedexcepts = __ieee_status(0,0) & (FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW); PHMULTIPLY_SATURATE(rlogh, rlogl, logh, logl, yh, yl, 4096.0f); __ieee_status(FE_IEEE_OVERFLOW | FE_IEEE_UNDERFLOW, savedexcepts); } /* * And raise 2 to the power of whatever that gave. Again, this * is done in three parts: the fractional part of our input is * fed through a polynomial approximation, all but the bottom * four bits of the integer part go straight into the exponent, * and the bottom four bits of the integer part index into a * lookup table of powers of 2^(1/16) in prec-and-a-half. */ { float rlog = rlogh + rlogl; int i16 = (rlog + (rlog < 0 ? -0.5f : +0.5f)); float rlogi = i16 >> 4; float x = rlogl + (rlogh - i16); static const float powersof2to1over16top[16] = { 0x1p+0F, 0x1.0b4p+0F, 0x1.172p+0F, 0x1.238p+0F, 0x1.306p+0F, 0x1.3dep+0F, 0x1.4bep+0F, 0x1.5aap+0F, 0x1.6ap+0F, 0x1.7ap+0F, 0x1.8acp+0F, 0x1.9c4p+0F, 0x1.ae8p+0F, 0x1.c18p+0F, 0x1.d58p+0F, 0x1.ea4p+0F }; static const float powersof2to1over16bot[16] = { 0x0p+0F, 0x1.586cfap-12F, 0x1.7078fap-13F, 0x1.e9b9d6p-14F, 0x1.fc1464p-13F, 0x1.4c9824p-13F, 0x1.dad536p-12F, 0x1.07dd48p-12F, 0x1.3cccfep-13F, 0x1.1473ecp-12F, 0x1.ca8456p-13F, 0x1.230548p-13F, 0x1.3f32b6p-13F, 0x1.9bdd86p-12F, 0x1.8dcfbap-16F, 0x1.5f454ap-13F }; static const float powersof2to1over16all[16] = { 0x1p+0F, 0x1.0b5586p+0F, 0x1.172b84p+0F, 0x1.2387a6p+0F, 0x1.306fep+0F, 0x1.3dea64p+0F, 0x1.4bfdaep+0F, 0x1.5ab07ep+0F, 0x1.6a09e6p+0F, 0x1.7a1148p+0F, 0x1.8ace54p+0F, 0x1.9c4918p+0F, 0x1.ae89fap+0F, 0x1.c199bep+0F, 0x1.d5818ep+0F, 0x1.ea4afap+0F }; /* * Coefficients generated using the command ./auxiliary/remez.jl --suffix=f -- '-1/BigFloat(2)' '+1/BigFloat(2)' 2 0 'expm1(x*log(BigFloat(2))/16)/x' */ float p = x * ( 4.332169876512769231967668743345473181486157887703125512683507537369503902991722e-02f+x*(9.384123108485637159805511308039285411735300871134684682779057580789341719567367e-04f+x*(1.355120515540562256928614563584948866224035897564701496826514330445829352922309e-05f)) ); int index = (i16 & 15); p = powersof2to1over16top[index] + (powersof2to1over16bot[index] + powersof2to1over16all[index]*p); if ( fabsf(rlogi) < 126.0f ) { return sign * p * fhex((unsigned)((127.0f+rlogi) * 8388608.0f)); } else if ( fabsf(rlogi) < 192.0f ) { int i = rlogi; float ret; ret = sign * p * fhex((unsigned)((0x7f+i/2) * 8388608)) * fhex((unsigned)((0x7f+i-i/2) * 8388608)); if ((fai(ret) << 1) == 0xFF000000) return MATHERR_POWF_OFL(x, y, sign); else if ((fai(ret) << 1) == 0) return MATHERR_POWF_UFL(x, y, sign); else return FLOAT_CHECKDENORM(ret); } else { if (rlogi < 0) return MATHERR_POWF_UFL(x, y, sign); else return MATHERR_POWF_OFL(x, y, sign); } } }