/* Copyright (C) 2007-2022 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.

You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
<http://www.gnu.org/licenses/>.  */
- #ifndef _SQRT_MACROS_H_
- #define _SQRT_MACROS_H_
- #define FENCE __fence
- #if DOUBLE_EXTENDED_ON
- extern BINARY80 SQRT80 (BINARY80);
- __BID_INLINE__ UINT64
- short_sqrt128 (UINT128 A10) {
- BINARY80 lx, ly, l64;
- int_float f64;
- // 2^64
- f64.i = 0x5f800000;
- l64 = (BINARY80) f64.d;
- lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
- ly = SQRT80 (lx);
- return (UINT64) ly;
- }
- __BID_INLINE__ void
- long_sqrt128 (UINT128 * pCS, UINT256 C256) {
- UINT256 C4;
- UINT128 CS;
- UINT64 X;
- SINT64 SE;
- BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
- l1, l0, lp, lCl;
- int_float fx, f64, fm64;
- int *ple = (int *) &lx;
- // 2^64
- f64.i = 0x5f800000;
- l64 = (BINARY80) f64.d;
- l128 = l64 * l64;
- lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
- l2 = (BINARY80) C256.w[2] * l128;
- lx = FENCE (lx + l2);
- l1 = (BINARY80) C256.w[1] * l64;
- lx = FENCE (lx + l1);
- l0 = (BINARY80) C256.w[0];
- lx = FENCE (lx + l0);
- // sqrt(C256)
- lS = SQRT80 (lx);
- // get coefficient
- // 2^(-64)
- fm64.i = 0x1f800000;
- lm64 = (BINARY80) fm64.d;
- CS.w[1] = (UINT64) (lS * lm64);
- CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);
- ///////////////////////////////////////
- // CAUTION!
- // little endian code only
- // add solution for big endian
- //////////////////////////////////////
- lSH = lS;
- *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
- // correction for C256 rounding
- lCl = FENCE (l3 - lx);
- lCl = FENCE (lCl + l2);
- lCl = FENCE (lCl + l1);
- lCl = FENCE (lCl + l0);
- lSL = lS - lSH;
- //////////////////////////////////////////
- // Watch for compiler re-ordering
- //
- /////////////////////////////////////////
- // C256-S^2
- lxL = FENCE (lx - lSH * lSH);
- lp = lSH * lSL;
- lp += lp;
- lxL = FENCE (lxL - lp);
- lSL *= lSL;
- lxL = FENCE (lxL - lSL);
- lCl += lxL;
- // correction term
- lE = lCl / (lS + lS);
- // get low part of coefficient
- X = CS.w[0];
- if (lCl >= 0) {
- SE = (SINT64) (lE);
- CS.w[0] += SE;
- if (CS.w[0] < X)
- CS.w[1]++;
- } else {
- SE = (SINT64) (-lE);
- CS.w[0] -= SE;
- if (CS.w[0] > X)
- CS.w[1]--;
- }
- pCS->w[0] = CS.w[0];
- pCS->w[1] = CS.w[1];
- }
- #else
- extern double sqrt (double);
// Return a 64-bit integer approximation of sqrt(A10) for a 128-bit
// unsigned input, rounded to nearest.  Strategy: take a double-precision
// reciprocal-sqrt estimate RS ~= 1/sqrt(A10), form A10*RS ~= sqrt(A10)
// in multi-word integer arithmetic, and apply a first-order error
// correction before the final rounding shift.
__BID_INLINE__ UINT64
short_sqrt128 (UINT128 A10) {
  UINT256 ARS, ARS0, AE0, AE, S;
  UINT64 MY, ES, CY;
  double lx, l64;
  int_double f64, ly;
  int ey, k;
  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  // A10 as a double: hi*2^64 + lo (rounded).
  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
  // Initial estimate ly ~= 1/sqrt(A10).
  ly.d = 1.0 / sqrt (lx);
  // MY = 53-bit mantissa of ly with the implicit bit restored;
  // ey = negated unbiased exponent of ly.
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
  // A10*RS^2
  __mul_64x128_to_192 (ARS0, MY, A10);
  __mul_64x192_to_256 (ARS, MY, ARS0);
  // shr by 2*ey+40, to get a 64-bit value
  k = (ey << 1) + 104 - 64;
  if (k >= 128) {
    if (k > 128)
      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
    else
      ES = ARS.w[2];
  } else {
    if (k >= 64) {
      // Drop a whole word first, then shift the remainder.
      ARS.w[0] = ARS.w[1];
      ARS.w[1] = ARS.w[2];
      k -= 64;
    }
    if (k) {
      __shr_128 (ARS, ARS, k);
    }
    ES = ARS.w[0];
  }
  // ES is the signed relative error term; the arithmetic shift halves
  // it (eps = eps0/2) while preserving its sign.
  ES = ((SINT64) ES) >> 1;
  if (((SINT64) ES) < 0) {
    ES = -ES;
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);
    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];
    // S = A*RS + A*RS*eps (first-order correction, eps was negative).
    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
  } else {
    // A*RS*eps (scaled by 2^64)
    __mul_64x192_to_256 (AE0, ES, ARS0);
    AE.w[0] = AE0.w[1];
    AE.w[1] = AE0.w[2];
    AE.w[2] = AE0.w[3];
    // S = A*RS - A*RS*eps (first-order correction, eps was positive).
    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
  }
  // Remove the remaining scaling: shift right by ey+51 (word-wise
  // first, then by the residual bit count).
  k = ey + 51;
  if (k >= 64) {
    if (k >= 128) {
      S.w[0] = S.w[2];
      S.w[1] = 0;
      k -= 128;
    } else {
      S.w[0] = S.w[1];
      S.w[1] = S.w[2];
    }
    k -= 64;
  }
  if (k) {
    __shr_128 (S, S, k);
  }
  // Round to nearest: add half an ulp of the final result, drop the
  // guard bit.
  return (UINT64) ((S.w[0] + 1) >> 1);
}
// Compute the coefficient of sqrt(C256) as a 128-bit integer in *pCS,
// rounded to nearest.  Strategy: double-precision reciprocal-sqrt
// estimate RS ~= 1/sqrt(C256), then S = C256*RS refined with first- and
// second-order error terms (eps and 3/2*eps^2) in multi-word integer
// arithmetic.
__BID_INLINE__ void
long_sqrt128 (UINT128 * pCS, UINT256 C256) {
  UINT512 ARS0, ARS;
  UINT256 ARS00, AE, AE2, S;
  UINT128 ES, ES2, ARS1;
  UINT64 ES32, CY, MY;
  double l64, l128, lx, l2, l1, l0;
  int_double f64, ly;
  int ey, k, k2;
  // 2^64
  f64.i = 0x43f0000000000000ull;
  l64 = f64.d;
  l128 = l64 * l64;
  // C256 as a double: w3*2^192 + w2*2^128 + w1*2^64 + w0; FENCE pins
  // the summation order.
  lx = (double) C256.w[3] * l64 * l128;
  l2 = (double) C256.w[2] * l128;
  lx = FENCE (lx + l2);
  l1 = (double) C256.w[1] * l64;
  lx = FENCE (lx + l1);
  l0 = (double) C256.w[0];
  lx = FENCE (lx + l0);
  // sqrt(C256)
  ly.d = 1.0 / sqrt (lx);
  // MY = 53-bit mantissa of ly with the implicit bit restored;
  // ey = negated unbiased exponent of ly.
  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
  ey = 0x3ff - (ly.i >> 52);
  // A10*RS^2, scaled by 2^(2*ey+104)
  __mul_64x256_to_320 (ARS0, MY, C256);
  __mul_64x320_to_384 (ARS, MY, ARS0);
  // shr by k=(2*ey+104)-128
  // expect k is in the range (192, 256) if result in [10^33, 10^34)
  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
  k = (ey << 1) + 104 - 128 - 192;
  k2 = 64 - k;
  // 128-bit signed error term eps, assembled from words 3..5 of ARS.
  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
  // A*RS >> 192 (for error term computation)
  ARS1.w[0] = ARS0.w[3];
  ARS1.w[1] = ARS0.w[4];
  // A*RS>>64
  ARS00.w[0] = ARS0.w[1];
  ARS00.w[1] = ARS0.w[2];
  ARS00.w[2] = ARS0.w[3];
  ARS00.w[3] = ARS0.w[4];
  if (((SINT64) ES.w[1]) < 0) {
    // eps < 0: negate the 128-bit value (two's complement with borrow).
    ES.w[0] = -ES.w[0];
    ES.w[1] = -ES.w[1];
    if (ES.w[0])
      ES.w[1]--;
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);
    // S = A*RS + A*RS*eps (first-order correction).
    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
  } else {
    // A*RS*eps
    __mul_128x128_to_256 (AE, ES, ARS1);
    // S = A*RS - A*RS*eps (first-order correction).
    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
  }
  // 3/2*eps^2, scaled by 2^128
  ES32 = ES.w[1] + (ES.w[1] >> 1);
  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
  // A*RS*3/2*eps^2
  __mul_128x128_to_256 (AE2, ES2, ARS1);
  // result, scaled by 2^(ey+52-64)
  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
  S.w[3] = S.w[3] + AE2.w[3] + CY;
  // k in (0, 64)
  k = ey + 51 - 128;
  k2 = 64 - k;
  // Remove the remaining scaling across the word boundary.
  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);
  // round to nearest
  S.w[0]++;
  if (!S.w[0])
    S.w[1]++;
  // Drop the guard bit to form the final 128-bit coefficient.
  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
  pCS->w[1] = S.w[1] >> 1;
}
- #endif
- #endif