diff --git a/libgcc/config/libbid/bid_sqrt_macros.h b/libgcc/config/libbid/bid_sqrt_macros.h
new file mode 100644
index 00000000..19150a22
--- /dev/null
+++ b/libgcc/config/libbid/bid_sqrt_macros.h
@@ -0,0 +1,331 @@
+/* Copyright (C) 2007, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef _SQRT_MACROS_H_
+#define _SQRT_MACROS_H_
+
+#define FENCE __fence
+
+#if DOUBLE_EXTENDED_ON
+
+extern BINARY80 SQRT80 (BINARY80);
+
+
+__BID_INLINE__ UINT64
+short_sqrt128 (UINT128 A10) {
+  BINARY80 lx, ly, l64;
+  int_float f64;
+
+  // 2^64
+  f64.i = 0x5f800000;
+  l64 = (BINARY80) f64.d;
+  lx = (BINARY80) A10.w[1] * l64 + (BINARY80) A10.w[0];
+  ly = SQRT80 (lx);
+  return (UINT64) ly;
+}
+
+
+__BID_INLINE__ void
+long_sqrt128 (UINT128 * pCS, UINT256 C256) {
+  UINT256 C4;
+  UINT128 CS;
+  UINT64 X;
+  SINT64 SE;
+  BINARY80 l64, lm64, l128, lxL, lx, ly, lS, lSH, lSL, lE, l3, l2,
+    l1, l0, lp, lCl;
+  int_float fx, f64, fm64;
+  int *ple = (int *) &lx;
+
+  // 2^64
+  f64.i = 0x5f800000;
+  l64 = (BINARY80) f64.d;
+
+  l128 = l64 * l64;
+  lx = l3 = (BINARY80) C256.w[3] * l64 * l128;
+  l2 = (BINARY80) C256.w[2] * l128;
+  lx = FENCE (lx + l2);
+  l1 = (BINARY80) C256.w[1] * l64;
+  lx = FENCE (lx + l1);
+  l0 = (BINARY80) C256.w[0];
+  lx = FENCE (lx + l0);
+  // sqrt(C256)
+  lS = SQRT80 (lx);
+
+  // get coefficient
+  // 2^(-64)
+  fm64.i = 0x1f800000;
+  lm64 = (BINARY80) fm64.d;
+  CS.w[1] = (UINT64) (lS * lm64);
+  CS.w[0] = (UINT64) (lS - (BINARY80) CS.w[1] * l64);
+
+  ///////////////////////////////////////
+  // CAUTION!
+  // little endian code only
+  // add solution for big endian
+  //////////////////////////////////////
+  lSH = lS;
+  *((UINT64 *) & lSH) &= 0xffffffff00000000ull;
+
+  // correction for C256 rounding
+  lCl = FENCE (l3 - lx);
+  lCl = FENCE (lCl + l2);
+  lCl = FENCE (lCl + l1);
+  lCl = FENCE (lCl + l0);
+
+  lSL = lS - lSH;
+
+  //////////////////////////////////////////
+  // Watch for compiler re-ordering
+  //
+  /////////////////////////////////////////
+  // C256-S^2
+  lxL = FENCE (lx - lSH * lSH);
+  lp = lSH * lSL;
+  lp += lp;
+  lxL = FENCE (lxL - lp);
+  lSL *= lSL;
+  lxL = FENCE (lxL - lSL);
+  lCl += lxL;
+
+  // correction term
+  lE = lCl / (lS + lS);
+
+  // get low part of coefficient
+  X = CS.w[0];
+  if (lCl >= 0) {
+    SE = (SINT64) (lE);
+    CS.w[0] += SE;
+    if (CS.w[0] < X)
+      CS.w[1]++;
+  } else {
+    SE = (SINT64) (-lE);
+    CS.w[0] -= SE;
+    if (CS.w[0] > X)
+      CS.w[1]--;
+  }
+
+  pCS->w[0] = CS.w[0];
+  pCS->w[1] = CS.w[1];
+}
+
+#else
+
+extern double sqrt (double);
+
+__BID_INLINE__ UINT64
+short_sqrt128 (UINT128 A10) {
+  UINT256 ARS, ARS0, AE0, AE, S;
+
+  UINT64 MY, ES, CY;
+  double lx, l64;
+  int_double f64, ly;
+  int ey, k;
+
+  // 2^64
+  f64.i = 0x43f0000000000000ull;
+  l64 = f64.d;
+  lx = (double) A10.w[1] * l64 + (double) A10.w[0];
+  ly.d = 1.0 / sqrt (lx);
+
+  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
+  ey = 0x3ff - (ly.i >> 52);
+
+  // A10*RS^2
+  __mul_64x128_to_192 (ARS0, MY, A10);
+  __mul_64x192_to_256 (ARS, MY, ARS0);
+
+  // shr by 2*ey+40, to get a 64-bit value
+  k = (ey << 1) + 104 - 64;
+  if (k >= 128) {
+    if (k > 128)
+      ES = (ARS.w[2] >> (k - 128)) | (ARS.w[3] << (192 - k));
+    else
+      ES = ARS.w[2];
+  } else {
+    if (k >= 64) {
+      ARS.w[0] = ARS.w[1];
+      ARS.w[1] = ARS.w[2];
+      k -= 64;
+    }
+    if (k) {
+      __shr_128 (ARS, ARS, k);
+    }
+    ES = ARS.w[0];
+  }
+
+  ES = ((SINT64) ES) >> 1;
+
+  if (((SINT64) ES) < 0) {
+    ES = -ES;
+
+    // A*RS*eps (scaled by 2^64)
+    __mul_64x192_to_256 (AE0, ES, ARS0);
+
+    AE.w[0] = AE0.w[1];
+    AE.w[1] = AE0.w[2];
+    AE.w[2] = AE0.w[3];
+
+    __add_carry_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
+    __add_carry_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
+    S.w[2] = ARS0.w[2] + AE.w[2] + CY;
+  } else {
+    // A*RS*eps (scaled by 2^64)
+    __mul_64x192_to_256 (AE0, ES, ARS0);
+
+    AE.w[0] = AE0.w[1];
+    AE.w[1] = AE0.w[2];
+    AE.w[2] = AE0.w[3];
+
+    __sub_borrow_out (S.w[0], CY, ARS0.w[0], AE.w[0]);
+    __sub_borrow_in_out (S.w[1], CY, ARS0.w[1], AE.w[1], CY);
+    S.w[2] = ARS0.w[2] - AE.w[2] - CY;
+  }
+
+  k = ey + 51;
+
+  if (k >= 64) {
+    if (k >= 128) {
+      S.w[0] = S.w[2];
+      S.w[1] = 0;
+      k -= 128;
+    } else {
+      S.w[0] = S.w[1];
+      S.w[1] = S.w[2];
+    }
+    k -= 64;
+  }
+  if (k) {
+    __shr_128 (S, S, k);
+  }
+
+
+  return (UINT64) ((S.w[0] + 1) >> 1);
+
+}
+
+
+
+__BID_INLINE__ void
+long_sqrt128 (UINT128 * pCS, UINT256 C256) {
+  UINT512 ARS0, ARS;
+  UINT256 ARS00, AE, AE2, S;
+  UINT128 ES, ES2, ARS1;
+  UINT64 ES32, CY, MY;
+  double l64, l128, lx, l2, l1, l0;
+  int_double f64, ly;
+  int ey, k, k2;
+
+  // 2^64
+  f64.i = 0x43f0000000000000ull;
+  l64 = f64.d;
+
+  l128 = l64 * l64;
+  lx = (double) C256.w[3] * l64 * l128;
+  l2 = (double) C256.w[2] * l128;
+  lx = FENCE (lx + l2);
+  l1 = (double) C256.w[1] * l64;
+  lx = FENCE (lx + l1);
+  l0 = (double) C256.w[0];
+  lx = FENCE (lx + l0);
+  // sqrt(C256)
+  ly.d = 1.0 / sqrt (lx);
+
+  MY = (ly.i & 0x000fffffffffffffull) | 0x0010000000000000ull;
+  ey = 0x3ff - (ly.i >> 52);
+
+  // A10*RS^2, scaled by 2^(2*ey+104)
+  __mul_64x256_to_320 (ARS0, MY, C256);
+  __mul_64x320_to_384 (ARS, MY, ARS0);
+
+  // shr by k=(2*ey+104)-128
+  // expect k is in the range (192, 256) if result in [10^33, 10^34)
+  // apply an additional signed shift by 1 at the same time (to get eps=eps0/2)
+  k = (ey << 1) + 104 - 128 - 192;
+  k2 = 64 - k;
+  ES.w[0] = (ARS.w[3] >> (k + 1)) | (ARS.w[4] << (k2 - 1));
+  ES.w[1] = (ARS.w[4] >> k) | (ARS.w[5] << k2);
+  ES.w[1] = ((SINT64) ES.w[1]) >> 1;
+
+  // A*RS >> 192 (for error term computation)
+  ARS1.w[0] = ARS0.w[3];
+  ARS1.w[1] = ARS0.w[4];
+
+  // A*RS>>64
+  ARS00.w[0] = ARS0.w[1];
+  ARS00.w[1] = ARS0.w[2];
+  ARS00.w[2] = ARS0.w[3];
+  ARS00.w[3] = ARS0.w[4];
+
+  if (((SINT64) ES.w[1]) < 0) {
+    ES.w[0] = -ES.w[0];
+    ES.w[1] = -ES.w[1];
+    if (ES.w[0])
+      ES.w[1]--;
+
+    // A*RS*eps
+    __mul_128x128_to_256 (AE, ES, ARS1);
+
+    __add_carry_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
+    __add_carry_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
+    __add_carry_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
+    S.w[3] = ARS00.w[3] + AE.w[3] + CY;
+  } else {
+    // A*RS*eps
+    __mul_128x128_to_256 (AE, ES, ARS1);
+
+    __sub_borrow_out (S.w[0], CY, ARS00.w[0], AE.w[0]);
+    __sub_borrow_in_out (S.w[1], CY, ARS00.w[1], AE.w[1], CY);
+    __sub_borrow_in_out (S.w[2], CY, ARS00.w[2], AE.w[2], CY);
+    S.w[3] = ARS00.w[3] - AE.w[3] - CY;
+  }
+
+  // 3/2*eps^2, scaled by 2^128
+  ES32 = ES.w[1] + (ES.w[1] >> 1);
+  __mul_64x64_to_128 (ES2, ES32, ES.w[1]);
+  // A*RS*3/2*eps^2
+  __mul_128x128_to_256 (AE2, ES2, ARS1);
+
+  // result, scaled by 2^(ey+52-64)
+  __add_carry_out (S.w[0], CY, S.w[0], AE2.w[0]);
+  __add_carry_in_out (S.w[1], CY, S.w[1], AE2.w[1], CY);
+  __add_carry_in_out (S.w[2], CY, S.w[2], AE2.w[2], CY);
+  S.w[3] = S.w[3] + AE2.w[3] + CY;
+
+  // k in (0, 64)
+  k = ey + 51 - 128;
+  k2 = 64 - k;
+  S.w[0] = (S.w[1] >> k) | (S.w[2] << k2);
+  S.w[1] = (S.w[2] >> k) | (S.w[3] << k2);
+
+  // round to nearest
+  S.w[0]++;
+  if (!S.w[0])
+    S.w[1]++;
+
+  pCS->w[0] = (S.w[1] << 63) | (S.w[0] >> 1);
+  pCS->w[1] = S.w[1] >> 1;
+
+}
+
+#endif
+#endif
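
Both branches of the patch follow the same estimate-then-correct pattern: take a hardware square root at whatever precision is available (SQRT80 in the BINARY80 path, 1.0 / sqrt (lx) in double in the #else path), then repair the low-order bits of the truncated result with an exactly computed error term (lE = lCl / (lS + lS) in the extended path, the ES/AE terms in the 64-bit path). The following is a minimal sketch of that pattern for a plain 64-bit operand; it is not part of the patch, and the helper name isqrt64 is hypothetical.

#include <stdint.h>
#include <stdio.h>
#include <math.h>

/* Sketch only, not part of the patch: floor(sqrt(a)) for a 64-bit
   operand via a double-precision estimate plus an exact integer
   fix-up, mirroring the estimate-then-correct structure above.  */
static uint64_t
isqrt64 (uint64_t a)
{
  uint64_t s;

  if (a == 0)
    return 0;
  /* Estimate: (double) a may round, so s can be off by a few units
     in the last place.  For a >= 1 this gives s >= 1.  */
  s = (uint64_t) sqrt ((double) a);
  /* Exact correction in integer arithmetic, playing the role of the
     error terms (lCl, lE) that the patch computes with FENCE.  */
  while (s > a / s)		/* s*s > a: estimate too high */
    s--;
  while (s + 1 <= a / (s + 1))	/* (s+1)^2 <= a: estimate too low */
    s++;
  return s;
}

int
main (void)
{
  uint64_t a = 0x0123456789abcdefull;
  /* prints 286331153, i.e. 0x11111111 */
  printf ("%llu\n", (unsigned long long) isqrt64 (a));
  return 0;
}

The 128-bit paths cannot collapse to a few native operations like this sketch: a decimal128 coefficient holds up to 34 decimal digits, well beyond double (and even BINARY80) precision, which is why the patch forms the residual and correction exactly with the multiword helpers (__mul_64x256_to_320, __mul_128x128_to_256, __add_carry_out, ...) before rounding.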