From 0a95c33d1160a6a284cec310ecf00ae04a29137c Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Tue, 13 Aug 2019 23:25:14 +0000 Subject: [PATCH] Add fma, fused-multiply/add. Not yet open-coded anywhere. Will be a huge pain on x86. No aarch64 flonum open-coding at all yet. (Maybe flo:fast-fma? should return false if it's not open-coded...) --- doc/ref-manual/numbers.texi | 27 +++ src/microcode/configure.ac | 2 +- src/microcode/flonum.c | 17 ++ src/microcode/fma.c | 315 +++++++++++++++++++++++++++ src/microcode/fma.h | 40 ++++ src/microcode/makegen/files-core.scm | 1 + src/relnotes/fma | 8 + src/runtime/primitive-arithmetic.scm | 3 + src/runtime/runtime.pkg | 3 + tests/runtime/test-arith.scm | 11 +- 10 files changed, 425 insertions(+), 2 deletions(-) create mode 100644 src/microcode/fma.c create mode 100644 src/microcode/fma.h create mode 100644 src/relnotes/fma diff --git a/doc/ref-manual/numbers.texi b/doc/ref-manual/numbers.texi index ed0d6244d..5c81be967 100644 --- a/doc/ref-manual/numbers.texi +++ b/doc/ref-manual/numbers.texi @@ -2025,6 +2025,33 @@ These procedures are the standard arithmetic operations on flonums. When compiled, they do not check the types of their arguments. @end deffn +@deffn procedure flo:*+ flonum1 flonum2 flonum3 +@deffnx procedure flo:fma flonum1 flonum2 flonum3 +@deffnx procedure flo:fast-fma? +Fused multiply-add: +@code{(flo:*+ @var{u} @var{v} @var{a})} computes @math{uv+a} correctly +rounded, with no intermediate overflow or underflow arising from +@math{uv}. +In contrast, @code{(flo:+ (flo:* @var{u} @var{v}) @var{a})} may have +two rounding errors, and can overflow or underflow if @math{uv} is too +large or too small even if @math{uv + a} is normal. +@code{Flo:fma} is an alias for @code{flo:*+} with the more familiar +name used in other languages like C. + +@code{Flo:fast-fma?} returns true if the implementation of fused +multiply-add is supported by fast hardware, and false if it is +emulated using Dekker's double-precision algorithm in software. + +@example +@group +(flo:+ (flo:* 1.2e100 2e208) -1.4e308) + @result{} +inf.0 @r{; (raises overflow)} +(flo:*+ 1.2e100 2e208 -1.4e308) + @result{} 1e308 +@end group +@end example +@end deffn + @deffn procedure flo:negate flonum This procedure returns the negation of its argument. When compiled, it does not check the type of its argument. diff --git a/src/microcode/configure.ac b/src/microcode/configure.ac index 87e01329c..213685d38 100644 --- a/src/microcode/configure.ac +++ b/src/microcode/configure.ac @@ -725,7 +725,7 @@ AC_CHECK_FUNCS([clock_gettime closefrom ctermid]) AC_CHECK_FUNCS([dup2]) AC_CHECK_FUNCS([expm1]) AC_CHECK_FUNCS([fcntl fdatasync]) -AC_CHECK_FUNCS([floor fmod fpathconf]) +AC_CHECK_FUNCS([floor fma fmod fpathconf]) AC_CHECK_FUNCS([frexp fsync fsync_range ftruncate]) AC_CHECK_FUNCS([getcwd gethostbyname gethostname getlogin getpagesize getpgrp]) AC_CHECK_FUNCS([getpt gettimeofday getwd grantpt]) diff --git a/src/microcode/flonum.c b/src/microcode/flonum.c index b8c43fe0e..a5c6df166 100644 --- a/src/microcode/flonum.c +++ b/src/microcode/flonum.c @@ -32,6 +32,7 @@ USA. #include "osscheme.h" /* error_unimplemented_primitive -- foo */ #include "prims.h" #include "ctassert.h" +#include "fma.h" double arg_flonum (int arg_number) @@ -130,6 +131,22 @@ DEFINE_PRIMITIVE ("FLONUM-ABS", Prim_flonum_abs, 1, 1, 0) ((UINT64_C (0x7fffffffffffffff)) & (arg_flonum_binary64 (1))); } +DEFINE_PRIMITIVE ("FLONUM-FMA", Prim_flonum_fma, 3, 3, 0) +{ + PRIMITIVE_HEADER (3); + FLONUM_RESULT (fma ((arg_flonum (1)), (arg_flonum (2)), (arg_flonum (3)))); +} + +DEFINE_PRIMITIVE ("FLONUM-FAST-FMA?", Prim_flonum_fast_fma_p, 0, 0, 0) +{ + PRIMITIVE_HEADER (0); +#ifdef FP_FAST_FMA + BOOLEAN_RESULT (1); +#else + BOOLEAN_RESULT (0); +#endif +} + static inline void invalid_if_unordered (double x, double y) { diff --git a/src/microcode/fma.c b/src/microcode/fma.c new file mode 100644 index 000000000..d81827efd --- /dev/null +++ b/src/microcode/fma.c @@ -0,0 +1,315 @@ +/* $NetBSD: s_fma.c,v 1.7 2017/05/06 18:02:52 christos Exp $ */ + +/*- + * Copyright (c) 2005-2011 David Schultz + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* Fused Multiply-Add */ + +#include "fma.h" + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include + +/* XXX Must come before other local includes because cmpintmd.h is busted. */ +#include "scheme.h" + +#include "floenv.h" + +#ifndef HAVE_FMA + +static inline uint64_t +extract_word64(double d) +{ + union { uint64_t w64; double d; } u; + (u.d) = d; + return (u.w64); +} + +static inline double +insert_word64(uint64_t w64) +{ + union { uint64_t w64; double d; } u; + (u.w64) = w64; + return (u.d); +} + +#define EXTRACT_WORD64(w64, d) (w64) = (extract_word64 (d)) +#define INSERT_WORD64(d, w64) (d) = (insert_word64 (w64)) + +/*****************************************************************************/ + +/* + * A struct dd represents a floating-point number with twice the precision + * of a double. We maintain the invariant that "hi" stores the 53 high-order + * bits of the result. + */ +struct dd { + double hi; + double lo; +}; + +/* + * Compute a+b exactly, returning the exact result in a struct dd. We assume + * that both a and b are finite, but make no assumptions about their relative + * magnitudes. + */ +static inline struct dd +dd_add(double a, double b) +{ + struct dd ret; + double s; + + ret.hi = a + b; + s = ret.hi - a; + ret.lo = (a - (ret.hi - s)) + (b - s); + return (ret); +} + +/* + * Compute a+b, with a small tweak: The least significant bit of the + * result is adjusted into a sticky bit summarizing all the bits that + * were lost to rounding. This adjustment negates the effects of double + * rounding when the result is added to another number with a higher + * exponent. For an explanation of round and sticky bits, see any reference + * on FPU design, e.g., + * + * J. Coonen. An Implementation Guide to a Proposed Standard for + * Floating-Point Arithmetic. Computer, vol. 13, no. 1, Jan 1980. + */ +static inline double +add_adjusted(double a, double b) +{ + struct dd sum; + uint64_t hibits, lobits; + + sum = dd_add(a, b); + if (sum.lo != 0) { + EXTRACT_WORD64(hibits, sum.hi); + if ((hibits & 1) == 0) { + /* hibits += (int)copysign(1.0, sum.hi * sum.lo) */ + EXTRACT_WORD64(lobits, sum.lo); + hibits += 1 - ((hibits ^ lobits) >> 62); + INSERT_WORD64(sum.hi, hibits); + } + } + return (sum.hi); +} + +/* + * Compute ldexp(a+b, scale) with a single rounding error. It is assumed + * that the result will be subnormal, and care is taken to ensure that + * double rounding does not occur. + */ +static inline double +add_and_denormalize(double a, double b, int scale) +{ + struct dd sum; + uint64_t hibits, lobits; + int bits_lost; + + sum = dd_add(a, b); + + /* + * If we are losing at least two bits of accuracy to denormalization, + * then the first lost bit becomes a round bit, and we adjust the + * lowest bit of sum.hi to make it a sticky bit summarizing all the + * bits in sum.lo. With the sticky bit adjusted, the hardware will + * break any ties in the correct direction. + * + * If we are losing only one bit to denormalization, however, we must + * break the ties manually. + */ + if (sum.lo != 0) { + EXTRACT_WORD64(hibits, sum.hi); + bits_lost = -((int)(hibits >> 52) & 0x7ff) - scale + 1; + if ((bits_lost != 1) ^ (int)(hibits & 1)) { + /* hibits += (int)copysign(1.0, sum.hi * sum.lo) */ + EXTRACT_WORD64(lobits, sum.lo); + hibits += 1 - (((hibits ^ lobits) >> 62) & 2); + INSERT_WORD64(sum.hi, hibits); + } + } + return (ldexp(sum.hi, scale)); +} + +/* + * Compute a*b exactly, returning the exact result in a struct dd. We assume + * that both a and b are normalized, so no underflow or overflow will occur. + * The current rounding mode must be round-to-nearest. + */ +static inline struct dd +dd_mul(double a, double b) +{ + static const double split = 0x1p27 + 1.0; + struct dd ret; + double ha, hb, la, lb, p, q; + + p = a * split; + ha = a - p; + ha += p; + la = a - ha; + + p = b * split; + hb = b - p; + hb += p; + lb = b - hb; + + p = ha * hb; + q = ha * lb + la * hb; + + ret.hi = p + q; + ret.lo = p - ret.hi + q + la * lb; + return (ret); +} + +/* + * Fused multiply-add: Compute x * y + z with a single rounding error. + * + * We use scaling to avoid overflow/underflow, along with the + * canonical precision-doubling technique adapted from: + * + * Dekker, T. A Floating-Point Technique for Extending the + * Available Precision. Numer. Math. 18, 224-242 (1971). + * + * This algorithm is sensitive to the rounding precision. FPUs such + * as the i387 must be set in double-precision mode if variables are + * to be stored in FP registers in order to avoid incorrect results. + * This is the default on FreeBSD, but not on many other systems. + * + * Hardware instructions should be used on architectures that support it, + * since this implementation will likely be several times slower. + */ +double +fma(double x, double y, double z) +{ + double xs, ys, zs, adj; + struct dd xy, r; + int oround; + int ex, ey, ez; + int spread; + + /* + * Handle special cases. The order of operations and the particular + * return values here are crucial in handling special cases involving + * infinities, NaNs, overflows, and signed zeroes correctly. + */ + if (x == 0.0 || y == 0.0) + return (x * y + z); + if (z == 0.0) + return (x * y); + if (!isfinite(x) || !isfinite(y)) + return (x * y + z); + if (!isfinite(z)) + return (z); + + xs = frexp(x, &ex); + ys = frexp(y, &ey); + zs = frexp(z, &ez); + oround = fegetround(); + spread = ex + ey - ez; + + /* + * If x * y and z are many orders of magnitude apart, the scaling + * will overflow, so we handle these cases specially. Rounding + * modes other than FE_TONEAREST are painful. + */ + if (spread < -DBL_MANT_DIG) { + feraiseexcept(FE_INEXACT); + if (!isnormal(z)) + feraiseexcept(FE_UNDERFLOW); + switch (oround) { + case FE_TONEAREST: + return (z); + case FE_TOWARDZERO: + if ((x > 0.0) ^ (y < 0.0) ^ (z < 0.0)) + return (z); + else + return (nextafter(z, 0)); + case FE_DOWNWARD: + if ((x > 0.0) ^ (y < 0.0)) + return (z); + else + return (nextafter(z, -INFINITY)); + default: /* FE_UPWARD */ + if ((x > 0.0) ^ (y < 0.0)) + return (nextafter(z, INFINITY)); + else + return (z); + } + } + if (spread <= DBL_MANT_DIG * 2) + zs = ldexp(zs, -spread); + else + zs = copysign(DBL_MIN, zs); + + fesetround(FE_TONEAREST); + + /* + * Basic approach for round-to-nearest: + * + * (xy.hi, xy.lo) = x * y (exact) + * (r.hi, r.lo) = xy.hi + z (exact) + * adj = xy.lo + r.lo (inexact; low bit is sticky) + * result = r.hi + adj (correctly rounded) + */ + xy = dd_mul(xs, ys); + r = dd_add(xy.hi, zs); + + spread = ex + ey; + + if (r.hi == 0.0) { + /* + * When the addends cancel to 0, ensure that the result has + * the correct sign. + */ + fesetround(oround); + { + volatile double vzs = zs; /* XXX gcc CSE bug workaround */ + return (xy.hi + vzs + ldexp(xy.lo, spread)); + } + } + + if (oround != FE_TONEAREST) { + /* + * There is no need to worry about double rounding in directed + * rounding modes. + */ + fesetround(oround); + adj = r.lo + xy.lo; + return (ldexp(r.hi + adj, spread)); + } + + adj = add_adjusted(r.lo, xy.lo); + if (spread + ilogb(r.hi) > -1023) + return (ldexp(r.hi + adj, spread)); + else + return (add_and_denormalize(r.hi, adj, spread)); +} + +#endif diff --git a/src/microcode/fma.h b/src/microcode/fma.h new file mode 100644 index 000000000..2e51396f6 --- /dev/null +++ b/src/microcode/fma.h @@ -0,0 +1,40 @@ +/* -*-C-*- + +Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, + 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, + 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, + 2017, 2018, 2019 Massachusetts Institute of Technology + +This file is part of MIT/GNU Scheme. + +MIT/GNU Scheme is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at +your option) any later version. + +MIT/GNU Scheme is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MIT/GNU Scheme; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, +USA. + +*/ + +/* Fused Multiply-Add */ + +#ifndef SCHEME_FMA_H +#define SCHEME_FMA_H 1 + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#ifndef HAVE_FMA +double fma(double, double, double); +#endif + +#endif /* SCHEME_FMA_H */ diff --git a/src/microcode/makegen/files-core.scm b/src/microcode/makegen/files-core.scm index 5afb1835b..adc839312 100644 --- a/src/microcode/makegen/files-core.scm +++ b/src/microcode/makegen/files-core.scm @@ -49,6 +49,7 @@ USA. "fixnum" "floenv" "flonum" +"fma" "gcloop" "generic" "hooks" diff --git a/src/relnotes/fma b/src/relnotes/fma new file mode 100644 index 000000000..cb2cfaccb --- /dev/null +++ b/src/relnotes/fma @@ -0,0 +1,8 @@ +New procedures for floating-point fused-multiply/add: + +- (flo:*+ u v a) computes u*v + a correctly rounded with no + intermediate overflow or underflow +- (flo:fma u v) is the same, with a more familiar name +- (flo:fast-fma?) returns true if fma is hardware-supported and false + if it is emulated in software with Dekker's double-precision + algorithm diff --git a/src/runtime/primitive-arithmetic.scm b/src/runtime/primitive-arithmetic.scm index b535bf196..31363b65f 100644 --- a/src/runtime/primitive-arithmetic.scm +++ b/src/runtime/primitive-arithmetic.scm @@ -178,6 +178,9 @@ USA. (flo:- flonum-subtract 2) (flo:* flonum-multiply 2) (flo:/ flonum-divide 2) + (flo:*+ flonum-fma 3) + (flo:fma flonum-fma 3) + (flo:fast-fma? flonum-fast-fma? 0) (flo:modulo flonum-modulo 2) (flo:negate flonum-negate 1) (flo:abs flonum-abs 1) diff --git a/src/runtime/runtime.pkg b/src/runtime/runtime.pkg index 9b54f51e5..f40e82c1d 100644 --- a/src/runtime/runtime.pkg +++ b/src/runtime/runtime.pkg @@ -289,6 +289,7 @@ USA. fix:zero? fixnum? flo:* + flo:*+ flo:+ flo:- flo:/ @@ -319,10 +320,12 @@ USA. flo:exp flo:expm1 flo:expt + flo:fast-fma? flo:finite? flo:flonum? flo:floor flo:floor->exact + flo:fma flo:gamma flo:hypot flo:infinite? diff --git a/tests/runtime/test-arith.scm b/tests/runtime/test-arith.scm index 5cc1f2370..8629177af 100644 --- a/tests/runtime/test-arith.scm +++ b/tests/runtime/test-arith.scm @@ -1105,4 +1105,13 @@ USA. (receive (log-gamma sign) (flo:signed-lgamma x) (assert-eqv (flo:lgamma x) log-gamma) (let ((gamma (* sign (exp log-gamma)))) - (assert-<= (relerr (flo:gamma x) gamma) 1e-15))))) \ No newline at end of file + (assert-<= (relerr (flo:gamma x) gamma) 1e-15))))) + +(define-enumerated-test 'flo:fma + (list (list 1.2e100 2e208 -1.4e308 1e308)) + (lambda (x y z w) + (assert-<= (relerr (no-traps (lambda () (flo:*+ x y z))) w) 1e-15))) + +(define-test 'flo:fast-fma? + (lambda () + ((predicate-assertion boolean? "boolean") (flo:fast-fma?)))) \ No newline at end of file -- 2.25.1