From 0b24188b754826e6a8202efcdf2d877f1b833a10 Mon Sep 17 00:00:00 2001
From: "Taylor R. Campbell" <net/mumble/campbell>
Date: Tue, 24 Feb 2009 23:09:56 +0000
Subject: [PATCH] Re-enable open-coding of FLONUM-ASIN and FLONUM-ACOS on i386,
 and implement FLONUM-EXP using F2XM1 for the fractional part and FSCALE for
 the integral part of the argument.  This about trebles the speed of these
 primitives in compiled code, without giving answers substantially different
 from what the microcode computes.

When open-coding of FLONUM-EXP was disabled in 1992, a comment was
added to the effect that some i387 implementations had bugs in their
F2XM1 instructions, and that F2XM1 has a very limited input range.
The range limit is handled here by applying F2XM1 only to the
fractional part of x log_2 e.  As for the bugs, I imagine that they
have been fixed in more recent hardware, and that few users care about
seventeen-year-old i387 units.  If this is a problem we can always
disable it again.
---
 v7/src/compiler/machines/i386/machin.scm | 14 +++---
 v7/src/compiler/machines/i386/rulflo.scm | 56 +++++++++++++++++-------
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/v7/src/compiler/machines/i386/machin.scm b/v7/src/compiler/machines/i386/machin.scm
index 195cfbfff..5e617bb8d 100644
--- a/v7/src/compiler/machines/i386/machin.scm
+++ b/v7/src/compiler/machines/i386/machin.scm
@@ -1,6 +1,6 @@
 #| -*-Scheme-*-
 
-$Id: machin.scm,v 1.25 2008/06/09 01:39:28 cph Exp $
+$Id: machin.scm,v 1.26 2009/02/24 23:09:56 riastradh Exp $
 
 Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
     1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
@@ -351,13 +351,9 @@ USA.
 (define compiler:primitives-with-no-open-coding
   '(DIVIDE-FIXNUM GCD-FIXNUM &/
 		  ;; Disabled: trig instructions are limited to an
-		  ;; input range of 0 <= X <= pi*2^62.
+		  ;; input range of 0 <= |X| <= pi*2^62, and yield
+		  ;; inaccurate answers for an input range of 0 <= |X|
+		  ;; <= pi/4.  Correct argument reduction requires a
+		  ;; better approximation of pi than the i387 has.
 		  FLONUM-SIN FLONUM-COS FLONUM-TAN
-		  ;; The rewriting rules in rulrew.scm don't work.
-		  ;; Treat as not available.
-		  FLONUM-ASIN FLONUM-ACOS
-		  ;; Disabled for now.  The F2XM1 instruction is
-		  ;; broken on the 387 (or at least some of them), and
-		  ;; in general has a very limited input range.
-		  FLONUM-EXP
 		  VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS))
\ No newline at end of file
diff --git a/v7/src/compiler/machines/i386/rulflo.scm b/v7/src/compiler/machines/i386/rulflo.scm
index fb2791474..46c7e3259 100644
--- a/v7/src/compiler/machines/i386/rulflo.scm
+++ b/v7/src/compiler/machines/i386/rulflo.scm
@@ -1,6 +1,6 @@
 #| -*-Scheme-*-
 
-$Id: rulflo.scm,v 1.31 2008/06/09 01:39:29 cph Exp $
+$Id: rulflo.scm,v 1.32 2009/02/24 23:09:56 riastradh Exp $
 
 Copyright (C) 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
     1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
@@ -321,18 +321,47 @@ USA.
 	  (FXCH (ST 0) (ST 1))
 	  (FYL2X)))))
 
-#|
-;; Disabled: F2XM1 is limited to -1 <= X <= +1.
 (define-arithmetic-method 'FLONUM-EXP flonum-methods/1-arg
   (flonum-unary-operation/stack-top
    (lambda ()
-     (LAP (FLDL2E)
-	  (FMULP (ST 1) (ST 0))
-	  (F2XM1)
-	  (FLD1)
-	  (FADDP (ST 1) (ST 0))))))
-|#
-
+     ;; Hair to avoid arithmetic for non-finite inputs: exp(-inf) = 0,
+     ;; but exp(x) = x for any other non-finite x.  Memory at the free
+     ;; pointer is scratch (1) for picking apart the double to detect
+     ;; such inputs, and (2) to hold I(x lg e), avoiding two stack slots.
+     (let ((temp (temporary-register-reference))
+	   (infinity-or-nan (generate-label 'INFINITY-OR-NAN))
+	   (join (generate-label 'JOIN))
+	   (temp-pointer regnum:free-pointer))
+       (LAP (FST D (@R ,temp-pointer))
+	    (MOV W ,temp (@RO W ,temp-pointer 4))
+	    (AND W ,temp (&U #x7FFFFFFF))
+	    (CMP W ,temp (&U #x7FF00000))
+	    (JAE B (@PCR ,infinity-or-nan))
+	    ;; Compute 2^(x log_2 e) with F2XM1 and FSCALE.
+	    (FLDL2E)			;st0 = lg e, st1 = x
+	    (FMULP (ST 1) (ST 0))	;st0 = x lg e
+	    (FLD (ST 0))		;st0 = x lg e, st1 = x lg e
+	    (FRNDINT)			;st0 = I(x lg e), st1 = x lg e
+	    (FSUB (ST 1) (ST 0))	;st0 = I(x lg e), st1 = F(x lg e)
+	    (FSTP D (@R ,temp-pointer)) ;st0 = F(x lg e), save I(x lg e)
+	    (F2XM1)			;st0 = 2^F(x lg e) - 1
+	    (FLD1)			;st0 = 1, st1 = 2^F(x lg e) - 1
+	    (FADD)			;st0 = 2^F(x lg e)
+	    (FLD D (@R ,temp-pointer))	;st0 = I(x lg e), st1 = 2^F(x lg e)
+	    (FXCH (ST 0) (ST 1))	;st0 = 2^F(x lg e), st1 = I(x lg e)
+	    (FSCALE)			;st0 = 2^F(x lg e) * 2^I(x lg e),
+					;st1 = I(x lg e)
+	    (FSTP (ST 1))		;Drop st1, leaving in st0 the value
+	    (JMP B (@PCR ,join))	; 2^(F(x lg e) + I(x lg e)) = e^x.
+	  (LABEL ,infinity-or-nan)
+	    (CMP W (@RO W ,temp-pointer 4) (&U #xFFF00000))
+	    (JNE B (@PCR ,join))
+	    (CMP W (@RO W ,temp-pointer 0) (& 0))
+	    (JNE B (@PCR ,join))
+	    (FSTP (ST 0))		;Pop argument.
+	    (FLDZ)			;Return zero.
+	  (LABEL ,join))))))
+
 #|
 ;; Disabled: FPTAN limited to pi * 2^62.
 (define-arithmetic-method 'FLONUM-TAN flonum-methods/1-arg
@@ -342,17 +371,13 @@ USA.
 	  (FSTP (ST 0))			; FPOP
 	  ))))
 |#
-
+
 (define-arithmetic-method 'FLONUM-ATAN flonum-methods/1-arg
   (flonum-unary-operation/stack-top
    (lambda ()
      (LAP (FLD1)
 	  (FPATAN)))))
 
-#|
-;; Disabled: these aren't used due to broken rewriting rules.  See
-;; rulrew.scm for details.
-
 ;; For now, these preserve values in memory
 ;; in order to avoid flushing a stack location.
 
@@ -378,7 +403,6 @@ USA.
 	  (FLD D (@R ,regnum:free-pointer))
 	  (FXCH (ST 0) (ST 1))
 	  (FPATAN)))))
-|#
 
 (define-rule statement
   (ASSIGN (REGISTER (? target))
-- 
2.25.1