From 878e299e821d90179394013eced01a01f9209406 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Mon, 26 Aug 2019 04:50:32 +0000
Subject: [PATCH] Open-code flonum-fma (fused multiply-add) on aarch64.

The fused multiply-subtract doesn't kick in right now for reasons I
don't understand in rcompr.scm; maybe someone who understands that
code better can help.
---
 src/compiler/machines/C/machin.scm       |  1 +
 src/compiler/machines/aarch64/instrf.scm | 28 +++++++++
 src/compiler/machines/aarch64/rulflo.scm | 77 ++++++++++++++++++++++++
 src/compiler/machines/i386/machin.scm    |  3 +
 src/compiler/machines/svm/machine.scm    |  1 +
 src/compiler/machines/x86-64/machin.scm  |  2 +-
 src/compiler/rtlbase/rtlcon.scm          | 17 +++++-
 src/compiler/rtlbase/rtlexp.scm          |  3 +-
 src/compiler/rtlbase/rtlty1.scm          |  2 +
 src/compiler/rtlgen/opncod.scm           | 30 +++++++++
 src/sf/gconst.scm                        |  3 +
 11 files changed, 164 insertions(+), 3 deletions(-)

diff --git a/src/compiler/machines/C/machin.scm b/src/compiler/machines/C/machin.scm
index e5a48b70b..e6825fd8f 100644
--- a/src/compiler/machines/C/machin.scm
+++ b/src/compiler/machines/C/machin.scm
@@ -305,6 +305,7 @@ USA.
 (define compiler:primitives-with-no-open-coding
   '(DIVIDE-FIXNUM GCD-FIXNUM &/
     VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS
+    FLONUM-FMA
     FLONUM-IS-LESS? FLONUM-IS-LESS-OR-EQUAL?
     FLONUM-IS-GREATER? FLONUM-IS-GREATER-OR-EQUAL?
     FLONUM-IS-LESS-OR-GREATER? FLONUM-IS-UNORDERED?
diff --git a/src/compiler/machines/aarch64/instrf.scm b/src/compiler/machines/aarch64/instrf.scm
index dc15dc093..8c8155fdc 100644
--- a/src/compiler/machines/aarch64/instrf.scm
+++ b/src/compiler/machines/aarch64/instrf.scm
@@ -477,6 +477,34 @@ USA.
   (define-fp-binary-instruction FMAXNM #b0110 0 0 0 #b000)
   (define-fp-binary-instruction FMINNM #b0111 0 1 1 #b000))
 
+(let-syntax
+    ((define-fp-ternary-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic o1 o0) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? type fp-scalar-size)
+                (? Rd vregister)
+                (? Rn vregister)
+                (? Rm vregister)
+                (? Ra vregister))
+               (BITS (1 0)
+                     (1 0)
+                     (1 0)
+                     (5 #b11111)
+                     (2 type)
+                     (1 ,o1)
+                     (5 Rm)
+                     (1 ,o0)
+                     (5 Ra)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-fp-ternary-instruction FMADD 0 0)
+  (define-fp-ternary-instruction FMSUB 0 1)
+  (define-fp-ternary-instruction FNMADD 1 0)
+  (define-fp-ternary-instruction FNMSUB 1 1))
+
 (let-syntax
     ((define-fp-compare-instruction
       (sc-macro-transformer
diff --git a/src/compiler/machines/aarch64/rulflo.scm b/src/compiler/machines/aarch64/rulflo.scm
index cc565fbf3..5698d2af1 100644
--- a/src/compiler/machines/aarch64/rulflo.scm
+++ b/src/compiler/machines/aarch64/rulflo.scm
@@ -260,6 +260,83 @@ USA.
             ;; target[i] := source2[i] if signbit[i] else source1[i]
             (BSL B8 ,target ,source2 ,source1))))))
 
+;;;; Fused multiply/add/negate
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-3-ARGS FLONUM-FMA
+                         (REGISTER (? factor1))
+                         (REGISTER (? factor2))
+                         (REGISTER (? addend))
+                         (? overflow?)))
+  overflow?                             ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 addend)
+      (LAP (FMADD D ,target ,factor1 ,factor2 ,addend))))
+   target factor1 factor2 addend))
+
+;;; XXX The following rules are busted because RTL instruction folding
+;;; (rcompr.scm) doesn't know how to search through flonum operations
+;;; or something -- and FIND-REFERENCE-INSTRUCTION is too mysterious
+;;; for me to understand at this hour!
+
+;;; XXX What about (fma x y (- 0. z)) or (- 0. (fma x y z))?  Need to
+;;; check sign of zero for results.
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-3-ARGS FLONUM-FMA
+                         (REGISTER (? factor1))
+                         (REGISTER (? factor2))
+                         (FLONUM-1-ARG FLONUM-NEGATE
+                                       (REGISTER (? subtrahend))
+                                       (? overflow0?))
+                         (? overflow1?)))
+  overflow0? overflow1?                 ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 subtrahend)
+      (LAP (FMSUB D ,target ,factor1 ,factor2 ,subtrahend))))
+   target factor1 factor2 subtrahend))
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-1-ARG FLONUM-NEGATE
+                        (FLONUM-3-ARGS FLONUM-FMA
+                                       (REGISTER (? factor1))
+                                       (REGISTER (? factor2))
+                                       (REGISTER (? addend))
+                                       (? overflow0?))
+                        (? overflow1?)))
+  overflow0? overflow1?                 ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 addend)
+      (LAP (FNMADD D ,target ,factor1 ,factor2 ,addend))))
+   target factor1 factor2 addend))
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-1-ARG FLONUM-NEGATE
+                        (FLONUM-3-ARGS FLONUM-FMA
+                                       (REGISTER (? factor1))
+                                       (REGISTER (? factor2))
+                                       (FLONUM-1-ARG FLONUM-NEGATE
+                                                     (REGISTER (? subtrahend))
+                                                     (? overflow0?))
+                                       (? overflow1?))
+                        (? overflow2?)))
+  overflow0? overflow1? overflow2?      ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 subtrahend)
+      (LAP (FNMSUB D ,target ,factor1 ,factor2 ,subtrahend))))
+   target factor1 factor2 subtrahend))
+
+(define ((flonum-3-args/standard operate) target source1 source2 source3)
+  (let* ((source1 (float-source-fpr! source1))
+         (source2 (float-source-fpr! source2))
+         (source3 (float-source-fpr! source3))
+         (target (float-target-fpr! target)))
+    (operate target source1 source2 source3)))
+
 ;;;; Flonum Predicates
 
 (define-rule predicate
diff --git a/src/compiler/machines/i386/machin.scm b/src/compiler/machines/i386/machin.scm
index 3c8f2835a..60acc8860 100644
--- a/src/compiler/machines/i386/machin.scm
+++ b/src/compiler/machines/i386/machin.scm
@@ -357,6 +357,9 @@ USA.
 		  ;; <= pi/4.  Correct argument reduction requires a
 		  ;; better approximation of pi than the i387 has.
 		  FLONUM-SIN FLONUM-COS FLONUM-TAN
+		  ;; Disabled: need to futz with cpuid flags to
+		  ;; determine whether we can use it.
+		  FLONUM-FMA
 		  ;; Disabled: exp is too much trouble to get right in
 		  ;; i387; need 64-bit precision.  Let libm do it.
 		  FLONUM-EXP
diff --git a/src/compiler/machines/svm/machine.scm b/src/compiler/machines/svm/machine.scm
index f4bcb56fa..c7e7710a3 100644
--- a/src/compiler/machines/svm/machine.scm
+++ b/src/compiler/machines/svm/machine.scm
@@ -491,6 +491,7 @@ USA.
 (define compiler:primitives-with-no-open-coding
   '(DIVIDE-FIXNUM GCD-FIXNUM &/ FLONUM-EXPM1 FLONUM-LOG1P
 		  VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS
+		  FLONUM-FMA
 		  FLONUM-IS-LESS? FLONUM-IS-LESS-OR-EQUAL?
 		  FLONUM-IS-GREATER? FLONUM-IS-GREATER-OR-EQUAL?
 		  FLONUM-IS-LESS-OR-GREATER? FLONUM-IS-UNORDERED?
diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm
index a8078859d..02a891207 100644
--- a/src/compiler/machines/x86-64/machin.scm
+++ b/src/compiler/machines/x86-64/machin.scm
@@ -423,5 +423,5 @@ USA.
     &/
     FLOATING-VECTOR-CONS FLONUM-ACOS FLONUM-ASIN FLONUM-ATAN
     FLONUM-ATAN2 FLONUM-CEILING FLONUM-COS FLONUM-EXP FLONUM-EXPM1
-    FLONUM-FLOOR FLONUM-LOG FLONUM-LOG1P FLONUM-ROUND FLONUM-SIN
+    FLONUM-FLOOR FLONUM-FMA FLONUM-LOG FLONUM-LOG1P FLONUM-ROUND FLONUM-SIN
     FLONUM-TAN FLONUM-TRUNCATE GCD-FIXNUM STRING-ALLOCATE VECTOR-CONS))
\ No newline at end of file
diff --git a/src/compiler/rtlbase/rtlcon.scm b/src/compiler/rtlbase/rtlcon.scm
index 61f11c38c..0f830f3f8 100644
--- a/src/compiler/rtlbase/rtlcon.scm
+++ b/src/compiler/rtlbase/rtlcon.scm
@@ -706,7 +706,7 @@ USA.
 				     operand1
 				     operand2
 				     overflow?))))))))
-
+
 (define-expression-method 'FIXNUM-1-ARG
   (lambda (receiver scfg-append! operator operand overflow?)
     (expression-simplify operand scfg-append!
@@ -734,5 +734,20 @@ USA.
 		       s-operand2
 		       overflow?))))))))
 
+(define-expression-method 'FLONUM-3-ARGS
+  (lambda (receiver scfg-append! operator operand1 operand2 operand3 overflow?)
+    (expression-simplify operand1 scfg-append!
+      (lambda (s-operand1)
+	(expression-simplify operand2 scfg-append!
+	  (lambda (s-operand2)
+	    (expression-simplify operand3 scfg-append!
+	      (lambda (s-operand3)
+		(receiver (rtl:make-flonum-3-args
+			   operator
+			   s-operand1
+			   s-operand2
+			   s-operand3
+			   overflow?))))))))))
+
 ;;; end EXPRESSION-SIMPLIFY package
 )
\ No newline at end of file
diff --git a/src/compiler/rtlbase/rtlexp.scm b/src/compiler/rtlbase/rtlexp.scm
index 259aabadc..63f1f48ad 100644
--- a/src/compiler/rtlbase/rtlexp.scm
+++ b/src/compiler/rtlbase/rtlexp.scm
@@ -75,7 +75,7 @@ USA.
      value-class=fixnum)
     ((OBJECT->TYPE)
      value-class=type)
-    ((OBJECT->FLOAT FLONUM-1-ARG FLONUM-2-ARGS FLOAT-OFFSET)
+    ((OBJECT->FLOAT FLONUM-1-ARG FLONUM-2-ARGS FLONUM-3-ARGS FLOAT-OFFSET)
      value-class=float)
     (else
      (error "unknown RTL expression type" expression))))
@@ -275,6 +275,7 @@ USA.
       FLOAT-OFFSET-ADDRESS
       FLONUM-1-ARG
       FLONUM-2-ARGS
+      FLONUM-3-ARGS
       GENERIC-BINARY
       GENERIC-UNARY
       OBJECT->ADDRESS
diff --git a/src/compiler/rtlbase/rtlty1.scm b/src/compiler/rtlbase/rtlty1.scm
index 736aa5726..3ee68bcbd 100644
--- a/src/compiler/rtlbase/rtlty1.scm
+++ b/src/compiler/rtlbase/rtlty1.scm
@@ -116,6 +116,8 @@ USA.
   operator operand overflow?)
 (define-rtl-expression flonum-2-args rtl:
   operator operand-1 operand-2 overflow?)
+(define-rtl-expression flonum-3-args rtl:
+  operator operand-1 operand-2 operand-3 overflow?)
 
 (define-rtl-predicate fixnum-pred-1-arg %
   predicate operand)
diff --git a/src/compiler/rtlgen/opncod.scm b/src/compiler/rtlgen/opncod.scm
index 0de464e9e..900e635b5 100644
--- a/src/compiler/rtlgen/opncod.scm
+++ b/src/compiler/rtlgen/opncod.scm
@@ -1498,6 +1498,36 @@ USA.
  '(FLONUM-ADD FLONUM-SUBTRACT FLONUM-MULTIPLY FLONUM-DIVIDE FLONUM-ATAN2
    FLONUM-COPYSIGN))
 
+(for-each
+ (lambda (flonum-operator)
+   (define-open-coder/value flonum-operator
+     (floating-point-open-coder
+      (lambda (combination expressions finish)
+	(let ((arg1 (car expressions))
+	      (arg2 (cadr expressions))
+	      (arg3 (caddr expressions)))
+	  (open-code:with-checks
+	   combination
+	   (let ((name flonum-operator)
+		 (block (combination/block combination)))
+	     (list (open-code:type-check arg1 (ucode-type flonum) name block)
+		   (open-code:type-check arg2 (ucode-type flonum) name block)
+		   (open-code:type-check arg3 (ucode-type flonum) name block)))
+	   (finish
+	    (rtl:make-float->object
+	     (rtl:make-flonum-3-args
+	      flonum-operator
+	      (rtl:make-object->float arg1)
+	      (rtl:make-object->float arg2)
+	      (rtl:make-object->float arg3)
+	      false)))
+	   finish
+	   flonum-operator
+	   expressions)))
+      '(0 1 2)
+      internal-close-coding-for-type-checks)))
+ '(FLONUM-FMA))
+
 (for-each
  (lambda (flonum-pred)
    (define-open-coder/predicate flonum-pred
diff --git a/src/sf/gconst.scm b/src/sf/gconst.scm
index 31933924e..01d90f569 100644
--- a/src/sf/gconst.scm
+++ b/src/sf/gconst.scm
@@ -112,6 +112,7 @@ USA.
     (fix:zero? zero-fixnum?)
     (fixnum? fixnum?)
     (flo:* flonum-multiply)
+    (flo:*+ flonum-fma 3)
     (flo:+ flonum-add)
     (flo:- flonum-subtract)
     (flo:/ flonum-divide)
@@ -130,10 +131,12 @@ USA.
     (flo:exp flonum-exp)
     (flo:expm1 flonum-expm1)
     (flo:expt flonum-expt)
+    (flo:fast-fma? flonum-fast-fma? 0)
     (flo:finite? flonum-is-finite? 1)
     (flo:flonum? flonum?)
     (flo:floor flonum-floor)
     (flo:floor->exact flonum-floor->exact)
+    (flo:fma flonum-fma 3)
     (flo:infinite? flonum-is-infinite? 1)
     (flo:log flonum-log)
     (flo:log1p flonum-log1p)
-- 
2.25.1