Open-code flonum-fma (fused multiply-add) on aarch64.
authorTaylor R Campbell <campbell@mumble.net>
Mon, 26 Aug 2019 04:50:32 +0000 (04:50 +0000)
committerTaylor R Campbell <campbell@mumble.net>
Mon, 26 Aug 2019 04:51:58 +0000 (04:51 +0000)
The fused multiply-subtract doesn't kick in right now for reasons I
don't understand in rcompr.scm; maybe someone who understands that
code better can help.

src/compiler/machines/C/machin.scm
src/compiler/machines/aarch64/instrf.scm
src/compiler/machines/aarch64/rulflo.scm
src/compiler/machines/i386/machin.scm
src/compiler/machines/svm/machine.scm
src/compiler/machines/x86-64/machin.scm
src/compiler/rtlbase/rtlcon.scm
src/compiler/rtlbase/rtlexp.scm
src/compiler/rtlbase/rtlty1.scm
src/compiler/rtlgen/opncod.scm
src/sf/gconst.scm

index e5a48b70b22aa59aa71d54ecbd3e84d443079601..e6825fd8fd31760bd65b037e996eedc865de8b68 100644 (file)
@@ -305,6 +305,7 @@ USA.
 (define compiler:primitives-with-no-open-coding
   '(DIVIDE-FIXNUM GCD-FIXNUM &/
     VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS
+    FLONUM-FMA
     FLONUM-IS-LESS? FLONUM-IS-LESS-OR-EQUAL?
     FLONUM-IS-GREATER? FLONUM-IS-GREATER-OR-EQUAL?
     FLONUM-IS-LESS-OR-GREATER? FLONUM-IS-UNORDERED?
index dc15dc093169c860d13df40398d44a9c5aa3fc38..8c8155fdcd62f1d0b25e43187b9bc5affa0c32d5 100644 (file)
@@ -477,6 +477,34 @@ USA.
   (define-fp-binary-instruction FMAXNM #b0110 0 0 0 #b000)
   (define-fp-binary-instruction FMINNM #b0111 0 1 1 #b000))
 
+(let-syntax
+    ((define-fp-ternary-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic o1 o0) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? type fp-scalar-size)
+                (? Rd vregister)
+                (? Rn vregister)
+                (? Rm vregister)
+                (? Ra vregister))
+               (BITS (1 0)
+                     (1 0)
+                     (1 0)
+                     (5 #b11111)
+                     (2 type)
+                     (1 ,o1)
+                     (5 Rm)
+                     (1 ,o0)
+                     (5 Ra)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-fp-ternary-instruction FMADD 0 0)
+  (define-fp-ternary-instruction FMSUB 0 1)
+  (define-fp-ternary-instruction FNMADD 1 0)
+  (define-fp-ternary-instruction FNMSUB 1 1))
+
 (let-syntax
     ((define-fp-compare-instruction
       (sc-macro-transformer
index cc565fbf3b7549a31993e97d2be1bc61a6c35c8e..5698d2af1851457d9a94ab6f05cafcc553395142 100644 (file)
@@ -260,6 +260,83 @@ USA.
             ;; target[i] := source2[i] if signbit[i] else source1[i]
             (BSL B8 ,target ,source2 ,source1))))))
 \f
+;;;; Fused multiply/add/negate
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-3-ARGS FLONUM-FMA
+                         (REGISTER (? factor1))
+                         (REGISTER (? factor2))
+                         (REGISTER (? addend))
+                         (? overflow?)))
+  overflow?                             ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 addend)
+      (LAP (FMADD D ,target ,factor1 ,factor2 ,addend))))
+   target factor1 factor2 addend))
+
+;;; XXX The following rules are busted because RTL instruction folding
+;;; (rcompr.scm) doesn't know how to search through flonum operations
+;;; or something -- and FIND-REFERENCE-INSTRUCTION is too mysterious
+;;; for me to understand at this hour!
+
+;;; XXX What about (fma x y (- 0. z)) or (- 0. (fma x y z))?  Need to
+;;; check sign of zero for results.
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-3-ARGS FLONUM-FMA
+                         (REGISTER (? factor1))
+                         (REGISTER (? factor2))
+                         (FLONUM-1-ARG FLONUM-NEGATE
+                                       (REGISTER (? subtrahend))
+                                       (? overflow0?))
+                         (? overflow1?)))
+  overflow0? overflow1?                 ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 subtrahend)
+      (LAP (FMSUB D ,target ,factor1 ,factor2 ,subtrahend))))
+   target factor1 factor2 subtrahend))
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-1-ARG FLONUM-NEGATE
+                        (FLONUM-3-ARGS FLONUM-FMA
+                                       (REGISTER (? factor1))
+                                       (REGISTER (? factor2))
+                                       (REGISTER (? addend))
+                                       (? overflow0?))
+                        (? overflow1?)))
+  overflow0? overflow1?                 ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 addend)
+      (LAP (FNMADD D ,target ,factor1 ,factor2 ,addend))))
+   target factor1 factor2 addend))
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-1-ARG FLONUM-NEGATE
+                        (FLONUM-3-ARGS FLONUM-FMA
+                                       (REGISTER (? factor1))
+                                       (REGISTER (? factor2))
+                                       (FLONUM-1-ARG FLONUM-NEGATE
+                                                     (REGISTER (? subtrahend))
+                                                     (? overflow0?))
+                                       (? overflow1?))
+                        (? overflow2?)))
+  overflow0? overflow1? overflow2?      ;ignore
+  ((flonum-3-args/standard
+    (lambda (target factor1 factor2 subtrahend)
+      (LAP (FNMSUB D ,target ,factor1 ,factor2 ,subtrahend))))
+   target factor1 factor2 subtrahend))
+
+(define ((flonum-3-args/standard operate) target source1 source2 source3)
+  (let* ((source1 (float-source-fpr! source1))
+         (source2 (float-source-fpr! source2))
+         (source3 (float-source-fpr! source3))
+         (target (float-target-fpr! target)))
+    (operate target source1 source2 source3)))
+\f
 ;;;; Flonum Predicates
 
 (define-rule predicate
index 3c8f2835aeb1b1c31d2af869d2cff22d9137d169..60acc8860748681d34f88b1df7d74e120c27a78b 100644 (file)
@@ -357,6 +357,9 @@ USA.
                  ;; <= pi/4.  Correct argument reduction requires a
                  ;; better approximation of pi than the i387 has.
                  FLONUM-SIN FLONUM-COS FLONUM-TAN
+                 ;; Disabled: need to futz with cpuid flags to
+                 ;; determine whether we can use it.
+                 FLONUM-FMA
                  ;; Disabled: exp is too much trouble to get right in
                  ;; i387; need 64-bit precision.  Let libm do it.
                  FLONUM-EXP
index f4bcb56fa1d2833f7b9d7dd63ade113bc9447381..c7e7710a3844d41cd0550a8d0433c2cdc97f13eb 100644 (file)
@@ -491,6 +491,7 @@ USA.
 (define compiler:primitives-with-no-open-coding
   '(DIVIDE-FIXNUM GCD-FIXNUM &/ FLONUM-EXPM1 FLONUM-LOG1P
                  VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS
+                 FLONUM-FMA
                  FLONUM-IS-LESS? FLONUM-IS-LESS-OR-EQUAL?
                  FLONUM-IS-GREATER? FLONUM-IS-GREATER-OR-EQUAL?
                  FLONUM-IS-LESS-OR-GREATER? FLONUM-IS-UNORDERED?
index a8078859d7b002fd984c629ded6c2c4201a6eb2a..02a891207197bb7d272c887b710e2244ede645fc 100644 (file)
@@ -423,5 +423,5 @@ USA.
     &/
     FLOATING-VECTOR-CONS FLONUM-ACOS FLONUM-ASIN FLONUM-ATAN
     FLONUM-ATAN2 FLONUM-CEILING FLONUM-COS FLONUM-EXP FLONUM-EXPM1
-    FLONUM-FLOOR FLONUM-LOG FLONUM-LOG1P FLONUM-ROUND FLONUM-SIN
+    FLONUM-FLOOR FLONUM-FMA FLONUM-LOG FLONUM-LOG1P FLONUM-ROUND FLONUM-SIN
     FLONUM-TAN FLONUM-TRUNCATE GCD-FIXNUM STRING-ALLOCATE VECTOR-CONS))
\ No newline at end of file
index 61f11c38c2578e25cd80f70518f88df0e13b6a56..0f830f3f84aec858e36b54f5150bbeec8544dab0 100644 (file)
@@ -706,7 +706,7 @@ USA.
                                     operand1
                                     operand2
                                     overflow?))))))))
-
+\f
 (define-expression-method 'FIXNUM-1-ARG
   (lambda (receiver scfg-append! operator operand overflow?)
     (expression-simplify operand scfg-append!
@@ -734,5 +734,20 @@ USA.
                       s-operand2
                       overflow?))))))))
 
+(define-expression-method 'FLONUM-3-ARGS
+  (lambda (receiver scfg-append! operator operand1 operand2 operand3 overflow?)
+    (expression-simplify operand1 scfg-append!
+      (lambda (s-operand1)
+       (expression-simplify operand2 scfg-append!
+         (lambda (s-operand2)
+           (expression-simplify operand3 scfg-append!
+             (lambda (s-operand3)
+               (receiver (rtl:make-flonum-3-args
+                          operator
+                          s-operand1
+                          s-operand2
+                          s-operand3
+                          overflow?))))))))))
+
 ;;; end EXPRESSION-SIMPLIFY package
 )
\ No newline at end of file
index 259aabadc318e480a7afbcdef57330da724ae4f1..63f1f48ad65adaac251d438f9bd860dc441807af 100644 (file)
@@ -75,7 +75,7 @@ USA.
      value-class=fixnum)
     ((OBJECT->TYPE)
      value-class=type)
-    ((OBJECT->FLOAT FLONUM-1-ARG FLONUM-2-ARGS FLOAT-OFFSET)
+    ((OBJECT->FLOAT FLONUM-1-ARG FLONUM-2-ARGS FLONUM-3-ARGS FLOAT-OFFSET)
      value-class=float)
     (else
      (error "unknown RTL expression type" expression))))
@@ -275,6 +275,7 @@ USA.
       FLOAT-OFFSET-ADDRESS
       FLONUM-1-ARG
       FLONUM-2-ARGS
+      FLONUM-3-ARGS
       GENERIC-BINARY
       GENERIC-UNARY
       OBJECT->ADDRESS
index 736aa5726c8e2d6cc4a8e3e0f85b76439f335f08..3ee68bcbda0eec04cbe8a2842a462b149e8573b3 100644 (file)
@@ -116,6 +116,8 @@ USA.
   operator operand overflow?)
 (define-rtl-expression flonum-2-args rtl:
   operator operand-1 operand-2 overflow?)
+(define-rtl-expression flonum-3-args rtl:
+  operator operand-1 operand-2 operand-3 overflow?)
 
 (define-rtl-predicate fixnum-pred-1-arg %
   predicate operand)
index 0de464e9ea47875f74a3f59cb7ce751ba46dbd79..900e635b50a39415b12c400d76c9b7dae21203aa 100644 (file)
@@ -1498,6 +1498,36 @@ USA.
  '(FLONUM-ADD FLONUM-SUBTRACT FLONUM-MULTIPLY FLONUM-DIVIDE FLONUM-ATAN2
    FLONUM-COPYSIGN))
 \f
+(for-each
+ (lambda (flonum-operator)
+   (define-open-coder/value flonum-operator
+     (floating-point-open-coder
+      (lambda (combination expressions finish)
+       (let ((arg1 (car expressions))
+             (arg2 (cadr expressions))
+             (arg3 (caddr expressions)))
+         (open-code:with-checks
+          combination
+          (let ((name flonum-operator)
+                (block (combination/block combination)))
+            (list (open-code:type-check arg1 (ucode-type flonum) name block)
+                  (open-code:type-check arg2 (ucode-type flonum) name block)
+                  (open-code:type-check arg3 (ucode-type flonum) name block)))
+          (finish
+           (rtl:make-float->object
+            (rtl:make-flonum-3-args
+             flonum-operator
+             (rtl:make-object->float arg1)
+             (rtl:make-object->float arg2)
+             (rtl:make-object->float arg3)
+             false)))
+          finish
+          flonum-operator
+          expressions)))
+      '(0 1 2)
+      internal-close-coding-for-type-checks)))
+ '(FLONUM-FMA))
+\f
 (for-each
  (lambda (flonum-pred)
    (define-open-coder/predicate flonum-pred
index 31933924e636ff26d125545e469c1785cf55f17b..01d90f5695b489c7cee3979b00fd48c18f914025 100644 (file)
@@ -112,6 +112,7 @@ USA.
     (fix:zero? zero-fixnum?)
     (fixnum? fixnum?)
     (flo:* flonum-multiply)
+    (flo:*+ flonum-fma 3)
     (flo:+ flonum-add)
     (flo:- flonum-subtract)
     (flo:/ flonum-divide)
@@ -130,10 +131,12 @@ USA.
     (flo:exp flonum-exp)
     (flo:expm1 flonum-expm1)
     (flo:expt flonum-expt)
+    (flo:fast-fma? flonum-fast-fma? 0)
     (flo:finite? flonum-is-finite? 1)
     (flo:flonum? flonum?)
     (flo:floor flonum-floor)
     (flo:floor->exact flonum-floor->exact)
+    (flo:fma flonum-fma 3)
     (flo:infinite? flonum-is-infinite? 1)
     (flo:log flonum-log)
     (flo:log1p flonum-log1p)