From: Taylor R Campbell Date: Mon, 26 Aug 2019 04:50:32 +0000 (+0000) Subject: Open-code flonum-fma (fused multiply-add) on aarch64. X-Git-Tag: mit-scheme-pucked-10.1.20~11^2~55 X-Git-Url: https://birchwood-abbey.net/git?a=commitdiff_plain;h=878e299e821d90179394013eced01a01f9209406;p=mit-scheme.git Open-code flonum-fma (fused multiply-add) on aarch64. The fused multiply-subtract doesn't kick in right now for reasons I don't understand in rcompr.scm; maybe someone who understands that code better can help. --- diff --git a/src/compiler/machines/C/machin.scm b/src/compiler/machines/C/machin.scm index e5a48b70b..e6825fd8f 100644 --- a/src/compiler/machines/C/machin.scm +++ b/src/compiler/machines/C/machin.scm @@ -305,6 +305,7 @@ USA. (define compiler:primitives-with-no-open-coding '(DIVIDE-FIXNUM GCD-FIXNUM &/ VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS + FLONUM-FMA FLONUM-IS-LESS? FLONUM-IS-LESS-OR-EQUAL? FLONUM-IS-GREATER? FLONUM-IS-GREATER-OR-EQUAL? FLONUM-IS-LESS-OR-GREATER? FLONUM-IS-UNORDERED? diff --git a/src/compiler/machines/aarch64/instrf.scm b/src/compiler/machines/aarch64/instrf.scm index dc15dc093..8c8155fdc 100644 --- a/src/compiler/machines/aarch64/instrf.scm +++ b/src/compiler/machines/aarch64/instrf.scm @@ -477,6 +477,34 @@ USA. (define-fp-binary-instruction FMAXNM #b0110 0 0 0 #b000) (define-fp-binary-instruction FMINNM #b0111 0 1 1 #b000)) +(let-syntax + ((define-fp-ternary-instruction + (sc-macro-transformer + (lambda (form environment) + environment + (receive (mnemonic o1 o0) (apply values (cdr form)) + `(define-instruction ,mnemonic + (((? type fp-scalar-size) + (? Rd vregister) + (? Rn vregister) + (? Rm vregister) + (? Ra vregister)) + (BITS (1 0) + (1 0) + (1 0) + (5 #b11111) + (2 type) + (1 ,o1) + (5 Rm) + (1 ,o0) + (5 Ra) + (5 Rn) + (5 Rd))))))))) + (define-fp-ternary-instruction FMADD 0 0) + (define-fp-ternary-instruction FMSUB 0 1) + (define-fp-ternary-instruction FNMADD 1 0) + (define-fp-ternary-instruction FNMSUB 1 1)) + (let-syntax ((define-fp-compare-instruction (sc-macro-transformer diff --git a/src/compiler/machines/aarch64/rulflo.scm b/src/compiler/machines/aarch64/rulflo.scm index cc565fbf3..5698d2af1 100644 --- a/src/compiler/machines/aarch64/rulflo.scm +++ b/src/compiler/machines/aarch64/rulflo.scm @@ -260,6 +260,83 @@ USA. ;; target[i] := source2[i] if signbit[i] else source1[i] (BSL B8 ,target ,source2 ,source1)))))) +;;;; Fused multiply/add/negate + +(define-rule statement + (ASSIGN (REGISTER (? target)) + (FLONUM-3-ARGS FLONUM-FMA + (REGISTER (? factor1)) + (REGISTER (? factor2)) + (REGISTER (? addend)) + (? overflow?))) + overflow? ;ignore + ((flonum-3-args/standard + (lambda (target factor1 factor2 addend) + (LAP (FMADD D ,target ,factor1 ,factor2 ,addend)))) + target factor1 factor2 addend)) + +;;; XXX The following rules are busted because RTL instruction folding +;;; (rcompr.scm) doesn't know how to search through flonum operations +;;; or something -- and FIND-REFERENCE-INSTRUCTION is too mysterious +;;; for me to understand at this hour! + +;;; XXX What about (fma x y (- 0. z)) or (- 0. (fma x y z))? Need to +;;; check sign of zero for results. + +(define-rule statement + (ASSIGN (REGISTER (? target)) + (FLONUM-3-ARGS FLONUM-FMA + (REGISTER (? factor1)) + (REGISTER (? factor2)) + (FLONUM-1-ARG FLONUM-NEGATE + (REGISTER (? subtrahend)) + (? overflow0?)) + (? overflow1?))) + overflow0? overflow1? ;ignore + ((flonum-3-args/standard + (lambda (target factor1 factor2 subtrahend) + (LAP (FMSUB D ,target ,factor1 ,factor2 ,subtrahend)))) + target factor1 factor2 subtrahend)) + +(define-rule statement + (ASSIGN (REGISTER (? target)) + (FLONUM-1-ARG FLONUM-NEGATE + (FLONUM-3-ARGS FLONUM-FMA + (REGISTER (? factor1)) + (REGISTER (? factor2)) + (REGISTER (? addend)) + (? overflow0?)) + (? overflow1?))) + overflow0? overflow1? ;ignore + ((flonum-3-args/standard + (lambda (target factor1 factor2 addend) + (LAP (FNMADD D ,target ,factor1 ,factor2 ,addend)))) + target factor1 factor2 addend)) + +(define-rule statement + (ASSIGN (REGISTER (? target)) + (FLONUM-1-ARG FLONUM-NEGATE + (FLONUM-3-ARGS FLONUM-FMA + (REGISTER (? factor1)) + (REGISTER (? factor2)) + (FLONUM-1-ARG FLONUM-NEGATE + (REGISTER (? subtrahend)) + (? overflow0?)) + (? overflow1?)) + (? overflow2?))) + overflow0? overflow1? overflow2? ;ignore + ((flonum-3-args/standard + (lambda (target factor1 factor2 subtrahend) + (LAP (FNMSUB D ,target ,factor1 ,factor2 ,subtrahend)))) + target factor1 factor2 subtrahend)) + +(define ((flonum-3-args/standard operate) target source1 source2 source3) + (let* ((source1 (float-source-fpr! source1)) + (source2 (float-source-fpr! source2)) + (source3 (float-source-fpr! source3)) + (target (float-target-fpr! target))) + (operate target source1 source2 source3))) + ;;;; Flonum Predicates (define-rule predicate diff --git a/src/compiler/machines/i386/machin.scm b/src/compiler/machines/i386/machin.scm index 3c8f2835a..60acc8860 100644 --- a/src/compiler/machines/i386/machin.scm +++ b/src/compiler/machines/i386/machin.scm @@ -357,6 +357,9 @@ USA. ;; <= pi/4. Correct argument reduction requires a ;; better approximation of pi than the i387 has. FLONUM-SIN FLONUM-COS FLONUM-TAN + ;; Disabled: need to futz with cpuid flags to + ;; determine whether we can use it. + FLONUM-FMA ;; Disabled: exp is too much trouble to get right in ;; i387; need 64-bit precision. Let libm do it. FLONUM-EXP diff --git a/src/compiler/machines/svm/machine.scm b/src/compiler/machines/svm/machine.scm index f4bcb56fa..c7e7710a3 100644 --- a/src/compiler/machines/svm/machine.scm +++ b/src/compiler/machines/svm/machine.scm @@ -491,6 +491,7 @@ USA. (define compiler:primitives-with-no-open-coding '(DIVIDE-FIXNUM GCD-FIXNUM &/ FLONUM-EXPM1 FLONUM-LOG1P VECTOR-CONS STRING-ALLOCATE FLOATING-VECTOR-CONS + FLONUM-FMA FLONUM-IS-LESS? FLONUM-IS-LESS-OR-EQUAL? FLONUM-IS-GREATER? FLONUM-IS-GREATER-OR-EQUAL? FLONUM-IS-LESS-OR-GREATER? FLONUM-IS-UNORDERED? diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm index a8078859d..02a891207 100644 --- a/src/compiler/machines/x86-64/machin.scm +++ b/src/compiler/machines/x86-64/machin.scm @@ -423,5 +423,5 @@ USA. &/ FLOATING-VECTOR-CONS FLONUM-ACOS FLONUM-ASIN FLONUM-ATAN FLONUM-ATAN2 FLONUM-CEILING FLONUM-COS FLONUM-EXP FLONUM-EXPM1 - FLONUM-FLOOR FLONUM-LOG FLONUM-LOG1P FLONUM-ROUND FLONUM-SIN + FLONUM-FLOOR FLONUM-FMA FLONUM-LOG FLONUM-LOG1P FLONUM-ROUND FLONUM-SIN FLONUM-TAN FLONUM-TRUNCATE GCD-FIXNUM STRING-ALLOCATE VECTOR-CONS)) \ No newline at end of file diff --git a/src/compiler/rtlbase/rtlcon.scm b/src/compiler/rtlbase/rtlcon.scm index 61f11c38c..0f830f3f8 100644 --- a/src/compiler/rtlbase/rtlcon.scm +++ b/src/compiler/rtlbase/rtlcon.scm @@ -706,7 +706,7 @@ USA. operand1 operand2 overflow?)))))))) - + (define-expression-method 'FIXNUM-1-ARG (lambda (receiver scfg-append! operator operand overflow?) (expression-simplify operand scfg-append! @@ -734,5 +734,20 @@ USA. s-operand2 overflow?)))))))) +(define-expression-method 'FLONUM-3-ARGS + (lambda (receiver scfg-append! operator operand1 operand2 operand3 overflow?) + (expression-simplify operand1 scfg-append! + (lambda (s-operand1) + (expression-simplify operand2 scfg-append! + (lambda (s-operand2) + (expression-simplify operand3 scfg-append! + (lambda (s-operand3) + (receiver (rtl:make-flonum-3-args + operator + s-operand1 + s-operand2 + s-operand3 + overflow?)))))))))) + ;;; end EXPRESSION-SIMPLIFY package ) \ No newline at end of file diff --git a/src/compiler/rtlbase/rtlexp.scm b/src/compiler/rtlbase/rtlexp.scm index 259aabadc..63f1f48ad 100644 --- a/src/compiler/rtlbase/rtlexp.scm +++ b/src/compiler/rtlbase/rtlexp.scm @@ -75,7 +75,7 @@ USA. value-class=fixnum) ((OBJECT->TYPE) value-class=type) - ((OBJECT->FLOAT FLONUM-1-ARG FLONUM-2-ARGS FLOAT-OFFSET) + ((OBJECT->FLOAT FLONUM-1-ARG FLONUM-2-ARGS FLONUM-3-ARGS FLOAT-OFFSET) value-class=float) (else (error "unknown RTL expression type" expression)))) @@ -275,6 +275,7 @@ USA. FLOAT-OFFSET-ADDRESS FLONUM-1-ARG FLONUM-2-ARGS + FLONUM-3-ARGS GENERIC-BINARY GENERIC-UNARY OBJECT->ADDRESS diff --git a/src/compiler/rtlbase/rtlty1.scm b/src/compiler/rtlbase/rtlty1.scm index 736aa5726..3ee68bcbd 100644 --- a/src/compiler/rtlbase/rtlty1.scm +++ b/src/compiler/rtlbase/rtlty1.scm @@ -116,6 +116,8 @@ USA. operator operand overflow?) (define-rtl-expression flonum-2-args rtl: operator operand-1 operand-2 overflow?) +(define-rtl-expression flonum-3-args rtl: + operator operand-1 operand-2 operand-3 overflow?) (define-rtl-predicate fixnum-pred-1-arg % predicate operand) diff --git a/src/compiler/rtlgen/opncod.scm b/src/compiler/rtlgen/opncod.scm index 0de464e9e..900e635b5 100644 --- a/src/compiler/rtlgen/opncod.scm +++ b/src/compiler/rtlgen/opncod.scm @@ -1498,6 +1498,36 @@ USA. '(FLONUM-ADD FLONUM-SUBTRACT FLONUM-MULTIPLY FLONUM-DIVIDE FLONUM-ATAN2 FLONUM-COPYSIGN)) +(for-each + (lambda (flonum-operator) + (define-open-coder/value flonum-operator + (floating-point-open-coder + (lambda (combination expressions finish) + (let ((arg1 (car expressions)) + (arg2 (cadr expressions)) + (arg3 (caddr expressions))) + (open-code:with-checks + combination + (let ((name flonum-operator) + (block (combination/block combination))) + (list (open-code:type-check arg1 (ucode-type flonum) name block) + (open-code:type-check arg2 (ucode-type flonum) name block) + (open-code:type-check arg3 (ucode-type flonum) name block))) + (finish + (rtl:make-float->object + (rtl:make-flonum-3-args + flonum-operator + (rtl:make-object->float arg1) + (rtl:make-object->float arg2) + (rtl:make-object->float arg3) + false))) + finish + flonum-operator + expressions))) + '(0 1 2) + internal-close-coding-for-type-checks))) + '(FLONUM-FMA)) + (for-each (lambda (flonum-pred) (define-open-coder/predicate flonum-pred diff --git a/src/sf/gconst.scm b/src/sf/gconst.scm index 31933924e..01d90f569 100644 --- a/src/sf/gconst.scm +++ b/src/sf/gconst.scm @@ -112,6 +112,7 @@ USA. (fix:zero? zero-fixnum?) (fixnum? fixnum?) (flo:* flonum-multiply) + (flo:*+ flonum-fma 3) (flo:+ flonum-add) (flo:- flonum-subtract) (flo:/ flonum-divide) @@ -130,10 +131,12 @@ USA. (flo:exp flonum-exp) (flo:expm1 flonum-expm1) (flo:expt flonum-expt) + (flo:fast-fma? flonum-fast-fma? 0) (flo:finite? flonum-is-finite? 1) (flo:flonum? flonum?) (flo:floor flonum-floor) (flo:floor->exact flonum-floor->exact) + (flo:fma flonum-fma 3) (flo:infinite? flonum-is-infinite? 1) (flo:log flonum-log) (flo:log1p flonum-log1p)