From 8659d5cb62a614993c064ddf11ba95b138dd0103 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Sun, 25 Aug 2019 19:48:23 +0000
Subject: [PATCH] Open-code floating-point arithmetic on aarch64.

Disabled by default for now due to limited testing.
---
 src/compiler/machines/aarch64/instr1.scm  | 275 ----------
 src/compiler/machines/aarch64/instrf.scm  | 636 +++++++++++++++++++++-
 src/compiler/machines/aarch64/insutl.scm  | 117 ++++
 src/compiler/machines/aarch64/lapgen.scm  |  45 +-
 src/compiler/machines/aarch64/machine.scm |  15 +-
 src/compiler/machines/aarch64/rules2.scm  |  14 +
 src/compiler/machines/aarch64/rulflo.scm  | 377 ++++++++++++-
 7 files changed, 1183 insertions(+), 296 deletions(-)

diff --git a/src/compiler/machines/aarch64/instr1.scm b/src/compiler/machines/aarch64/instr1.scm
index 791c65c79..5a07dec7f 100644
--- a/src/compiler/machines/aarch64/instr1.scm
+++ b/src/compiler/machines/aarch64/instr1.scm
@@ -839,281 +839,6 @@ USA.
   (define-load/store-pair-instruction LDP 1)
   (define-load/store-pair-instruction STP 0))
 
-;;; C3.2.9 Load/Store scalar SIMD and floating-point
-
-(let-syntax
-    ((define-simd/fp-load/store-instruction
-      (sc-macro-transformer
-       (lambda (form environment)
-         environment
-         (receive (mnemonic load/store . extra) (apply values (cdr form))
-           `(define-instruction ,mnemonic
-              ;; LDR immediate, SIMD&FP, pre/post-index with signed
-              ;; byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP, pre/post-index with signed
-              ;; byte offset (C7.2.315)
-              (((? sz load/store-simd/fp-size)
-                (? Vt vregister)
-                ((? pre/post load/store-pre/post-index)
-                 (? Rn register-31=sp)
-                 (& (? offset signed-9))))
-               (BITS (2 sz LOAD/STORE-SIMD/FP-SIZE)
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 sz LOAD/STORE-SIMD/FP-OPCHI)
-                     (1 ,load/store)    ;opc[0]
-                     (1 0)
-                     (9 offset SIGNED)
-                     (2 pre/post)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR immediate, SIMD&FP, zero offset (C7.2.176)
-              ;; STR immediate, SIMD&FP, zero offset (C7.2.315)
-              (((? sz load/store-simd/fp-size)
-                (? Vt vregister)
-                (? Rn register-31=sp))
-               (BITS (2 sz LOAD/STORE-SIMD/FP-SIZE)
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 sz LOAD/STORE-SIMD/FP-OPCHI)
-                     (1 ,load/store)    ;opc[0]
-                     (12 0)             ;offset=0
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR immediate, SIMD&FP (B), unsigned byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP (B), unsigned byte offset (C7.2.315)
-              ((B (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     (&U (? offset unsigned-12))))
-               (BITS (2 #b00)           ;size=B, 8-bit
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (12 offset)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR immediate, SIMD&FP (B), unsigned byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP (B), unsigned byte offset (C7.2.315)
-              ;; [same as above]
-              ((B (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     (&U (* 1 (? offset unsigned-12)))))
-               (BITS (2 #b00)           ;size=B, 8-bit
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (12 offset)
-                     (5 Rn)
-                     (5 Vt)))
-
-              ;; LDR immediate, SIMD&FP (H), unsigned 2-byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP (H), unsigned 2-byte offset (C7.2.315)
-              ((H (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     (&U (* 2 (? offset unsigned-12)))))
-               (BITS (2 #b01)           ;size=H, 16-bit
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (12 offset)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR immediate, SIMD&FP (S), unsigned 4-byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP (S), unsigned 4-byte offset (C7.2.315)
-              ((S (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     (&U (* 4 (? offset unsigned-12)))))
-               (BITS (2 #b10)           ;size=S, 32-bit
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (12 offset)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR immediate, SIMD&FP (D), unsigned 8-byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP (D), unsigned 8-byte offset (C7.2.315)
-              ((D (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     (&U (* 8 (? offset unsigned-12)))))
-               (BITS (2 #b11)           ;size=D, 64-bit
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (12 offset)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR immediate, SIMD&FP (Q), unsigned 16-byte offset (C7.2.176)
-              ;; STR immediate, SIMD&FP (Q), unsigned 16-byte offset (C7.2.315)
-              ((Q (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     (&U (* 16 (? offset unsigned-12)))))
-               (BITS (2 #b00)           ;`size'
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b01)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (12 offset)
-                     (5 Rn)
-                     (5 Vt)))
-
-              ;; LDR register, SIMD&FP, no extend (C7.2.178)
-              ;; STR register, SIMD&FP, no extend (C7.3.316)
-              (((? sz load/store-simd/fp-size)
-                (? Vt vregister)
-                (+ (? Rn register-31=sp)
-                   (? Rm register-31=z)))
-               (BITS (2 sz LOAD/STORE-SIMD/FP-SIZE)
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 sz LOAD/STORE-SIMD/FP-OPCHI) ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (1 1)
-                     (5 Rm)
-                     (3 #b011)          ;option=LSL
-                     (1 0)              ;shift=0
-                     (2 #b10)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR register, SIMD&FP (B), (C7.2.178)
-              ;; STR register, SIMD&FP (B), (C7.2.316)
-              ((B (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     ((? option load/store-extend-type)
-                      (? Rm register-31=z)
-                      (? S load/store8-extend-amount))))
-               (BITS (2 #b00)           ;size=B
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (1 1)
-                     (5 Rm)
-                     (3 option)
-                     (1 S)
-                     (2 #b10)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR register, SIMD&FP (H), (C7.2.178)
-              ;; STR register, SIMD&FP (H), (C7.2.316)
-              ((H (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     ((? option load/store-extend-type)
-                      (? Rm register-31=z)
-                      (? S load/store16-extend-amount))))
-               (BITS (2 #b01)           ;size=H
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (1 1)
-                     (5 Rm)
-                     (3 option)
-                     (1 S)
-                     (2 #b10)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR register, SIMD&FP (S), (C7.2.178)
-              ;; STR register, SIMD&FP (S), (C7.2.316)
-              ((S (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     ((? option load/store-extend-type)
-                      (? Rm register-31=z)
-                      (? S load/store32-extend-amount))))
-               (BITS (2 #b10)           ;size=H
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (1 1)
-                     (5 Rm)
-                     (3 option)
-                     (1 S)
-                     (2 #b10)
-                     (5 Rn)
-                     (5 Vt)))
-
-              ;; LDR register, SIMD&FP (D), (C7.2.178)
-              ;; STR register, SIMD&FP (D), (C7.2.316)
-              ((D (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     ((? option load/store-extend-type)
-                      (? Rm register-31=z)
-                      (? S load/store64-extend-amount))))
-               (BITS (2 #b11)           ;size=D
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 0)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (1 1)
-                     (5 Rm)
-                     (3 option)
-                     (1 S)
-                     (2 #b10)
-                     (5 Rn)
-                     (5 Vt)))
-              ;; LDR register, SIMD&FP (Q), (C7.2.178)
-              ;; STR register, SIMD&FP (Q), (C7.2.316)
-              ((Q (? Vt vregister)
-                  (+ (? Rn register-31=sp)
-                     ((? option load/store-extend-type)
-                      (? Rm register-31=z)
-                      (? S load/store128-extend-amount))))
-               (BITS (2 #b00)           ;size=Q
-                     (3 #b111)
-                     (1 1)              ;SIMD/FP
-                     (2 #b00)
-                     (1 1)              ;opc[1]
-                     (1 ,load/store)    ;opc[0]
-                     (1 1)
-                     (5 Rm)
-                     (3 option)
-                     (1 S)
-                     (2 #b10)
-                     (5 Rn)
-                     (5 Vt)))
-              ,@extra))))))
-  ;; The ARM assembler has `LDRB w13,...' for byte-sized load into
-  ;; general register 13, and `LDR b13,...' for byte-sized load into
-  ;; vector register 13.  We use a separate mnemonic for general
-  ;; registers and vector registers.
-  (define-simd/fp-load/store-instruction STR.V 0)
-  (define-simd/fp-load/store-instruction LDR.V 1
-    ;; LDR PC-relative literal, SIMD&FP (C7.2.177)
-    (((? opc ldr-literal-simd/fp-size)
-      (? Vt vregister)
-      (@PCO (? offset signed-19)))
-     (BITS (2 opc)
-           (3 #b011)
-           (1 1)                        ;SIMD/FP
-           (2 #b00)
-           (19 offset SIGNED)
-           (5 Vt)))
-    (((? size) (? Vt) (@PCR (? label) (? temp register<31)))
-     (VARIABLE-WIDTH offset `(/ (- ,label *PC*) 4)
-       ((#x-40000 #x3ffff)
-        (MACRO 32 (LDR.V ,size ,Vt (@PCO (* 4 ,offset)))))
-       ((#x-100000000 #xffffffff)
-        (MACRO 64 (ADRP-ADD X ,temp (@PCO ,(* 4 offset)))) ;(*)
-        (MACRO 32 (LDR.V X ,Vt ,temp)))))))
-
 ;; Load register signed (i.e., sign-extended).  This does not detect
 ;; the nonsensical (LDRS W W ...) operand combination -- it will
 ;; assemble into a possibly different instruction.
diff --git a/src/compiler/machines/aarch64/instrf.scm b/src/compiler/machines/aarch64/instrf.scm
index b00bbdd82..dc15dc093 100644
--- a/src/compiler/machines/aarch64/instrf.scm
+++ b/src/compiler/machines/aarch64/instrf.scm
@@ -29,6 +29,638 @@ USA.
 
 (declare (usual-integrations))
 
-;;; XXX not yet
+;;; C3.2.9 Load/Store scalar SIMD and floating-point
 
-#f
+(let-syntax
+    ((define-simd/fp-load/store-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic load/store . extra) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              ;; LDR immediate, SIMD&FP, pre/post-index with signed
+              ;; byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP, pre/post-index with signed
+              ;; byte offset (C7.2.315)
+              (((? sz:opchi load/store-simd/fp-size)
+                (? Vt vregister)
+                ((? pre/post load/store-pre/post-index)
+                 (? Rn register-31=sp)
+                 (& (? offset signed-9))))
+               (BITS (2 (fix:lsh sz:opchi -1))
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 (fix:and 1 sz:opchi)) ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 0)
+                     (9 offset SIGNED)
+                     (2 pre/post)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR immediate, SIMD&FP, zero offset (C7.2.176)
+              ;; STR immediate, SIMD&FP, zero offset (C7.2.315)
+              (((? sz:opchi load/store-simd/fp-size)
+                (? Vt vregister)
+                (? Rn register-31=sp))
+               (BITS (2 (fix:lsh sz:opchi -1))
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 (fix:and 1 sz:opchi)) ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 0)             ;offset=0
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR immediate, SIMD&FP (B), unsigned byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP (B), unsigned byte offset (C7.2.315)
+              ((B (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     (&U (? offset unsigned-12))))
+               (BITS (2 #b00)           ;size=B, 8-bit
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 offset)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR immediate, SIMD&FP (B), unsigned byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP (B), unsigned byte offset (C7.2.315)
+              ;; [same as above]
+              ((B (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     (&U (* 1 (? offset unsigned-12)))))
+               (BITS (2 #b00)           ;size=B, 8-bit
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 offset)
+                     (5 Rn)
+                     (5 Vt)))
+
+              ;; LDR immediate, SIMD&FP (H), unsigned 2-byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP (H), unsigned 2-byte offset (C7.2.315)
+              ((H (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     (&U (* 2 (? offset unsigned-12)))))
+               (BITS (2 #b01)           ;size=H, 16-bit
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 offset)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR immediate, SIMD&FP (S), unsigned 4-byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP (S), unsigned 4-byte offset (C7.2.315)
+              ((S (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     (&U (* 4 (? offset unsigned-12)))))
+               (BITS (2 #b10)           ;size=S, 32-bit
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 offset)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR immediate, SIMD&FP (D), unsigned 8-byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP (D), unsigned 8-byte offset (C7.2.315)
+              ((D (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     (&U (* 8 (? offset unsigned-12)))))
+               (BITS (2 #b11)           ;size=D, 64-bit
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 offset)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR immediate, SIMD&FP (Q), unsigned 16-byte offset (C7.2.176)
+              ;; STR immediate, SIMD&FP (Q), unsigned 16-byte offset (C7.2.315)
+              ((Q (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     (&U (* 16 (? offset unsigned-12)))))
+               (BITS (2 #b00)           ;`size'
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b01)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (12 offset)
+                     (5 Rn)
+                     (5 Vt)))
+
+              ;; LDR register, SIMD&FP, no extend (C7.2.178)
+              ;; STR register, SIMD&FP, no extend (C7.3.316)
+              (((? sz:opchi load/store-simd/fp-size)
+                (? Vt vregister)
+                (+ (? Rn register-31=sp)
+                   (? Rm register-31=z)))
+               (BITS (2 (fix:lsh sz:opchi -1))
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 (fix:and 1 sz:opchi)) ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 1)
+                     (5 Rm)
+                     (3 #b011)          ;option=LSL
+                     (1 0)              ;shift=0
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR register, SIMD&FP (B), (C7.2.178)
+              ;; STR register, SIMD&FP (B), (C7.2.316)
+              ((B (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     ((? option load/store-extend-type)
+                      (? Rm register-31=z)
+                      (? S load/store8-extend-amount))))
+               (BITS (2 #b00)           ;size=B
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 1)
+                     (5 Rm)
+                     (3 option)
+                     (1 S)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR register, SIMD&FP (H), (C7.2.178)
+              ;; STR register, SIMD&FP (H), (C7.2.316)
+              ((H (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     ((? option load/store-extend-type)
+                      (? Rm register-31=z)
+                      (? S load/store16-extend-amount))))
+               (BITS (2 #b01)           ;size=H
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 1)
+                     (5 Rm)
+                     (3 option)
+                     (1 S)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR register, SIMD&FP (S), (C7.2.178)
+              ;; STR register, SIMD&FP (S), (C7.2.316)
+              ((S (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     ((? option load/store-extend-type)
+                      (? Rm register-31=z)
+                      (? S load/store32-extend-amount))))
+               (BITS (2 #b10)           ;size=H
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 1)
+                     (5 Rm)
+                     (3 option)
+                     (1 S)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Vt)))
+
+              ;; LDR register, SIMD&FP (D), (C7.2.178)
+              ;; STR register, SIMD&FP (D), (C7.2.316)
+              ((D (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     ((? option load/store-extend-type)
+                      (? Rm register-31=z)
+                      (? S load/store64-extend-amount))))
+               (BITS (2 #b11)           ;size=D
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 0)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 1)
+                     (5 Rm)
+                     (3 option)
+                     (1 S)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Vt)))
+              ;; LDR register, SIMD&FP (Q), (C7.2.178)
+              ;; STR register, SIMD&FP (Q), (C7.2.316)
+              ((Q (? Vt vregister)
+                  (+ (? Rn register-31=sp)
+                     ((? option load/store-extend-type)
+                      (? Rm register-31=z)
+                      (? S load/store128-extend-amount))))
+               (BITS (2 #b00)           ;size=Q
+                     (3 #b111)
+                     (1 1)              ;SIMD/FP
+                     (2 #b00)
+                     (1 1)              ;opc[1]
+                     (1 ,load/store)    ;opc[0]
+                     (1 1)
+                     (5 Rm)
+                     (3 option)
+                     (1 S)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Vt)))
+              ,@extra))))))
+  ;; The ARM assembler has `LDRB w13,...' for byte-sized load into
+  ;; general register 13, and `LDR b13,...' for byte-sized load into
+  ;; vector register 13.  We use a separate mnemonic for general
+  ;; registers and vector registers.
+  (define-simd/fp-load/store-instruction STR.V 0)
+  (define-simd/fp-load/store-instruction LDR.V 1
+    ;; LDR PC-relative literal, SIMD&FP (C7.2.177)
+    (((? opc ldr-literal-simd/fp-size)
+      (? Vt vregister)
+      (@PCO (? offset signed-19)))
+     (BITS (2 opc)
+           (3 #b011)
+           (1 1)                        ;SIMD/FP
+           (2 #b00)
+           (19 offset SIGNED)
+           (5 Vt)))
+    (((? size) (? Vt) (@PCR (? label) (? temp register<31)))
+     (VARIABLE-WIDTH offset `(/ (- ,label *PC*) 4)
+       ((#x-40000 #x3ffff)
+        (MACRO 32 (LDR.V ,size ,Vt (@PCO (* 4 ,offset)))))
+       ((#x-100000000 #xffffffff)
+        (MACRO 64 (ADRP-ADD X ,temp (@PCO ,(* 4 offset)))) ;(*)
+        (MACRO 32 (LDR.V X ,Vt ,temp)))))))
+
+(let-syntax
+    ((define-aes-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic opcode) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? Rd vregister) (? Rn vregister))
+               (BITS (8 #b01001110)
+                     (2 #b00)           ;size
+                     (5 #b10100)
+                     (5 ,opcode)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-aes-instruction AESE   #b00100)
+  (define-aes-instruction AESD   #b00101)
+  (define-aes-instruction AESMC  #b00110)
+  (define-aes-instruction AESIMC #b00111))
+
+(let-syntax
+    ((define-sha-3reg-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic opcode) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? Rd vregister) (? Rn vregister) (? Rm vregister))
+               (BITS (8 #b01011110)
+                     (2 #b00)           ;size
+                     (1 0)
+                     (5 Rm)
+                     (1 0)
+                     (3 ,opcode)
+                     (2 #b00)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-sha-3reg-instruction SHA1C     #b000)
+  (define-sha-3reg-instruction SHA1P     #b001)
+  (define-sha-3reg-instruction SHA1M     #b010)
+  (define-sha-3reg-instruction SHA1SU0   #b011)
+  (define-sha-3reg-instruction SHA256H   #b100)
+  (define-sha-3reg-instruction SHA256H2  #b101)
+  (define-sha-3reg-instruction SHA256SU1 #b110))
+
+(let-syntax
+    ((define-sha-2reg-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic opcode) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? Rd vregister) (? Rn vregister))
+               (BITS (8 #b01011110)
+                     (2 #b00)           ;size
+                     (5 #b10100)
+                     (5 ,opcode)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-sha-2reg-instruction SHA1H     #b00000)
+  (define-sha-2reg-instruction SHA1SU1   #b00001)
+  (define-sha-2reg-instruction SHA256SU0 #b00010))
+
+(let-syntax
+    ((define-fp-unary-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic opcode-s U a opcode-v) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              ;; scalar
+              (((? type fp-scalar-size) (? Rd vregister) (? Rn vregister))
+               (BITS (1 0)              ;M=0
+                     (1 0)              ;?
+                     (1 0)              ;S=0
+                     (5 #b11110)
+                     (2 type)
+                     (1 1)
+                     (6 ,opcode-s)
+                     (5 #b10000)
+                     (5 Rn)
+                     (5 Rd)))
+              ;; vector
+              (((? sz:Q fp-vector-size)
+                (? Rd vregister)
+                (? Rn vregister))
+               (BITS (1 0)              ;M=0
+                     (1 (fix:and sz:Q 1)) ;Q
+                     (1 ,U)
+                     (5 #b01110)
+                     (1 ,a)
+                     (6 (fix:lsh sz:Q -1)) ;sz
+                     (5 ,opcode-v)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  ;; arithmetic
+  (define-fp-unary-instruction FABS   #b000001 0 1 #b01111)
+  (define-fp-unary-instruction FNEG   #b000010 1 1 #b01111)
+  (define-fp-unary-instruction FSQRT  #b000011 1 1 #b11111)
+  ;; rounding
+  (define-fp-unary-instruction FRINTN #b001000 0 0 #b11000)
+  (define-fp-unary-instruction FRINTP #b001001 0 1 #b11000)
+  (define-fp-unary-instruction FRINTM #b001010 0 0 #b11001)
+  (define-fp-unary-instruction FRINTZ #b001011 0 1 #b11001)
+  (define-fp-unary-instruction FRINTA #b001100 1 0 #b11000)
+  (define-fp-unary-instruction FRINTX #b001110 1 0 #b11001)
+  (define-fp-unary-instruction FRINTI #b001111 1 1 #b11001))
+
+(let-syntax
+    ((define-fp-binary-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic opcode-s U a S opcode-v) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? type fp-scalar-size)
+                (? Rd vregister)
+                (? Rn vregister)
+                (? Rm vregister))
+               (BITS (1 0)              ;M
+                     (1 0)
+                     (1 0)              ;S
+                     (5 #b11110)
+                     (2 type)
+                     (1 1)
+                     (5 Rm)
+                     (4 ,opcode-s)
+                     (2 #b10)
+                     (5 Rn)
+                     (5 Rd)))
+              (((? Q fp16-vector-size)
+                (? Rd vregister)
+                (? Rn vregister)
+                (? Rm vregister))
+               (BITS (1 0)
+                     (1 Q)
+                     (1 ,U)
+                     (5 #b01110)
+                     (1 ,a)
+                     (1 0)
+                     (1 1)
+                     (5 Rm)
+                     (2 #b00)
+                     (4 ,opcode-v)
+                     (5 Rn)
+                     (5 Rd)))
+              (((? sz:Q fp32/64-vector-size)
+                (? Rd vregister)
+                (? Rn vregister)
+                (? Rm vregister))
+               (BITS (1 0)
+                     (1 (fix:and sz:Q 1))  ;Q
+                     (1 ,U)
+                     (5 #b01110)
+                     (1 ,S)
+                     (1 (fix:lsh sz:Q -1)) ;sz
+                     (1 1)
+                     (5 Rm)
+                     (2 #b11)
+                     (4 ,opcode-v)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-fp-binary-instruction FMUL   #b0000 1 0 0 #b011)
+  (define-fp-binary-instruction FDIV   #b0001 1 0 0 #b111)
+  (define-fp-binary-instruction FADD   #b0010 0 0 0 #b010)
+  (define-fp-binary-instruction FSUB   #b0011 0 1 1 #b010)
+  (define-fp-binary-instruction FMAX   #b0100 0 0 0 #b110)
+  (define-fp-binary-instruction FMIN   #b0101 0 1 1 #b110)
+  (define-fp-binary-instruction FMAXNM #b0110 0 0 0 #b000)
+  (define-fp-binary-instruction FMINNM #b0111 0 1 1 #b000))
+
+(let-syntax
+    ((define-fp-compare-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic opc-hi) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? type fp-scalar-size) (? Rn vregister) Z)
+               (BITS (1 0)              ;M
+                     (1 0)
+                     (1 0)              ;S
+                     (5 #b11110)
+                     (2 type)
+                     (1 1)
+                     (5 #b0000)         ;Rm = #0.0
+                     (2 #b00)
+                     (4 #b1000)
+                     (5 Rn)
+                     (1 ,opc-hi)
+                     (1 1)              ;zero variant
+                     (3 #b000)))
+              (((? type fp-scalar-size) (? Rn vregister) (? Rm vregister))
+               (BITS (1 0)              ;M
+                     (1 0)
+                     (1 0)              ;S
+                     (5 #b11110)
+                     (2 type)
+                     (1 1)
+                     (5 Rm)
+                     (2 0)
+                     (4 #b1000)
+                     (5 Rn)
+                     (1 ,opc-hi)
+                     (1 0)              ;register variant
+                     (3 #b000)))))))))
+  ;; quiet compare
+  (define-fp-compare-instruction FCMP 0)
+  ;; compare and raise exceptions
+  (define-fp-compare-instruction FCMPE 1))
+
+(let-syntax
+    ((define-fp-conditional-compare-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic op) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? type fp-scalar-size)
+                (? Rn vregister)
+                (? Rm vregister)
+                (&U (? nzcv nzcv-value))
+                (? cc branch-condition))
+               (BITS (1 0)              ;M
+                     (1 0)
+                     (1 0)              ;S
+                     (5 #b11110)
+                     (2 type)
+                     (1 1)
+                     (5 Rm)
+                     (4 cc)
+                     (2 #b01)
+                     (5 Rn)
+                     (1 ,op)
+                     (4 nzcv)))))))))
+  ;; quiet compare
+  (define-fp-conditional-compare-instruction FCCMP 0)
+  ;; compare and raise exceptions
+  (define-fp-conditional-compare-instruction FCCMPE 1))
+
+(let-syntax
+    ((define-simd-byte-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment
+         (receive (mnemonic U opc2 opcode) (apply values (cdr form))
+           `(define-instruction ,mnemonic
+              (((? Q simd-byte-vector-size)
+                (? Rd vregister)
+                (? Rn vregister)
+                (? Rm vregister))
+               (BITS (1 0)
+                     (1 Q)
+                     (1 ,U)
+                     (5 #b01110)
+                     (2 ,opc2)
+                     (1 1)
+                     (5 Rm)
+                     (5 ,opcode)
+                     (1 1)
+                     (5 Rn)
+                     (5 Rd)))))))))
+  (define-simd-byte-instruction BSL 1 #b01 #b00011)
+  (define-simd-byte-instruction BIT 1 #b10 #b00011))
+
+(define-instruction FMOV
+  ;; vector, immediate (C7.2.122)
+  (((? U:op:Q fmov-vector-immediate-size)
+    (? Rd vregister)
+    (&U (? abc fmov-abc) (? defgh fmov-defgh)))
+   (BITS (1 0)
+         (1 (fix:and 1 U:op:Q))
+         (1 (fix:and 1 (fix:lsh U:op:Q -2)))
+         (5 #b01111)
+         (5 #b00000)
+         (3 abc)
+         (4 #b1111)
+         (1 (fix:and 1 (fix:lsh U:op:Q -1)))
+         (1 1)
+         (5 defgh)
+         (5 Rd)))
+  (((? U:op:Q fmov-vector-immediate-size)
+    (? Rd vregister)
+    (&. (? abcdefgh fmov-abcdefgh)))
+   (BITS (1 0)
+         (1 (fix:and 1 U:op:Q))
+         (1 (fix:and 1 (fix:lsh U:op:Q -2)))
+         (5 #b01111)
+         (5 #b00000)
+         (3 (fix:lsh abcdefgh -5))
+         (4 #b1111)
+         (1 (fix:and 1 (fix:lsh U:op:Q -1)))
+         (1 1)
+         (5 (fix:and #b11111 abcdefgh))
+         (5 Rd)))
+  ;; register (C7.2.123)
+  (((? type fp-scalar-size) (? Rd vregister) (? Rn vregister))
+   (BITS (1 0)
+         (1 0)
+         (1 0)
+         (5 #b11110)
+         (2 type)
+         (1 1)
+         (4 #b0000)
+         (2 #b00)                       ;opc
+         (5 #b10000)
+         (5 Rn)
+         (5 Rd)))
+  ;; general (C7.2.124)
+  (((? sf sf-size) (? Rd register-31=z)
+    (? rmode:type fmov-general-size) (? Rn vregister))
+   (BITS (1 sf)
+         (1 0)
+         (1 0)
+         (5 #b11110)
+         (2 (fix:and #b11 rmode:type))
+         (1 1)
+         (2 (fix:lsh rmode:type -2))
+         (3 #b110)                      ;opcode
+         (6 #b000000)
+         (5 Rn)
+         (5 Rd)))
+  (((? rmode:type fmov-general-size) (? Rd vregister)
+    (? sf sf-size) (? Rn register-31=z))
+   (BITS (1 sf)
+         (1 0)
+         (1 0)
+         (5 #b11110)
+         (2 (fix:and #b11 rmode:type))
+         (1 1)
+         (2 (fix:lsh rmode:type -2))
+         (3 #b111)                      ;opcode
+         (6 #b000000)
+         (5 Rn)
+         (5 Rd)))
+  ;; scalar, immediate (C7.2.125)
+  (((? type fp-scalar-size) (? Rd vregister) (&U (? imm8 unsigned-8)))
+   (BITS (1 0)
+         (1 0)
+         (1 0)
+         (5 #b11110)
+         (2 type)
+         (1 1)
+         (8 imm8)
+         (3 #b100)
+         (5 #b00000)
+         (5 Rd)))
+  (((? type fp-scalar-size) (? Rd vregister) (&. (? imm8 fp-binary8)))
+   (BITS (1 0)
+         (1 0)
+         (1 0)
+         (5 #b11110)
+         (2 type)
+         (1 1)
+         (8 imm8)
+         (3 #b100)
+         (5 #b00000)
+         (5 Rd))))
diff --git a/src/compiler/machines/aarch64/insutl.scm b/src/compiler/machines/aarch64/insutl.scm
index ad91370b9..9307fc270 100644
--- a/src/compiler/machines/aarch64/insutl.scm
+++ b/src/compiler/machines/aarch64/insutl.scm
@@ -98,6 +98,11 @@ USA.
        (<= 0 x #x7f)
        x))
 
+(define (unsigned-8 x)
+  (and (exact-nonnegative-integer? x)
+       (<= 0 x #xff)
+       x))
+
 (define (unsigned-12 x)
   (and (exact-nonnegative-integer? x)
        (<= 0 x #xfff)
@@ -154,6 +159,11 @@ USA.
     ((NV) 'AL)
     (else #f)))
 
+(define (nzcv-value x)
+  (and (exact-integer? x)
+       (<= 0 x #xf)
+       x))
+
 (define (sf-size size)
   (case size
     ((W) 0)
@@ -221,6 +231,13 @@ USA.
     ((X) #b01)
     (else #f)))
 
+(define (ldr-literal-simd/fp-size t)
+  (case t
+    ((S) #b00)
+    ((D) #b01)
+    ((Q) #b10)
+    (else #f)))
+
 (define (load/store-extend-type t)
   (case t
     ((UTXW) #b010)
@@ -429,3 +446,103 @@ USA.
     ((H) #b01)
     ((W) #b10)
     (else #f)))
+
+(define (fp-scalar-size t)
+  (case t
+    ((H) #b11)
+    ((S) #b00)
+    ((D) #b01)
+    (else #f)))
+
+(define (fp-vector-size t)
+  ;; Low bit is Q, next five bits are opcode, high bit is sz.
+  (case t
+    ((H4) #b1111000)
+    ((H8) #b1111001)
+    ((S2) #b0100000)
+    ((S4) #b0100001)
+    ((D2) #b1100001)
+    (else #f)))
+
+(define (fp16-vector-size t)
+  ;; Q
+  (case t
+    ((H4) 0)
+    ((H8) 1)
+    (else #f)))
+
+(define (fp32/64-vector-size t)
+  (case t
+    ((S2) #b00)
+    ((S4) #b01)
+    ((D2) #b11)
+    (else #f)))
+
+(define (simd-byte-vector-size t)
+  ;; Q
+  (case t
+    ((B8) 0)
+    ((B16) 1)
+    (else #f)))
+
+(define (simd-integer-vector-size t)
+  ;; Low bit is Q, high two bits are sz.
+  (case t
+    ((B8) #b000)
+    ((B16) #b001)
+    ((H4) #b010)
+    ((H8) #b011)
+    ((S2) #b100)
+    ((S4) #b101)
+    (else #f)))
+
+(define (simd-double-integer-vector-size t)
+  ;; Low bit is Q, high two bits are sz.
+  (case t
+    ((B8) #b000)
+    ((B16) #b001)
+    ((H4) #b010)
+    ((H8) #b011)
+    ((S2) #b100)
+    ((S4) #b101)
+    ((D2) #b111)
+    (else #f)))
+
+(define (fmov-vector-immediate-size t)
+  ;; #b{U}{op}{Q}
+  (case t
+    ((H4) #b100)
+    ((H8) #b101)
+    ((S2) #b000)
+    ((S4) #b001)
+    ((D2) #b010)
+    (else #f)))
+
+(define (fmov-general-size t)
+  ;; Low two bits are type; high two bits are rmode.
+  (case t
+    ((H) #b0011)
+    ((S) #b0000)
+    ((D) #b0001)
+    ((D1) #b0110)
+    (else #f)))
+
+(define (fmov-abc x)
+  (and (exact-nonnegative-integer? x)
+       (<= 0 x #b111)
+       x))
+
+(define (fmov-defgh x)
+  (and (exact-nonnegative-integer? x)
+       (<= 0 x #b11111)
+       x))
+
+;; XXX fixed-point
+(define (fmov-abcdefgh x)
+  x
+  #f)
+
+;; XXX 8-bit floating-point
+(define (fp-binary8 x)
+  x
+  #f)
diff --git a/src/compiler/machines/aarch64/lapgen.scm b/src/compiler/machines/aarch64/lapgen.scm
index ae9ad5595..9b5cff4db 100644
--- a/src/compiler/machines/aarch64/lapgen.scm
+++ b/src/compiler/machines/aarch64/lapgen.scm
@@ -78,6 +78,17 @@ USA.
         ((register-value-class=word? register) 'GENERAL)
         ((register-value-class=float? register) 'FLOAT)
         (else (error "Unknown register type:" register))))
+
+;;; Convert between RTL repersentation of machine (not pseudo)
+;;; registers, and LAP syntax for register numbers.
+
+(define (float-register->fpr register)
+  (assert (<= v0 register))
+  (assert (<= register v31))
+  (- register 32))
+
+(define (fpr->float-register fpr)
+  (+ fpr 32))
 
 ;;; References, for machine register allocator.  Not used by LAP
 ;;; syntax.
@@ -137,7 +148,9 @@ USA.
              (LAP (ADD X ,target ,source (&U 0)))
              (LAP (ORR X ,target Z ,source))))
         ((FLOAT)
-         (LAP (FMOV D ,target ,source)))
+         (let ((source (float-register->fpr source))
+               (target (float-register->fpr target)))
+           (LAP (FMOV D ,target ,source))))
         (else
          (error "Unknown register type:" source target)))))
 
@@ -162,13 +175,13 @@ USA.
 (define (load-register register ea)
   (case (register-type register)
     ((GENERAL) (LAP (LDR X ,register ,ea)))
-    ((FLOAT) (LAP (LDR D ,register ,ea)))
+    ((FLOAT) (LAP (LDR D ,(float-register->fpr register) ,ea)))
     (else (error "Unknown register type:" register))))
 
 (define (store-register register ea)
   (case (register-type register)
     ((GENERAL) (LAP (STR X ,register ,ea)))
-    ((FLOAT) (LAP (STR D ,register ,ea)))
+    ((FLOAT) (LAP (STR D ,(float-register->fpr register) ,ea)))
     (else (error "Unknown register type:" register))))
 
 ;;; Utilities
@@ -311,6 +324,10 @@ USA.
   (LAP ,@(load-pc-relative-address target label)
        (LDR X ,target ,target)))
 
+(define (load-pc-relative-float target temp label)
+  (LAP ,@(load-pc-relative-address temp label)
+       (LDR.V D ,target ,temp)))
+
 (define (load-tagged-immediate target type datum)
   (load-unsigned-immediate target (make-non-pointer-literal type datum)))
 
@@ -435,6 +452,28 @@ USA.
 (define (object->address target source)
   (object->datum target source))
 
+;;;; Data labels
+
+(define (allocate-data-label datum block-name offset alignment data)
+  (let* ((block
+          (or (find-extra-code-block block-name)
+              (let ((block
+                     (declare-extra-code-block! block-name 'ANYWHERE '())))
+                (add-extra-code!
+                 block
+                 (LAP (PADDING ,offset ,alignment ,padding-string)))
+                block)))
+         (pairs (extra-code-block/xtra block))
+         (place (assoc datum pairs)))
+    (if place
+        (cdr place)
+        (let ((label (generate-label block-name)))
+          (set-extra-code-block/xtra!
+           block
+           (cons (cons datum label) pairs))
+          (add-extra-code! block (LAP (LABEL ,label) ,@data))
+          label))))
+
 ;;;; Linearizer interface
 
 (define (lap:make-label-statement label)
diff --git a/src/compiler/machines/aarch64/machine.scm b/src/compiler/machines/aarch64/machine.scm
index a9f52876e..bab1e107c 100644
--- a/src/compiler/machines/aarch64/machine.scm
+++ b/src/compiler/machines/aarch64/machine.scm
@@ -503,6 +503,7 @@ USA.
 ;; XXX doesn't belong here
 
 (define-integrable type-code:fixnum #x1a)
+(define-integrable type-code:flonum #x06)
 (define-integrable type-code:manifest-closure #x0d)
 (define-integrable type-code:manifest-vector #x00)
 
@@ -519,7 +520,6 @@ USA.
   (object-datum object))
 
 (define compiler:open-code-floating-point-arithmetic?
-  ;; XXX not yet
   #f)
 
 (define compiler:primitives-with-no-open-coding
@@ -529,30 +529,17 @@ USA.
     DIVIDE-FIXNUM       ;nobody open-codes this
     FIXNUM-LSH          ;open-coding not useful without constant operands
     FLOATING-VECTOR-CONS;nobody open-codes this
-    FLOATING-VECTOR-REF ;no flonum arithmetic yet
-    FLOATING-VECTOR-SET!;no flonum arithmetic yet
-    FLONUM-ABS          ;no flonum arithmetic yet
     FLONUM-ACOS         ;not useful to open-code hairy math
-    FLONUM-ADD          ;no flonum arithmetic yet
     FLONUM-ASIN         ;not useful to open-code hairy math
     FLONUM-ATAN         ;not useful to open-code hairy math
     FLONUM-ATAN2        ;not useful to open-code hairy math
-    FLONUM-CEILING      ;no flonum arithmetic yet
     FLONUM-COS          ;not useful to open-code hairy math
-    FLONUM-DIVIDE       ;no flonum arithmetic yet
     FLONUM-EXP          ;not useful to open-code hairy math
     FLONUM-EXPM1        ;not useful to open-code hairy math
-    FLONUM-FLOOR        ;no flonum arithmetic yet
     FLONUM-LOG          ;not useful to open-code hairy math
     FLONUM-LOG1P        ;not useful to open-code hairy math
-    FLONUM-MULTIPLY     ;no flonum arithmetic yet
-    FLONUM-NEGATE       ;no flonum arithmetic yet
-    FLONUM-ROUND        ;no flonum arithmetic yet
     FLONUM-SIN          ;not useful to open-code hairy math
-    FLONUM-SQRT         ;no flonum arithmetic yet
-    FLONUM-SUBTRACT     ;no flonum arithmetic yet
     FLONUM-TAN          ;not useful to open-code hairy math
-    FLONUM-TRUNCATE     ;no flonum arithmetic yet
     GCD-FIXNUM          ;nobody open-codes this
     VECTOR-CONS         ;nobody open-codes this
     ))
diff --git a/src/compiler/machines/aarch64/rules2.scm b/src/compiler/machines/aarch64/rules2.scm
index bf455ead8..6107f2006 100644
--- a/src/compiler/machines/aarch64/rules2.scm
+++ b/src/compiler/machines/aarch64/rules2.scm
@@ -115,6 +115,20 @@ USA.
    (lambda (label) (LAP (CBZ X ,source (@PCR ,label ,regnum:scratch-0))))
    (lambda (label) (LAP (CBNZ X ,source (@PCR ,label ,regnum:scratch-0))))))
 
+(define (set-test-bit-set-branches! source bit)
+  (set-current-branches!
+   (lambda (label)
+     (LAP (TBNZ X ,source (&U ,bit) (@PCR ,label ,regnum:scratch-0))))
+   (lambda (label)
+     (LAP (TBZ X ,source (&U ,bit) (@PCR ,label ,regnum:scratch-0))))))
+
+(define (set-test-bit-clear-branches! source bit)
+  (set-current-branches!
+   (lambda (label)
+     (LAP (TBZ X ,source (&U ,bit) (@PCR ,label ,regnum:scratch-0))))
+   (lambda (label)
+     (LAP (TBNZ X ,source (&U ,bit) (@PCR ,label ,regnum:scratch-0))))))
+
 (define (set-condition-branches! cc ~cc)
   (set-current-branches!
    (lambda (label) (LAP (B. ,cc (@PCR ,label ,regnum:scratch-0))))
diff --git a/src/compiler/machines/aarch64/rulflo.scm b/src/compiler/machines/aarch64/rulflo.scm
index a73d1b7c3..6924bd56d 100644
--- a/src/compiler/machines/aarch64/rulflo.scm
+++ b/src/compiler/machines/aarch64/rulflo.scm
@@ -29,6 +29,379 @@ USA.
 
 (declare (usual-integrations))
 
-;;; XXX not yet
+;; register: RTL representation of machine (not pseudo) register
+;; fpr: LAP representation of SIMD/FP register
+;;
+;; (not needed for general because the numbers coincide)
 
-#f
+(define (float-source-register! register)
+  (assert (eq? 'FLOAT (register-type register)))
+  (load-alias-register! register 'FLOAT))
+
+(define (float-source-fpr! register)
+  (float-register->fpr (float-source-register! register)))
+
+(define (float-move-to-temporary-register! register)
+  (assert (eq? 'FLOAT (register-type register)))
+  (move-to-temporary-register! register 'FLOAT))
+
+(define (float-move-to-temporary-fpr! register)
+  (float-register->fpr (float-move-to-temporary-register! register)))
+
+(define (float-target-register! register)
+  (assert (eq? 'FLOAT (register-type register)))
+  (delete-dead-registers!)
+  (allocate-alias-register! register 'FLOAT))
+
+(define (float-target-fpr! register)
+  (float-register->fpr (float-target-register! register)))
+
+(define (float-temporary-register!)
+  (allocate-temporary-register! 'FLOAT))
+
+(define (float-temporary-fpr!)
+  (float-register->fpr (float-temporary-register!)))
+
+(define (allocate-binary64-label bit-string)
+  (allocate-data-label bit-string 'BINARY64 0 8
+    (LAP (DATA 64 U ,(bit-string->unsigned-integer bit-string)))))
+
+(define binary64-bits:+inf
+  (unsigned-integer->bit-string 64 #x7ff0000000000000))
+
+(define binary64-bits:smallest-normal
+  (unsigned-integer->bit-string 64 #x0010000000000000))
+
+;; Layout:
+;;
+;;      0       manifest-nm-vector [length = 1 word]
+;;      8       double (binary64) float
+;;      16
+
+;;;; Flonum Loads & Stores
+
+;; Float load/store.
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target)) (OBJECT->FLOAT (REGISTER (? source))))
+  (let* ((source (standard-source! source))
+         (target (float-target-fpr! target))
+         (temp regnum:scratch-0))
+    (LAP ,@(object->address temp source)
+         (LDR.V D ,target (+ ,temp (&U (* 8 1)))))))
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target)) (FLOAT->OBJECT (REGISTER (? source))))
+  (let* ((source (float-source-fpr! source))
+         (target (standard-target! target))
+         (temp regnum:scratch-0)
+         (Free regnum:free-pointer))
+    (LAP
+     ;; target := Free
+     ,@(register->register-transfer Free target)
+     ;; temp := manifest-nm-vector, length = 1 word
+     ,@(load-tagged-immediate temp (ucode-type MANIFEST-NM-VECTOR) 1)
+     ;; *Free++ := temp
+     (STR X ,temp (POST+ ,Free (& 8)))
+     ;; *Free++ := source
+     (STR.V D ,source (POST+ ,Free (& 8)))
+     ;; Tag it.
+     ,@(affix-type target type-code:flonum target (lambda () temp)))))
+
+;; Constant-offset floating vector load/store.
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLOAT-OFFSET (REGISTER (? base))
+                        (MACHINE-CONSTANT (? offset))))
+  (QUALIFIER (fits-in-unsigned-12? offset))
+  (let* ((base (standard-source! base))
+         (target (float-target-fpr! target)))
+    (LAP (LDR.V D ,target (+ ,base (&U (* 8 ,offset)))))))
+
+(define-rule statement
+  (ASSIGN (FLOAT-OFFSET (REGISTER (? base))
+                        (MACHINE-CONSTANT (? offset)))
+          (REGISTER (? source)))
+  (QUALIFIER (fits-in-unsigned-12? offset))
+  (let* ((base (standard-source! base))
+         (source (float-source-fpr! source)))
+    (LAP (STR.V D ,source (+ ,base (&U (* 8 ,offset)))))))
+
+;; Variable-offset floating vector load/store.
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLOAT-OFFSET (REGISTER (? base))
+                        (REGISTER (? offset))))
+  (let* ((base (standard-source! base))
+         (offset (standard-source! offset))
+         (target (float-target-fpr! target)))
+    (LAP (LDR.V D ,target (+ ,base (LSL ,offset 3))))))
+
+(define-rule statement
+  (ASSIGN (FLOAT-OFFSET (REGISTER (? base))
+                        (REGISTER (? offset)))
+          (REGISTER (? source)))
+  (let* ((base (standard-source! base))
+         (offset (standard-source! offset))
+         (source (float-source-fpr! source)))
+    (LAP (STR.V D ,source (+ ,base (LSL ,offset 3))))))
+
+;;;; Flonum Arithmetic -- unary operations
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-1-ARG (? operator) (REGISTER (? source)) (? overflow?)))
+  overflow?                             ;ignore
+  (let* ((source (float-source-fpr! source))
+         (target (float-target-fpr! target)))
+    ((flonum-1-arg/operator operator) target source)))
+
+(define-integrable (flonum-1-arg/operator operation)
+  (lookup-arithmetic-method operation flonum-methods/1-arg))
+
+(define flonum-methods/1-arg
+  (list 'FLONUM-METHODS/1-ARG))
+
+(define-arithmetic-method 'FLONUM-ABS flonum-methods/1-arg
+  (lambda (target source)
+    (LAP (FABS D ,target ,source))))
+
+(define-arithmetic-method 'FLONUM-NEGATE flonum-methods/1-arg
+  (lambda (target source)
+    (LAP (FNEG D ,target ,source))))
+
+(define-arithmetic-method 'FLONUM-SQRT flonum-methods/1-arg
+  (lambda (target source)
+    (LAP (FSQRT D ,target ,source))))
+
+(define-arithmetic-method 'FLONUM-CEILING flonum-methods/1-arg
+  (lambda (target source)
+    ;; round toward Plus infinity
+    (LAP (FRINTP D ,target ,source))))
+
+(define-arithmetic-method 'FLONUM-FLOOR flonum-methods/1-arg
+  (lambda (target source)
+    ;; round toward Minus infinity
+    (LAP (FRINTM D ,target ,source))))
+
+(define-arithmetic-method 'FLONUM-ROUND flonum-methods/1-arg
+  (lambda (target source)
+    ;; round to Nearest (ties to even)
+    (LAP (FRINTN D ,target ,source))))
+
+(define-arithmetic-method 'FLONUM-TRUNCATE flonum-methods/1-arg
+  (lambda (target source)
+    ;; round toward Zero
+    (LAP (FRINTZ D ,target ,source))))
+
+;;;; Flonum Arithmetic -- binary operations
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-2-ARGS (? operator)
+                         (REGISTER (? source1))
+                         (REGISTER (? source2))
+                         (? overflow?)))
+  overflow?                             ;ignore
+  ((flonum-2-args/operator operator) target source1 source2))
+
+(define flonum-methods/2-args
+  (list 'FLONUM-METHODS/2-ARGS))
+
+(define-integrable (flonum-2-args/operator operator)
+  (lookup-arithmetic-method operator flonum-methods/2-args))
+
+(define ((flonum-2-args/standard operate) target source1 source2)
+  (let* ((source1 (float-source-fpr! source1))
+         (source2 (float-source-fpr! source2))
+         (target (float-target-fpr! target)))
+    (operate target source1 source2)))
+
+(define ((flonum-2-args/temporary operate) target source1 source2)
+  ;; Allocate a temporary register that we can write to without
+  ;; destroying source1 and source2; then assign it as an alias for the
+  ;; target at the end.
+  (let* ((source1 (float-source-fpr! source1))
+         (source2 (float-source-fpr! source2))
+         (temp-register (float-temporary-register!)))
+    (delete-dead-registers!)
+    (rtl-target:=machine-register! target temp-register)
+    (operate (float-register->fpr temp-register) source1 source2)))
+
+(define-arithmetic-method 'FLONUM-ADD flonum-methods/2-args
+  (flonum-2-args/standard
+   (lambda (target source1 source2)
+     (LAP (FADD D ,target ,source1 ,source2)))))
+
+(define-arithmetic-method 'FLONUM-SUBTRACT flonum-methods/2-args
+  (flonum-2-args/standard
+   (lambda (target source1 source2)
+     (LAP (FSUB D ,target ,source1 ,source2)))))
+
+(define-arithmetic-method 'FLONUM-MULTIPLY flonum-methods/2-args
+  (flonum-2-args/standard
+   (lambda (target source1 source2)
+     (LAP (FMUL D ,target ,source1 ,source2)))))
+
+(define-arithmetic-method 'FLONUM-DIVIDE flonum-methods/2-args
+  (flonum-2-args/standard
+   (lambda (target source1 source2)
+     (LAP (FDIV D ,target ,source1 ,source2)))))
+
+(define-arithmetic-method 'FLONUM-COPYSIGN flonum-methods/2-args
+  (flonum-2-args/temporary
+   (lambda (target source1 source2)
+     (let ((tempx (general-temporary!)))
+       ;; XXX do this without a temporary general register -- MOVI?
+       (LAP (MOVZ X ,tempx (LSL (&U #x8000) 48))
+            (FMOV D ,target X ,tempx)
+            ;; target[i] := source2[i] if signbit[i] else source1[i]
+            (BSL B8 ,target ,source2 ,source1))))))
+
+;;;; Flonum Predicates
+
+(define-rule predicate
+  (FLONUM-PRED-1-ARG (? predicate) (REGISTER (? source)))
+  (let* ((predicate (flonum-pred-1->2-args predicate))
+         (source1 (float-source-fpr! source))
+         (source2 'Z))
+    (float-branch! predicate)
+    (float-comparison predicate source1 source2)))
+
+(define (flonum-pred-1->2-args predicate)
+  (case predicate
+    ((FLONUM-NEGATIVE?) 'FLONUM-LESS?)
+    ((FLONUM-IS-ZERO?) 'FLONUM-IS-EQUAL?)
+    ((FLONUM-ZERO?) 'FLONUM-EQUAL?)
+    ((FLONUM-POSITIVE?) 'FLONUM-POSITIVE?)
+    (else (error "Invalid flonum-pred-1-arg:" predicate))))
+
+(define-rule predicate
+  (FLONUM-PRED-1-ARG FLONUM-IS-NEGATIVE? (REGISTER (? source)))
+  (let* ((source (float-source-fpr! source))
+         ;; Avoid scratch-0, which may be used in the branches.
+         (temp regnum:scratch-1))
+    (set-test-bit-set-branches! temp 63)
+    (LAP (FMOV X ,temp D ,source))))
+
+(define-rule predicate
+  (FLONUM-PRED-1-ARG FLONUM-IS-NAN? (REGISTER (? source)))
+  (let ((source (float-source-fpr! source)))
+    (set-condition-branches! 'VS 'VC)
+    (LAP (FCMP D ,source ,source))))
+
+(define-rule predicate
+  (FLONUM-PRED-1-ARG FLONUM-IS-INFINITE? (REGISTER (? source)))
+  ;; Discriminate on |x| = +inf, versus |x| =/= +inf or unordered.
+  (set-condition-branches! 'EQ 'NE)
+  (let ((source (float-source-fpr! source)))
+    (delete-dead-registers!)
+    (let* ((temp-source (float-temporary-fpr!))
+           (temp-inf (float-temporary-fpr!))
+           (label (allocate-binary64-label binary64-bits:+inf)))
+      (LAP (FABS D ,temp-source ,source)
+           ,@(load-pc-relative-float temp-inf regnum:scratch-0 label)
+           (FCMP D ,temp-source ,temp-inf)))))
+
+(define-rule predicate
+  (FLONUM-PRED-1-ARG FLONUM-IS-FINITE? (REGISTER (? source)))
+  ;; Discriminate on |x| < +inf, versus |x| >= +inf or unordered.
+  (set-condition-branches! 'LO 'HS)     ;LO = lt; HS = gt, eq, or un
+  (let ((source (float-source-fpr! source)))
+    (delete-dead-registers!)
+    (let* ((temp-source (float-temporary-fpr!))
+           (temp-inf (float-temporary-fpr!))
+           (label (allocate-binary64-label binary64-bits:+inf)))
+      (LAP (FABS D ,temp-source ,source)
+           ,@(load-pc-relative-float temp-inf regnum:scratch-0 label)
+           (FCMP D ,temp-source ,temp-inf)))))
+
+(define-rule predicate
+  (FLONUM-PRED-1-ARG FLONUM-IS-NORMAL? (REGISTER (? source)))
+  ;; Break it into two steps:
+  ;;
+  ;;    1. Is |x| < +inf, versus |x| >= +inf or unordered?
+  ;;    2. Is |x| >= smallest normal, versus |x| < smallest normal?
+  ;;
+  ;; The branches are:
+  ;;
+  ;;    GE: gt or eq, when n = v
+  ;;    LO: lt, when c = 0
+  ;;
+  (set-condition-branches! 'GE 'LO)
+  (let ((source (float-source-fpr! source)))
+    (delete-dead-registers!)
+    (let* ((temp-source (float-temporary-fpr!))
+           (temp-inf (float-temporary-fpr!))
+           (temp-norm (float-temporary-fpr!))
+           (label-inf (allocate-binary64-label binary64-bits:+inf))
+           (label-norm
+            (allocate-binary64-label binary64-bits:smallest-normal)))
+      ;; XXX Use LDP to load both registers in one go.
+      (LAP (FABS D ,temp-source ,source)
+           ,@(load-pc-relative-float temp-inf regnum:scratch-0 label-inf)
+           ,@(load-pc-relative-float temp-norm regnum:scratch-0 label-norm)
+           (FCMP D ,temp-source ,temp-inf)
+           ;; If |x| < +inf, compare source and norm; otherwise, if |x|
+           ;; >= +inf or |x| and +inf are unordered, set NCVB := 1000
+           ;; so that GE will be not-taken and LO will be taken.
+           (FCCMP D ,temp-source ,temp-norm (&U #b1000) LT)))))
+
+(define-rule predicate
+  (FLONUM-PRED-2-ARGS (? predicate)
+                      (REGISTER (? source1))
+                      (REGISTER (? source2)))
+  (let* ((source1 (float-source-fpr! source1))
+         (source2 (float-source-fpr! source2)))
+    (float-branch! predicate)
+    (float-comparison predicate source1 source2)))
+
+(define (float-branch! predicate)
+  (case predicate
+    ((FLONUM-EQUAL? FLONUM-IS-EQUAL?)
+     (set-equal-branches!))
+    ((FLONUM-LESS? FLONUM-IS-LESS?)
+     (set-condition-branches! 'LO 'HS)) ;LO = lt; HS = gt, eq, or un
+    ((FLONUM-GREATER? FLONUM-IS-GREATER?)
+     (set-condition-branches! 'GT 'LE)) ;LE = le, eq, or un
+    ((FLONUM-IS-LESS-OR-EQUAL?)         ;XXX FLONUM-LESS-OR-EQUAL?
+     (set-condition-branches! 'LS 'HI)) ;LS = le or eq; HI = gt or un
+    ((FLONUM-IS-GREATER-OR-EQUAL?)      ;XXX FLONUM-GREATER-OR-EQUAL?
+     (set-condition-branches! 'GE 'LT)) ;LT = lt or un
+    ((FLONUM-IS-LESS-OR-GREATER?)       ;XXX FLONUM-LESS-OR-GREATER?
+     (set-current-branches!
+      (lambda (label)                   ;branch to label if less or greater
+        (let ((unordered (generate-label 'UNORDERED)))
+          (LAP
+           ;; branch if unordered (V=1)
+           (B. VS (@PCR ,unordered ,regnum:scratch-0))
+           ;; branch if less or greater
+           (B. NE (@PCR ,label ,regnum:scratch-0))
+           (LABEL ,unordered))))
+      (lambda (label)                   ;branch to label if NaN or equal
+        (LAP
+         ;; branch if unordered (V=1)
+         (B. VS (@PCR ,label ,regnum:scratch-0))
+         ;; branch if equal
+         (B. EQ (@PCR ,label ,regnum:scratch-0))))))
+    ((FLONUM-IS-UNORDERED?)
+     (set-condition-branches! 'VS 'VC))
+    (else
+     (error "Unknown float predicate:" predicate))))
+
+(define (float-comparison predicate source1 source2)
+  (case predicate
+    ((FLONUM-EQUAL? FLONUM-LESS? FLONUM-GREATER?)
+     ;; floating-point compare with exception
+     (LAP (FCMPE D ,source1 ,source2)))
+    ((FLONUM-IS-EQUAL?
+      FLONUM-IS-LESS?
+      FLONUM-IS-GREATER?
+      FLONUM-IS-LESS-OR-EQUAL?
+      FLONUM-IS-GREATER-OR-EQUAL?
+      FLONUM-IS-LESS-OR-GREATER?
+      FLONUM-IS-UNORDERED?)
+     ;; floating-point quiet compare
+     (LAP (FCMP D ,source1 ,source2)))))
-- 
2.25.1