From 77873d2cea82ae189b2f2a7cc996b62e1d798974 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Wed, 11 Nov 2009 15:24:13 -0500
Subject: [PATCH] Open-code floating-point primitives on AMD x86-64.

Only lightly tested.  Use with caution.  Slippery when wet.  Keep out
of reach of children.
---
 src/compiler/machines/x86-64/insmac.scm |  49 +-
 src/compiler/machines/x86-64/instrf.scm | 984 +++++++++++++++--------
 src/compiler/machines/x86-64/insutl.scm | 116 ++-
 src/compiler/machines/x86-64/lapgen.scm | 205 ++---
 src/compiler/machines/x86-64/machin.scm | 146 ++--
 src/compiler/machines/x86-64/rules1.scm |  68 +-
 src/compiler/machines/x86-64/rulfix.scm |  59 +-
 src/compiler/machines/x86-64/rulflo.scm | 998 +++++++-----------------
 src/compiler/machines/x86-64/rulrew.scm | 115 +--
 src/microcode/cmpauxmd/x86-64.m4        | 233 ++++--
 10 files changed, 1517 insertions(+), 1456 deletions(-)
diff --git a/src/compiler/machines/x86-64/insmac.scm b/src/compiler/machines/x86-64/insmac.scm
index 9f8dede03..559354bdd 100644
--- a/src/compiler/machines/x86-64/insmac.scm
+++ b/src/compiler/machines/x86-64/insmac.scm
@@ -74,15 +74,20 @@ USA.
   (sc-macro-transformer
    (lambda (form environment)
      environment
-     (if (syntax-match? '(IDENTIFIER ? SYMBOL) (cdr form))
+     (if (syntax-match? '(IDENTIFIER * SYMBOL) (cdr form))
 	 `(DEFINE (,(cadr form) EXPRESSION)
 	    (LET ((MATCH-RESULT (PATTERN-LOOKUP ,ea-database-name EXPRESSION)))
 	      (AND MATCH-RESULT
-		   ,(if (pair? (cddr form))
-			`(LET ((EA (MATCH-RESULT)))
-			   (AND (MEMQ ',(caddr form) (EA/CATEGORIES EA))
-				EA))
-			`(MATCH-RESULT)))))
+		   ,(let ((categories (cddr form)))
+		      (if (pair? categories)
+			  `(LET ((EA (MATCH-RESULT)))
+			     (AND
+			      (OR
+			       ,@(map (lambda (category)
+					`(MEMQ ',category (EA/CATEGORIES EA)))
+				      categories))
+			      EA))
+			  `(MATCH-RESULT))))))
 	 (ill-formed-syntax form)))))
 
 (define (parse-categories categories environment context)
@@ -90,7 +95,7 @@ USA.
   (if (not (and (pair? categories)
 		(eq? 'CATEGORIES (car categories))
 		(pair? (cdr categories))
-		(memq (cadr categories) '(REGISTER MEMORY))
+		(memq (cadr categories) '(REGISTER MEMORY XMM))
 		(null? (cddr categories))))
       (error "Malformed CATEGORIES for effective address rule:"
 	     categories
@@ -215,24 +220,38 @@ USA.
 	(values tail 0))))
 
 (define (collect-prefix options tail environment)
-  (let loop ((options options) (operand #f) (register #f) (r/m #f))
+  (let loop ((options options) (operand #f) (register #f) (r/m #f) (float #f))
     (if (pair? options)
 	(case (caar options)
-	  ((OPERAND) (loop (cdr options) (cadar options) register r/m))
+	  ((OPERAND) (loop (cdr options) (cadar options) register r/m float))
 	  ((OPCODE-REGISTER)
 	   (loop (cdr options)
 		 operand
 		 (or (not (pair? (cdar options))) (cadar options))
-		 r/m))
+		 r/m
+		 float))
 	  ((ModR/M)
 	   ;; (ModR/M <r/m>), for fixed digits
 	   ;; (ModR/M <reg> <r/m>), for registers
-	   (if (pair? (cddar options))
-	       (loop (cdr options) operand (cadar options) (caddar options))
-	       (loop (cdr options) operand #f (cadar options))))
+	   (receive (register r/m)
+	       (if (pair? (cddar options))
+		   (values (cadar options) (caddar options))
+		   (values #f (cadar options)))
+	     (loop (cdr options) operand register r/m float)))
+	  ((FLOAT)
+	   ;; (FLOAT <scalar/packed> <single/double>)
+	   (loop (cdr options) operand register r/m (cdar options)))
 	  (else (error "Bad instruction prefix option:" (car options))))
-	(let ((cons-prefix (close-syntax 'CONS-PREFIX environment)))
-	  `(,cons-prefix ,operand ,register ,r/m ,tail)))))
+	(if float
+	    (let ((cons-float-prefix
+		   (close-syntax 'CONS-FLOAT-PREFIX environment)))
+	      (if operand
+		  (error "Float instructions can't have operand size prefix:"
+			 operand))
+	      `(,cons-float-prefix ,register ,r/m ,(car float) ,(cadr float)
+				   ,tail))
+	    (let ((cons-prefix (close-syntax 'CONS-PREFIX environment)))
+	      `(,cons-prefix ,operand ,register ,r/m ,tail))))))
 
 (define (collect-ModR/M field tail environment)
   (let ((digit-or-reg (car field))
diff --git a/src/compiler/machines/x86-64/instrf.scm b/src/compiler/machines/x86-64/instrf.scm
index bbad64cf6..7e3021d0b 100644
--- a/src/compiler/machines/x86-64/instrf.scm
+++ b/src/compiler/machines/x86-64/instrf.scm
@@ -23,319 +23,689 @@ USA.
 
 |#
 
-;;;; Intel i387/i486 Instruction Set
+;;;; AMD x86-64 128-bit Media Instruction Set
 ;;; package: (compiler lap-syntaxer)
 
-(declare (usual-integrations))
+;;; The mnemonics here don't entirely match the ones in the AMD
+;;; manual, or in your typical x86-64 assembler.  These mnemonics try
+;;; to adhere to a convention of treating operand sizes, precisions,
+;;; and packed/scalar choices as arguments to a common mnemonic, where
+;;; it is sensible for there to be a choice.  Sometimes this is not
+;;; entirely clear, such as PSHUF (which works only with longword-size
+;;; (32-bit) operands) and PSHUFH/PSHUFL (which work only with
+;;; word-size (16-bit) operands).  And sometimes this doesn't work
+;;; very well (e.g., MOVQ).  Most instructions for floating-point
+;;; arithmetic have F suffixed to their names; e.g., rather than
+;;; ADDSS, ADDSD, ADDPS, and ADDPD, there's a single ADDF mnemonic, to
+;;; be used as (ADDF S S ...), (ADDF S D ...), &c.
+;;;
+;;; Would it have been better just to transcribe exactly the mnemonics
+;;; in the AMD manual?  Perhaps, and it might have caused fewer errors
+;;; in transcription, since there would be fewer different formats
+;;; that way.
 
-#|
+(declare (usual-integrations))
 
 (let-syntax
-    ((define-binary-flonum
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (list-ref form 1))
-		(pmnemonic (list-ref form 2))
-		(imnemonic (list-ref form 3))
-		(digit (list-ref form 4))
-		(opcode1 (list-ref form 5))
-		(opcode2 (list-ref form 6)))
-	    `(begin
-	       (define-instruction ,mnemonic
-		 (((ST 0) (ST (? i)))
-		  (BYTE (8 #xd8)
-			(8 (+ ,opcode1 i))))
-
-		 (((ST (? i)) (ST 0))
-		  (BYTE (8 #xdc)
-			(8 (+ ,opcode2 i))))
-
-		 (()
-		  (BYTE (8 #xde)
-			(8 (+ ,opcode2 1))))
-
-		 ((D (? source mW))
-		  (BYTE (8 #xdc))
-		  (ModR/M ,digit source))
-
-		 ((S (? source mW))
-		  (BYTE (8 #xd8))
-		  (ModR/M ,digit source)))
-
-	       (define-instruction ,pmnemonic
-		 (((ST (? i)) (ST 0))
-		  (BYTE (8 #xde)
-			(8 (+ ,opcode2 i)))))
-
-	       (define-instruction ,imnemonic
-		 ((L (? source mW))
-		  (BYTE (8 #xda))
-		  (ModR/M ,digit source))
-
-		 ((H (? source mW))
-		  (BYTE (8 #xde))
-		  (ModR/M ,digit source)))))))))
-
-  ;; The i486 book (and 387, etc.) has inconsistent instruction
-  ;; descriptions and opcode assignments for FSUB and siblings,
-  ;; and FDIV and siblings.
-  ;; FSUB ST(i),ST is described as replacing ST(i) with ST-ST(i)
-  ;; while the opcode described replaces ST(i) with ST(i)-ST.
-
-  ;; In the following, the F% forms follow the descriptions in the
-  ;; book, namely, F%SUB computes ST-ST(i) and F%SUBR computes
-  ;; ST(i)-ST, storing into their destination (first) argument.
-
-  ;; The %-less forms follow the opcodes and usual convention,
-  ;; namely FSUB computes destination (first) argument - source
-  ;; argument FSUBR computes source - destination.
-
-  (define-binary-flonum FADD   FADDP   FIADD   0 #xc0 #xc0)
-  (define-binary-flonum F%DIV  F%DIVP  F%IDIV  6 #xf0 #xf0)
-  (define-binary-flonum F%DIVR F%DIVPR F%IDIVR 7 #xf8 #xf8)
-  (define-binary-flonum FDIV   FDIVP   FIDIV   6 #xf0 #xf8)
-  (define-binary-flonum FDIVR  FDIVPR  FIDIVR  7 #xf8 #xf0)
-  (define-binary-flonum FMUL   FMULP   FIMUL   1 #xc8 #xc8)
-  (define-binary-flonum F%SUB  F%SUBP  F%ISUB  4 #xe0 #xe0)
-  (define-binary-flonum F%SUBR F%SUBPR F%ISUBR 5 #xe8 #xe8)
-  (define-binary-flonum FSUB   FSUBP   FISUB   4 #xe0 #xe8)
-  (define-binary-flonum FSUBR  FSUBPR  FISUBR  5 #xe8 #xe0))
+    ((define-flop-instruction
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment                    ;ignore
+         (let ((mnemonic (cadr form))
+               (opcode (caddr form)))
+           `(define-instruction ,mnemonic
+              (((? p/s float-packed/scalar)
+                (? p float-precision)
+                (XMM (? target))
+                (? source xmm/m-ea))
+               (PREFIX (FLOAT p/s p) (ModR/M target source))
+               (BITS (8 #x0F)
+                     (8 ,opcode))
+               (ModR/M target source))))))))
+  (define-flop-instruction ADDF #x58)
+  (define-flop-instruction DIVF #x5E)
+  (define-flop-instruction MAXF #x5F)
+  (define-flop-instruction MINF #x5D)
+  (define-flop-instruction MULF #x59)
+  (define-flop-instruction SQRTF #x51)
+  (define-flop-instruction SUBF #x5C))
+
+(let-syntax ((define-packed-flop-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (opcode (caddr form)))
+                   `(define-instruction ,mnemonic
+                      ((P D (XMM (? target)) (? source xmm/m-ea))
+                       (PREFIX (ModR/M target source))
+                       (BITS (8 #x66)
+                             (8 #x0F)
+                             (8 #xD0))
+                       (ModR/M target source))
+
+                      ((P S (XMM (? target)) (? source xmm/m-ea))
+                       (PREFIX (ModR/M target source))
+                       (BITS (8 #xF2)
+                             (8 #x0F)
+                             (8 #xD0))
+                       (ModR/M target source))))))))
+  (define-packed-flop-instruction ADDSUBF       #xD0)
+  (define-packed-flop-instruction HADDF         #x7C)
+  (define-packed-flop-instruction HSUBF         #x7D))
 
-(define-trivial-instruction F2XM1 #xd9 #xf0)
-(define-trivial-instruction FABS  #xd9 #xe1)
-
-(define-instruction FBLD
-  (((? source mW))
-   (BYTE (8 #xd8))
-   (ModR/M 4 source)))
-
-(define-instruction FBSTP
-  (((? target mW))
-   (BYTE (8 #xdf))
-   (ModR/M 6 target)))
-
-(define-trivial-instruction FCHS   #xd9 #xe0)
-(define-trivial-instruction FCLEX  #x9b #xdb #xe2) ; = (FWAIT) (FNCLEX)
-(define-trivial-instruction FNCLEX #xdb #xe2)
-
-(let-syntax
-    ((define-flonum-comparison
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (cadr form))
-		(digit (caddr form))
-		(opcode (cadddr form)))
-	    `(define-instruction ,mnemonic
-	       (((ST 0) (ST (? i)))
-		(BYTE (8 #xd8)
-		      (8 (+ ,opcode i))))
-
-	       (()
-		(BYTE (8 #xd8)
-		      (8 (+ ,opcode 1))))
-
-	       ((D (? source mW))
-		(BYTE (8 #xdc))
-		(ModR/M ,digit source))
-
-	       ((S (? source mW))
-		(BYTE (8 #xd8))
-		(ModR/M ,digit source))))))))
-
-  (define-flonum-comparison FCOM  2 #xd0)
-  (define-flonum-comparison FCOMP 3 #xd8))
-
-(define-trivial-instruction FCOMPP  #xde #xd9)
-(define-trivial-instruction FCOS    #xd9 #xff)
-(define-trivial-instruction FDECSTP #xd9 #xf6)
-
-(define-instruction FFREE
-  (((ST (? i)))
-   (BYTE (8 #xdd)
-	 (8 (+ #xc0 i)))))
-
-(let-syntax
-    ((define-flonum-integer-comparison
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (cadr form))
-		(digit (caddr form)))
-	    `(define-instruction ,mnemonic
-	       ((L (? source mW))
-		(BYTE (8 #xda))
-		(ModR/M ,digit source))
-
-	       ((H (? source mW))
-		(BYTE (8 #xde))
-		(ModR/M ,digit source))))))))
-
-  (define-flonum-integer-comparison FICOM  2)
-  (define-flonum-integer-comparison FICOMP 3))
+(let-syntax ((define-packed-bitwise-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (opcode (caddr form)))
+                   `(define-instruction ,mnemonic
+                      ((P
+                        (? p float-precision)
+                        (XMM (? target))
+                        (? source xmm/m-ea))
+                       (PREFIX (FLOAT 'P p) (ModR/M target source))
+                       (BITS (8 #x0F)
+                             (8 ,opcode))
+                       (ModR/M target source))))))))
+  (define-packed-bitwise-instruction ANDNF      #x55)
+  (define-packed-bitwise-instruction ANDF       #x54)
+  (define-packed-bitwise-instruction ORF        #x56)
+  (define-packed-bitwise-instruction XORF       #x57)
+  ;; Not really bitwise instruction, but these two fit the pattern.
+  (define-packed-bitwise-instruction UNPCKHF    #x15)
+  (define-packed-bitwise-instruction UNPCKLF    #x15))
+
+(define-instruction CMPF
+  (((? comparator float-comparator)
+    (? p/s float-packed/scalar)
+    (? p float-precision)
+    (XMM (? source1))
+    (? source2 xmm/m-ea))
+   (PREFIX (FLOAT p/s p) (ModR/M source1 source2))
+   (BITS (8 #x0F)
+         (8 #xC2))
+   (ModR/M source1 source2)
+   (BITS (8 comparator))))
+
+(let-syntax ((define-un/ordered-compare-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (opcode (caddr form)))
+                   `(define-instruction ,mnemonic
+                      ((D (XMM (? source1)) (? source2 xmm/m-ea))
+                       (PREFIX (ModR/M source1 source2))
+                       (BITS (8 #x66)
+                             (8 #x0F)
+                             (8 ,opcode))
+                       (ModR/M source1 source2))
+
+                      ((S (XMM (? source1)) (? source2 xmm/m-ea))
+                       (PREFIX (ModR/M source1 source2))
+                       (BITS (8 #x0F)
+                             (8 ,opcode))
+                       (ModR/M source1 source2))))))))
+  (define-un/ordered-compare-instruction COMISF #x2F)
+  (define-un/ordered-compare-instruction UCOMISF #x2E))
 
-(let-syntax
-    ((define-flonum-integer-memory
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (cadr form))
-		(digit1 (caddr form))
-		(digit2 (cadddr form)))
-	    `(define-instruction ,mnemonic
-	       ,@(if (not digit2)
-		     `()
-		     `(((Q (? source mW))
-			(BYTE (8 #xdf))
-			(ModR/M ,digit2 source))))
-
-	       ((L (? source mW))
-		(BYTE (8 #xdb))
-		(ModR/M ,digit1 source))
-
-	       ((H (? source mW))
-		(BYTE (8 #xdf))
-		(ModR/M ,digit1 source))))))))
-
-  (define-flonum-integer-memory FILD  0 5)
-  (define-flonum-integer-memory FIST  2 #f)
-  (define-flonum-integer-memory FISTP 3 7))
-
-(define-trivial-instruction FINCSTP #xd9 #xf7)
-(define-trivial-instruction FINIT   #x9b #xdb #xe3) ; = (FWAIT) (FNINT)
-(define-trivial-instruction FNINIT  #xdb #xe3)
-
-(let-syntax
-    ((define-flonum-memory
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (list-ref form 1))
-		(digit1 (list-ref form 2))
-		(digit2 (list-ref form 3))
-		(opcode1 (list-ref form 4))
-		(opcode2 (list-ref form 5)))
-	    `(define-instruction ,mnemonic
-	       (((ST (? i)))
-		(BYTE (8 ,opcode1)
-		      (8 (+ ,opcode2 i))))
-
-	       ((D (? operand mW))
-		(BYTE (8 #xdd))
-		(ModR/M ,digit1 operand))
-
-	       ((S (? operand mW))
-		(BYTE (8 #xd9))
-		(ModR/M ,digit1 operand))
-
-	       ,@(if (not digit2)
-		     `()
-		     `(((X (? operand mW))
-			(BYTE (8 #xdb))
-			(ModR/M ,digit2 operand))))))))))
-
-  (define-flonum-memory FLD  0 5  #xd9 #xc0)
-  (define-flonum-memory FST  2 #f #xdd #xd0)
-  (define-flonum-memory FSTP 3 7  #xdd #xd8))
-
-(define-trivial-instruction FLD1   #xd9 #xe8)
-(define-trivial-instruction FLDL2T #xd9 #xe9)
-(define-trivial-instruction FLDL2E #xd9 #xea)
-(define-trivial-instruction FLDPI  #xd9 #xeb)
-(define-trivial-instruction FLDLG2 #xd9 #xec)
-(define-trivial-instruction FLDLN2 #xd9 #xed)
-(define-trivial-instruction FLDZ   #xd9 #xee)
+(let-syntax ((define-conversion-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (rules-definitions (cddr form)))
+                   `(define-instruction ,mnemonic
+                      ,@(append-map
+                         (lambda (rules-definition)
+                           (if (not (eq? (car rules-definition) 'DEFINE-RULES))
+                               (error "Malformed conversion rules definition:"
+                                      rules-definition))
+                           (let ((pattern (cadr rules-definition))
+                                 (prefix-options (caddr rules-definition))
+                                 (rules (cdddr rules-definition)))
+                             (map (lambda (rule)
+                                    (let ((conversion (car rule))
+                                          (bytes
+                                           (map (lambda (byte) `(8 ,byte))
+                                                (cdr rule))))
+                                      `((,conversion ,@pattern)
+                                        (PREFIX ,@prefix-options
+                                                (ModR/M reg ea))
+                                        (BITS ,@bytes)
+                                        (ModR/M reg ea))))
+                                  rules)))
+                         rules-definitions)))))))
+
+  (define-conversion-instruction CVTF
+    (define-rules ((XMM (? reg)) (? ea xmm/m-ea))
+        ()
+      (DQ->PD   #xF3 #x0F #xE6)
+      (DQ->PS        #x0F #x5B)
+      (PD->DQ   #xF2 #x0F #xE6)
+      (PD->PS   #x66 #x0F #x5A)
+      (PS->DQ   #x66 #x0F #x5B)
+      (PS->PD        #x0F #x5A)
+      (SD->SS   #xF2 #x0F #x5A)
+      (SS->SD   #xF3 #x0F #x5A))
+
+    ;++ SIZE can be only L or Q, not W.
+    (define-rules ((? size operand-size) (R (? reg)) (? ea xmm/m-ea))
+        ((OPERAND size))
+      (SD->SI   #xF2 #x0F #x2D)
+      (SS->SI   #xF3 #x0F #x2D))
+
+    (define-rules ((? size operand-size) (XMM (? reg)) (? ea r/m-ea))
+        ((OPERAND size))
+      (SI->SD   #xF2 #x0F #x2A)
+      (SI->SS   #xF3 #x0F #x2A)))
+
+  (define-conversion-instruction CVTFT  ;Convert Truncated
+    (define-rules ((XMM (? reg)) (? ea xmm/m-ea))
+        ()
+      (PD->DQ   #x66 #x0F #xE6)
+      (PS->DQ   #xF3 #x0F #x5B))
+    (define-rules ((? size operand-size) (R (? reg)) (? ea xmm/m-ea))
+        ()
+      (SD->SI   #xF2 #x0F #x2C)
+      (SS->SI   #xF3 #x0F #x2C))))
 
-(let-syntax
-    ((define-flonum-state
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (list-ref form 1))
-		(opcode (list-ref form 2))
-		(digit (list-ref form 3))
-		(mnemonic2 (list-ref form 4)))
-	    `(begin
-	       ,@(if (not mnemonic2)
-		     `()
-		     `((define-instruction ,mnemonic2
-			 (((? source mW))
-			  (BYTE (8 #x9b) ; (FWAIT)
-				(8 ,opcode))
-			  (ModR/M ,digit source)))))
-
-	       (define-instruction ,mnemonic
-		 (((? source mW))
-		  (BYTE (8 ,opcode))
-		  (ModR/M ,digit source)))))))))
-
-  (define-flonum-state FNLDCW  #xd9 5 FLDCW)
-  (define-flonum-state FLDENV  #xd9 4 #f)
-  (define-flonum-state FNSTCW  #xd9 7 FSTCW)
-  (define-flonum-state FNSTENV #xd9 6 FSTENV)
-  (define-flonum-state FRSTOR  #xdb 4 #f)
-  (define-flonum-state FNSAVE  #xdd 6 FSAVE))
-
-(define-trivial-instruction FNOP    #xd9 #xd0)
-(define-trivial-instruction FPATAN  #xd9 #xf3)
-(define-trivial-instruction FPREM   #xd9 #xf8) ; truncating remainder
-(define-trivial-instruction FPREM1  #xd9 #xf5) ; IEEE remainder
-(define-trivial-instruction FPTAN   #xd9 #xf2)
-(define-trivial-instruction FRNDINT #xd9 #xfc)
-(define-trivial-instruction FSCALE  #xd9 #xfd)
-(define-trivial-instruction FSIN    #xd9 #xfe)
-(define-trivial-instruction FSINCOS #xd9 #xfb)
-(define-trivial-instruction FSQRT   #xd9 #xfa)
-
-(define-instruction FSTSW
-  (((? target mW))
-   (BYTE (8 #x9b)			; (FWAIT)
-	 (8 #xdf))
-   (ModR/M 7 target))
-
-  (((R 0))
-   (BYTE (8 #x9b)			; (FWAIT)
-	 (8 #xdf)
-	 (8 #xe0))))
-
-(define-instruction FNSTSW
-  (((? target mW))
-   (BYTE (8 #xdf))
-   (ModR/M 7 target))
-
-  (((R 0))
-   (BYTE (8 #xdf)
-	 (8 #xe0))))
+(define-instruction EXTRQ               ;SSE4A only
+  (((? target xmm-ea)
+    (&U (? size unsigned-5bit))
+    (&U (? position unsigned-5bit)))
+   (PREFIX (ModR/M target))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #x78))
+   (ModR/M 0 target)
+   (BITS (8 size UNSIGNED)
+         (8 position UNSIGNED)))
+
+  (((XMM (? target)) (? source xmm-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #x79))
+   (ModR/M target source)))
+
+(define-instruction INSERTQ             ;SSE4A only
+  (((XMM (? target))
+    (? source xmm-ea)
+    (&U (? size unsigned-5bit))
+    (&U (? position unsigned-5bit)))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF2)
+         (8 #x0F)
+         (8 #x78))
+   (ModR/M target source)
+   (BITS (8 size UNSIGNED)
+         (8 position UNSIGNED)))
+
+  (((XMM (? target)) (? source xmm-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF2)
+         (8 #x0F)
+         (8 #x79))
+   (ModR/M target source)))
+
+(define-instruction FXRSTOR
+  (((? source m-ea))
+   (PREFIX (ModR/M source))
+   (BITS (8 #x0F)
+         (8 #xAE))
+   (ModR/M 1 source)))
+
+(define-instruction FXSAVE
+  (((? target m-ea))
+   (PREFIX (ModR/M target))
+   (BITS (8 #x0F)
+         (8 #xAE))
+   (ModR/M 0 target)))
+
+;;; How does LDDQU differ from MOVDQU?
+
+(define-instruction LDDQU               ;Load Double Quadword Unaligned
+  (((XMM (? target)) (? source m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF2)
+         (8 #x0F)
+         (8 #xF0))
+   (ModR/M target source)))
+
+(define-instruction LDMXCSR
+  (((? source m-ea))
+   (PREFIX (ModR/M source))
+   (BITS (8 #x0F)
+         (8 #xAE))
+   (ModR/M 2 source)))
 
-(define-trivial-instruction FTST #xd9 #xe4)
-
-(let-syntax
-    ((define-binary-flonum
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((mnemonic (cadr form))
-		(opcode1 (caddr form))
-		(opcode2 (cadddr form)))
-	    `(define-instruction ,mnemonic
-	       (((ST 0) (ST (? i)))
-		(BYTE (8 ,opcode1)
-		      (8 (+ ,opcode2 i))))
-
-	       (()
-		(BYTE (8 ,opcode1)
-		      (8 (+ ,opcode2 1))))))))))
-
-  (define-binary-flonum FUCOM  #xdd #xe0)
-  (define-binary-flonum FUCOMP #xdd #xe8)
-  (define-binary-flonum FXCH   #xd9 #xc8))
-
-(define-trivial-instruction FUCOMPP #xda #xe9)
-(define-trivial-instruction FWAIT   #x9b)
-(define-trivial-instruction FXAM    #xd9 #xe5)
-(define-trivial-instruction FXTRACT #xd9 #xf4)
-(define-trivial-instruction FYL2X   #xd9 #xf1)
-(define-trivial-instruction FYL2XP1 #xd9 #xf9)
-
-|#
\ No newline at end of file
+(define-instruction MASKMOVDQU
+  (((@R 7) (XMM (? source1)) (? source2 xmm-ea))
+   (PREFIX (ModR/M source1 source2))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xF7))
+   (ModR/M source1 source2)))
+
+(define-instruction MOVAF               ;Aligned
+  ((P (? p float-precision) (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (FLOAT 'P p) (ModR/M target source))
+   (BITS (8 #x0F)
+         (8 #x28))
+   (ModR/M target source))
+
+  ((P (? p float-precision) (? target xmm/m-ea) (XMM (? source)))
+   (PREFIX (FLOAT 'P p) (ModR/M source target))
+   (BITS (8 #x0F)
+         (8 #x29))
+   (ModR/M source target)))
+
+(define-instruction MOVD
+  ;++ SIZE can be only L or Q, not W.
+  (((? size operand-size) (XMM (? target)) (? source r/m-ea))
+   (PREFIX (OPERAND size) (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #x6E))
+   (ModR/M target source))
+
+  (((? size operand-size) (? target r/m-ea) (XMM (? source)))
+   (PREFIX (OPERAND size) (ModR/M source target))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #x7E))
+   (ModR/M source target)))
+
+(define-instruction MOVDDUP
+  (((XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF2)
+         (8 #x0F)
+         (8 #x12))
+   (ModR/M target source)))
+
+(let-syntax ((define-move-dq-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (prefix (caddr form)))
+
+                   `(define-instruction ,mnemonic
+                      (((XMM (? target)) (? source xmm/m-ea))
+                       (PREFIX (ModR/M target source))
+                       (BITS (8 ,prefix)
+                             (8 #x0F)
+                             (8 #x6F))
+                       (ModR/M target source))
+
+                      (((? target xmm/m-ea) (XMM (? source)))
+                       (PREFIX (ModR/M source target))
+                       (BITS (8 ,prefix)
+                             (8 #x0F)
+                             (8 #x7F))
+                       (ModR/M source target))))))))
+  (define-move-dq-instruction MOVDQA #x66)
+  (define-move-dq-instruction MOVDQU #xF3))
+
+(let-syntax ((define-move-high/low-instructions
+              (sc-macro-transformer
+               (lambda (form environment)
+                 (let ((MOVxy (cadr form))
+                       (MOVx (caddr form))
+                       (opcode1 (cadddr form))
+                       (opcode2 (car (cddddr form))))
+
+                   `(begin
+                      (define-instruction ,MOVxy
+                        ((P S (XMM (? target)) (? source xmm-ea))
+                         (PREFIX (ModR/M target source))
+                         (BITS (8 #x0F)
+                               (8 ,opcode1))
+                         (ModR/M target source)))
+
+                      (define-instruction ,MOVx
+                        ((P D (XMM (? target)) (? source m-ea))
+                         (PREFIX (ModR/M target source))
+                         (BITS (8 #x66)
+                               (8 #x0F)
+                               (8 ,opcode2))
+                         (ModR/M target source))
+
+                        ((P D (? target m-ea) (XMM (? source)))
+                         (PREFIX (ModR/M source target))
+                         (BITS (8 #x66)
+                               (8 #x0F)
+                               (8 ,(+ opcode2 1)))
+                         (ModR/M source target))
+
+                        ((P S (XMM (? target)) (? source m-ea))
+                         (PREFIX (ModR/M target source))
+                         (BITS (8 #x0F)
+                               (8 ,opcode2))
+                         (ModR/M target source))
+
+                        ((P S (? target m-ea) (XMM (? source)))
+                         (PREFIX (ModR/M source target))
+                         (BITS (8 #x0F)
+                               (8 ,(+ opcode2 1)))
+                         (ModR/M source target)))))))))
+  ;; Note: (MOVL ...) is very different from (MOV L ...)!
+  (define-move-high/low-instructions MOVHL MOVH #x12 #x16)
+  (define-move-high/low-instructions MOVLH MOVL #x12 #x16))
+
+(define-instruction MOVMSKF
+  ((P (? p float-precision) (R (? target)) (? source xmm-ea))
+   (PREFIX (FLOAT 'P p) (ModR/M target source))
+   (BITS (8 #x0F)
+         (8 #x50))
+   (ModR/M target source)))
+
+(define-instruction MOVNTDQ
+  (((? target m-ea) (XMM (? source)))
+   (PREFIX (FLOAT 'P 'D) (ModR/M source target))
+   (BITS (8 #x0F)
+         (8 #xE7))
+   (ModR/M source target)))
+
+(define-instruction MOVNTF
+  (((? p/s float-packed/scalar)
+    (? p float-precision)
+    (? target m-ea)
+    (XMM (? source)))
+   (PREFIX (FLOAT p/s p) (ModR/M source target))
+   (BITS (8 #x0F)
+         (8 #x2B))
+   (ModR/M source target)))
+
+;; Note: (MOVQ ...) is very different from (MOV Q ...)!
+(define-instruction MOVQ
+  (((XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF3)
+         (8 #x0F)
+         (8 #x7E))
+   (ModR/M target source))
+
+  (((? target xmm/m-ea) (XMM (? source)))
+   (PREFIX (ModR/M source target))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xD6))
+   (ModR/M source target)))
+
+;;; Using the mnemonic MOVF avoids conflict with the general MOVS
+;;; instruction.
+
+(define-instruction MOVF
+  ((S (? p float-precision) (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (FLOAT 'S p) (ModR/M target source))
+   (BITS (8 #x0F)
+         (8 #x10))
+   (ModR/M target source))
+
+  ((S (? p float-precision) (? target xmm/m-ea) (XMM (? source)))
+   (PREFIX (FLOAT 'S p) (ModR/M source target))
+   (BITS (8 #x0F)
+         (8 #x11))
+   (ModR/M source target)))
+
+(let-syntax ((define-mov/dup-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (opcode (caddr form)))
+                   `(define-instruction ,mnemonic
+                      (((XMM (? target)) (? source xmm/m-ea))
+                       (PREFIX (ModR/M target source))
+                       (BITS (8 #xF3)
+                             (8 #x0F)
+                             (8 ,opcode))
+                       (ModR/M target source))))))))
+  (define-mov/dup-instruction MOVSHDUP #x16)
+  (define-mov/dup-instruction MOVSLDUP #x12))
+
+(define-instruction MOVUF               ;Unaligned
+  ((P (? p float-precision) (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (FLOAT 'P p) (ModR/M target source))
+   (BITS (8 #x0F)
+         (8 #x10))
+   (ModR/M target source))
+  ((P (? p float-precision) (? target xmm/m-ea) (XMM (? source)))
+   (PREFIX (FLOAT 'P p) (ModR/M source target))
+   (BITS (8 #x0F)
+         (8 #x11))
+   (ModR/M source target)))
+
+(let-syntax ((define-packed-sized-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (size/opcode-list (cddr form)))
+                   `(define-instruction ,mnemonic
+                      ,@(map (lambda (size/opcode)
+                               (let ((size (car size/opcode))
+                                     (opcode (cadr size/opcode)))
+                                 `((,size (XMM (? target)) (? source xmm/m-ea))
+                                   (PREFIX (ModR/M target source))
+                                   (BITS (8 #x66)
+                                         (8 #x0F)
+                                         (8 ,opcode))
+                                   (ModR/M target source))))
+                             size/opcode-list)))))))
+  (define-packed-sized-instruction PACKS (B #x63) (UB #x67) (W #x6B))
+  (define-packed-sized-instruction PADD (B #xFC) (W #xFD) (L #xFE) (Q #xD4))
+  ;++ Should PADDU be considered a separate instruction from PADDS?
+  (define-packed-sized-instruction PADDS (B #xEC) (W #xED) (UB #xDC) (UW #xDD))
+  (define-packed-sized-instruction PAVG (B #xE0) (W #xE3))
+  (define-packed-sized-instruction PCMPEQ (B #x74) (W #x75) (L #x76))
+  (define-packed-sized-instruction PCMPGT (B #x64) (W #x65) (L #x66))
+  (define-packed-sized-instruction PMAX (W #xEE) (UB #xDE))
+  (define-packed-sized-instruction PMIN (W #xEA) (UB #xDA))
+  (define-packed-sized-instruction PSUB (B #xF8) (W #xF9) (L #xFA) (Q #xFB))
+  ;++ Should PSUBSU be considered a separate instruction from PSUBS?
+  (define-packed-sized-instruction PSUBS (B #xE8) (W #xE9) (UB #xD8) (UW #xD9))
+  ;++ Should the size indicate the source or target size?  Right now
+  ;++ it indicates the source size.
+  (define-packed-sized-instruction PUNPCKH (B #x68) (W #x69) (L #x6A) (Q #x6D))
+  (define-packed-sized-instruction PUNPCKL (B #x60) (W #x61) (L #x62) (Q #x6C))
+  )
+
+(let-syntax ((define-packed-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (opcode (caddr form)))
+                   `(define-instruction ,mnemonic
+                      (((XMM (? target)) (? source xmm/m-ea))
+                       (PREFIX (ModR/M target source))
+                       (BITS (8 #x66)
+                             (8 #x0F)
+                             (8 ,opcode))
+                       (ModR/M target source))))))))
+  (define-packed-instruction PAND #xDB)
+  (define-packed-instruction PANDN #xDF)
+  (define-packed-instruction PMADDWL #xF5)
+  (define-packed-instruction PMULUDQ #xF4)
+  (define-packed-instruction POR #xEB)
+  (define-packed-instruction PSADBW #xF6)
+  (define-packed-instruction PXOR #xEF))
+
+(define-instruction PEXTR
+  ((W (R (? target)) (? source xmm-ea) (&U (? position unsigned-3bit)))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xC5))
+   (ModR/M target source)
+   (BITS (8 position UNSIGNED))))
+
+(define-instruction PINSR
+  ((W (XMM (? target)) (? source r-ea) (&U (? position unsigned-3bit)))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xC4))
+   (ModR/M target source)
+   (BITS (8 position UNSIGNED))))
+
+(define-instruction PMOVMSKB
+  (((R (? target)) (? source xmm-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xD7))
+   (ModR/M target source)))
+
+(define-instruction PMULH
+  ((W (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xE5))
+   (ModR/M target source))
+
+  ((UW (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xE4))
+   (ModR/M target source)))
+
+(define-instruction PMULL
+  ((W (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #xD5))
+   (ModR/M target source)))
+
+(define-instruction PSHUF
+  ((L (XMM (? target)) (? source xmm/m-ea) (&U (? wibblethwop unsigned-byte)))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 #x70))
+   (ModR/M target source)
+   (BITS (8 wibblethwop))))
+
+(define-instruction PSHUFH
+  ((W (XMM (? target)) (? source xmm/m-ea) (&U (? zob unsigned-byte)))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF3)
+         (8 #x0F)
+         (8 #x70))
+   (ModR/M target source)
+   (BITS (8 zob))))
+
+;;; Note: (PSHUF L ...) is very different from (PSHUFL ...)!  (The
+;;; latter must be (PSHUFL W ...) in any case.)
+
+(define-instruction PSHUFL
+  ((W (XMM (? target)) (? source xmm/m-ea) (&U (? veeblefitzer unsigned-byte)))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #xF3)
+         (8 #x0F)
+         (8 #x70))
+   (ModR/M target source)
+   (BITS (8 veeblefitzer))))
+
+(let-syntax ((define-shift-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 environment            ;ignore
+                 (let ((mnemonic (cadr form))
+                       (digit (caddr form))
+                       (dq-digit (cadddr form))
+                       (opcode (car (cddddr form)))
+                       (size/opcode-list (cddddr form)))
+                   `(define-instruction ,mnemonic
+                      ((DQ (? target xmm-ea) (&U count unsigned-byte))
+                       (PREFIX (ModR/M target))
+                       (BITS (8 #x66)
+                             (8 #x0F)
+                             (8 #x73))
+                       (ModR/M ,dq-digit target)
+                       (BITS (8 count UNSIGNED)))
+
+                      (((? size operand-size)
+                        (? target xmm-ea)
+                        (&U (? count unsigned-byte)))
+                       (PREFIX (ModR/M target))
+                       (BITS (8 #x66)
+                             (8 #x0F)
+                             (8 (case size ((Q) #x73) ((L) #x72) ((W) #x71))))
+                       (ModR/M ,digit target)
+                       (BITS (8 count UNSIGNED)))
+
+                      (((? size operand-size)
+                        (XMM (? target))
+                        (? source xmm/m-ea))
+                       (PREFIX (ModR/M target source))
+                       (BITS (8 #x66)
+                             (8 #x0F)
+                             (8 (case size
+                                  ((Q) ,(+ opcode 1))
+                                  ((L) ,(+ opcode 2))
+                                  ((W) ,(+ opcode 3)))))
+                       (ModR/M target source))))))))
+  (define-shift-instruction PSLL 6 7 #xF0)
+  (define-shift-instruction PSRL 2 3 #xD0))
+
+(define-instruction PSRA
+  ;++ This does not admit an operand size of Q.
+  (((? size operand-size) (? target xmm-ea) (&U (? count unsigned-byte)))
+   (PREFIX (ModR/M target))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 (case size ((L) #x72) ((W) #x71))))
+   (ModR/M 4 target)
+   (BITS (8 count UNSIGNED)))
+
+  (((? size operand-size) (XMM (? target)) (? source xmm/m-ea))
+   (PREFIX (ModR/M target source))
+   (BITS (8 #x66)
+         (8 #x0F)
+         (8 (case size ((L) #xE2) ((W) #xE1))))
+   (ModR/M target source)))
+
+(let-syntax ((define-reciprocal-instruction
+              (sc-macro-transformer
+               (lambda (form environment)
+                 (let ((mnemonic (cadr form))
+                       (opcode (caddr form)))
+                   `(define-instruction ,mnemonic
+                      (((? p/s float-packed/scalar)
+                        S               ;Single-precision only.
+                        (XMM (? target))
+                        (? source xmm/m-ea))
+                       (PREFIX (FLOAT p/s 'S) (ModR/M target source))
+                       (BITS (8 #x0F)
+                             (8 ,opcode))
+                       (ModR/M target source))))))))
+  (define-reciprocal-instruction RCPF #x53)
+  (define-reciprocal-instruction RSQRTF #x52))
+
+(define-instruction SHUF
+  ((P (? p float-precision)
+      (XMM (? target))
+      (? source xmm/m-ea)
+      (&U (? command unsigned-2bit)))
+   (PREFIX (FLOAT 'P p) (ModR/M target source))
+   (BITS (8 #x0F)
+         (8 #xC6))
+   (ModR/M target source)
+   (BITS (8 command UNSIGNED))))
+
+(define-instruction STMXCSR
+  (((? target m-ea))
+   (PREFIX (ModR/M target))
+   (BITS (8 #x0F)
+         (8 #xAE))
+   (ModR/M 3 target)))
diff --git a/src/compiler/machines/x86-64/insutl.scm b/src/compiler/machines/x86-64/insutl.scm
index 752816b21..fa3ab3b2a 100644
--- a/src/compiler/machines/x86-64/insutl.scm
+++ b/src/compiler/machines/x86-64/insutl.scm
@@ -42,6 +42,12 @@ USA.
    (MODE #b11)
    (R/M (register-bits r)))
 
+  ((XMM (? r))
+   (CATEGORIES XMM)
+   (REX (B r))
+   (MODE #b11)
+   (R/M (register-bits r)))
+
 ;;;; Register-indirect
 
   ((@R (? r indirect-reg))
@@ -181,8 +187,11 @@ USA.
    (R/M 5)
    (BITS (32 offset SIGNED))))
 
-(define-ea-transformer r/m-ea)
+(define-ea-transformer r-ea REGISTER)
+(define-ea-transformer xmm-ea XMM)
 (define-ea-transformer m-ea MEMORY)
+(define-ea-transformer r/m-ea REGISTER MEMORY)
+(define-ea-transformer xmm/m-ea XMM MEMORY)
 
 (define-structure (effective-address
 		   (conc-name ea/)
@@ -197,10 +206,53 @@ USA.
 (declare (integrate-operator register-rex))
 (define-integrable (register-rex register rex)
   (declare (integrate register))
-  (if (>= register r8)
+  (if (>= register 8)
       rex
       0))
 
+(define (cons-ModR/M digit ea tail)
+  (cons-syntax (ea/register ea)
+    (cons-syntax digit
+      (cons-syntax (ea/mode ea)
+	(append-syntax! (ea/extra ea) tail)))))
+
+(declare (integrate-operator opcode-register))
+(define (opcode-register opcode register)
+  (declare (integrate opcode))
+  (+ opcode (if (>= register 8) (- register 8) register)))
+
+(declare (integrate-operator float-comparator))
+(define (float-comparator comparator)
+  (case comparator
+    ((=) 0)
+    ((<) 1)
+    ((<=) 2)
+    ((UNORDERED) 3)
+    ((/=) 4)
+    ((>=) 5)
+    ((>) 6)
+    ((ORDERED) 7)
+    (else (error "Bad float comparator:" comparator))))
+
+(declare (integrate-operator operand-size))
+(define (operand-size s)
+  ;; B must be handled separately in general.
+  (case s
+    ((W L Q) s)
+    (else #f)))
+
+(declare (integrate-operator float-packed/scalar))
+(define (float-packed/scalar s)
+  (case s
+    ((S P) s)
+    (else #f)))
+
+(declare (integrate-operator float-precision))
+(define (float-precision s)
+  (case s
+    ((D S) s)
+    (else #f)))
+
 (define (cons-prefix operand-size register ea tail)
   (let ((tail
 	 (if (eq? operand-size 'W)
@@ -222,28 +274,40 @@ USA.
 	(else (error "Invalid operand size:" operand-size)))
       (let ((extended-register?
 	     (or (eqv? register #t)
-		 (and register (>= register r8)))))
+		 (and register (>= register 8)))))
 	(if ea
 	    (fix:or (if extended-register? #x44 0) (ea/rex-prefix ea))
 	    (if extended-register? #x41 0)))))))
 
-(define (cons-ModR/M digit ea tail)
-  (cons-syntax (ea/register ea)
-    (cons-syntax digit
-      (cons-syntax (ea/mode ea)
-	(append-syntax! (ea/extra ea) tail)))))
-
-(declare (integrate-operator opcode-register))
-(define (opcode-register opcode register)
-  (declare (integrate opcode))
-  (+ opcode (if (>= register r8) (- register r8) register)))
+;;; The SSE instructions don't consistently use this pattern for their
+;;; opcodes, but enough of them do that this approximate abstraction
+;;; helps to clarify the instruction syntax.
+
+(define (cons-float-prefix register ea packed/scalar precision tail)
+  (let* ((tail
+	  (let ((float (list packed/scalar precision)))
+	    (if (equal? float '(P S))
+		tail
+		(cons
+		 (syntax-evaluation
+		  (cond ((equal? float '(P D)) #x66)
+			((equal? float '(S D)) #xF2)
+			((equal? float '(S S)) #xF3)
+			(else (error "Bad float type:" float)))
+		  coerce-8-bit-unsigned)
+		 tail))))
+	 (rex-prefix
+	  (let ((extended-register?
+		 (or (eqv? register #t)
+		     (and register (>= register 8)))))
+	    (if ea
+		(fix:or (if extended-register? #x44 0) (ea/rex-prefix ea))
+		(if extended-register? #x41 0)))))
+    (if (zero? rex-prefix)
+	tail
+	(cons-syntax (syntax-evaluation rex-prefix coerce-8-bit-unsigned)
+		     tail))))
 
-(define (operand-size s)
-  ;; B must be handled separately in general.
-  (case s
-    ((W L Q) s)
-    (else #f)))
-
 (define-integrable (register-bits r)
   (fix:and r #b111))
 
@@ -297,6 +361,18 @@ USA.
     ((8) #b11)
     (else false)))
 
+(declare (integrate-operator unsigned-2bit))
+(define (unsigned-2bit value)
+  (and (<= 0 value #b11) value))
+
+(declare (integrate-operator unsigned-3bit))
+(define (unsigned-3bit value)
+  (and (<= 0 value #b111) value))
+
+(declare (integrate-operator unsigned-5bit))
+(define (unsigned-5bit value)
+  (and (<= 0 value #b11111) value))
+
 (define (signed-byte value)
   (and (fits-in-signed-byte? value)
        value))
@@ -328,7 +404,7 @@ USA.
 (define (unsigned-quad value)
   (and (fits-in-unsigned-quad? value)
        value))
-
+
 (define (sign-extended-byte value)
   (and (fits-in-signed-byte? value)
        value))
diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm
index a28795447..55f9f85c1 100644
--- a/src/compiler/machines/x86-64/lapgen.scm
+++ b/src/compiler/machines/x86-64/lapgen.scm
@@ -35,48 +35,20 @@ USA.
   ;; rbp holds the pointer mask
   ;; rsi holds the register array pointer
   ;; rdi holds the free pointer
-  ;++ float
-  ;; fr7 is not used so that we can always push on the stack once.
   (list rax rcx rdx rbx r8 r9 r10 r11 r12 r13 r14 r15
-	;++ float
-	;; fr0 fr1 fr2 fr3 fr4 fr5 fr6
-	;; mmx0 mmx1 mmx2 mmx3 mmx4 mmx5 mmx6 mmx7
-	;; xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7
-	;; xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15
-	))
+	xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7
+	xmm8 xmm9 xmm10 xmm11 xmm12 xmm13 xmm14 xmm15))
 
 (define (sort-machine-registers registers)
   registers)
 
-;++ float
-
-#;
-(define (sort-machine-registers registers)
-  ;; FR0 is preferable to other FPU regs.  We promote it to the front
-  ;; if we find another FPU reg in front of it.
-  (let loop ((regs registers))
-    (cond ((null? regs) registers)	; no float regs at all
-	  ((general-register? (car regs)); ignore general regs
-	   (loop (cdr regs)))
-	  ((= (car regs) fr0)		; found FR0 first
-	   registers)
-	  ((memq fr0 regs)		; FR0 not first, is it present?
-	   (cons fr0 (delq fr0 registers)) ; move to front
-	   registers)
-	  (else				; FR0 absent
-	   registers))))
-
 (define (register-type register)
   (cond ((machine-register? register)
 	 (vector-ref
 	  '#(GENERAL GENERAL GENERAL GENERAL GENERAL GENERAL GENERAL GENERAL
 	     GENERAL GENERAL GENERAL GENERAL GENERAL GENERAL GENERAL GENERAL
-	     ;++ float
-	     ;; FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT   ;x87 fp
-	     ;; FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT   ;MMX 64bit
-	     ;; MEDIA MEDIA MEDIA MEDIA MEDIA MEDIA MEDIA MEDIA   ;XMM 128bit
-	     ;; MEDIA MEDIA MEDIA MEDIA MEDIA MEDIA MEDIA MEDIA
-	     )
+	     FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT
+	     FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT FLOAT)
 	  register))
 	((register-value-class=word? register)
 	 'GENERAL)
@@ -87,19 +59,14 @@ USA.
 
 (define register-reference
   (let ((references (make-vector number-of-machine-registers)))
-    (do ((i rax (+ i 1)))
-	((> i r15))
-      (vector-set! references i (INST-EA (R ,i))))
-    ;++ float
-    ;; (do ((i fr0 (+ i 1)))
-    ;; 	((>= i fr7))
-    ;;   (vector-set! references i (INST-EA (ST ,(floreg->sti i)))))
-    ;; (do ((i mmx0 (+ i 1)))
-    ;; 	((>= i mmx7))
-    ;;   (vector-set! references i (INST-EA (MMX ...))))
-    ;; (do ((i xmm0 (+ i 1)))
-    ;; 	((>= i xmm15))
-    ;;   (vector-set! references i (INST-EA (XMM ...))))
+    (do ((r rax (+ r 1))
+	 (i 0 (+ i 1)))
+	((> r r15))
+      (vector-set! references r (INST-EA (R ,i))))
+    (do ((r xmm0 (+ r 1))
+	 (i 0 (+ i 1)))
+	((> r xmm15))
+      (vector-set! references r (INST-EA (XMM ,i))))
     (lambda (register)
       (vector-ref references register))))
 
@@ -109,10 +76,9 @@ USA.
 (define (reference->register-transfer source target)
   (cond ((equal? (register-reference target) source)
 	 (LAP))
-	;++ float
 	((float-register-reference? source)
 	 ;; Assume target is a float register
-	 (LAP (FLD ,source)))
+	 (LAP (MOVF S D ,(register-reference target) ,source)))
 	(else
 	 (memory->machine-register source target))))
 
@@ -126,14 +92,9 @@ USA.
 (define (register->home-transfer source target)
   (machine->pseudo-register source target))
 
-;++ float
-
 (define-integrable (float-register-reference? ea)
-  ea
-  #f
-  #;
   (and (pair? ea)
-       (eq? (car ea) 'ST)))
+       (eq? (car ea) 'XMM)))
 
 ;;;; Linearizer interface
 
@@ -162,34 +123,22 @@ USA.
 
 ;;;; Utilities for the register allocator interface
 
-(define-integrable (machine->machine-register source target)
+(define (generate-move register source-ref target-ref)
+  (if (float-register? register)
+      (LAP (MOVF S D ,target-ref ,source-ref))
+      (LAP (MOV Q ,target-ref ,source-ref))))
+
+(define (machine->machine-register source target)
   (guarantee-registers-compatible source target)
-  ;++ float
-  (if (not (float-register? source))
-      (LAP (MOV Q ,(register-reference target) ,(register-reference source)))
-      (let ((ssti (floreg->sti source))
-	    (tsti (floreg->sti target)))
-	(if (zero? ssti)
-	    (LAP (FST (ST ,tsti)))
-	    (LAP (FLD (ST ,ssti))
-		 (FSTP (ST ,(1+ tsti))))))))
+  (generate-move source
+		 (register-reference source)
+		 (register-reference target)))
 
 (define (machine-register->memory source target)
-  ;++ float
-  (if (not (float-register? source))
-      (LAP (MOV Q ,target ,(register-reference source)))
-      (let ((ssti (floreg->sti source)))
-	(if (zero? ssti)
-	    (LAP (FST D ,target))
-	    (LAP (FLD (ST ,ssti))
-		 (FSTP D ,target))))))
+  (generate-move source (register-reference source) target))
 
 (define (memory->machine-register source target)
-  ;++ float
-  (if (not (float-register? target))
-      (LAP (MOV Q ,(register-reference target) ,source))
-      (LAP (FLD D ,source)
-	   (FSTP (ST ,(1+ (floreg->sti target)))))))
+  (generate-move target source (register-reference target)))
 
 (define-integrable (offset-referenceable? offset)
   (byte-offset-referenceable? (* address-units-per-object offset)))
@@ -213,7 +162,7 @@ USA.
       (error "Negative unsigned offset:" offset))
   ;; We don't have unsigned addressing modes.
   (byte-offset-reference register offset))
-
+
 ;;; This returns an offset in objects, not bytes.
 
 (define-integrable (pseudo-register-offset register)
@@ -226,29 +175,11 @@ USA.
 (define-integrable (machine->pseudo-register source target)
   (machine-register->memory source (pseudo-register-home target)))
 
-;++ float
-
-(define (general-register? register)
-  register
-  #t)
-
-(define (float-register? register)
-  register
-  #f)
-
-(define (floreg->sti reg)
-  (error "x87 floating-point not supported:" `(FLOREG->STI ,reg)))
-
-#|
-(define-integrable (floreg->sti reg)
-  (- reg fr0))
-
 (define-integrable (general-register? register)
-  (< register fr0))
+  (< register xmm0))
 
 (define-integrable (float-register? register)
-  (<= fr0 register fr7))
-|#
+  (>= register xmm0))
 
 ;;;; Utilities for the rules
 
@@ -271,6 +202,10 @@ USA.
 	(flush-register! machine-reg)
 	(add-pseudo-register-alias! rtl-reg machine-reg))))
 
+;;; OBJECT->MACHINE-REGISTER! takes only general registers, not float
+;;; registers.  Otherwise, (INST-EA (R ,mreg)) would need to be
+;;; (register-reference mreg).
+
 (define (object->machine-register! object mreg)
   ;; This ordering allows LOAD-CONSTANT to use MREG as a temporary.
   (let ((code (load-constant (INST-EA (R ,mreg)) object)))
@@ -391,7 +326,7 @@ USA.
     temp				;ignore
     (LAP ,@prefix
 	 ,@(receiver operand))))
-
+
 ;;; SIGNED-IMMEDIATE-OPERAND and UNSIGNED-IMMEDIATE-OPERAND abstract
 ;;; the pattern of performing an operation with an instruction that
 ;;; takes an immediate operand of 32 bits, but using a value that may
@@ -410,6 +345,9 @@ USA.
     (cond ((fits-in-signed-long? value)
 	   (values #f (LAP) operand))
 	  ((fits-in-signed-quad? value)
+	   ;; (values #f
+	   ;;         (LAP)
+	   ;;         (INST-EA (@PCR ,(allocate-signed-quad-label value))))
 	   (let ((temp (temporary-reference)))
 	     (values temp (LAP (MOV Q ,temp ,operand)) temp)))
 	  (else
@@ -420,10 +358,39 @@ USA.
     (cond ((fits-in-unsigned-long? value)
 	   (values #f (LAP) operand))
 	  ((fits-in-unsigned-quad? value)
+	   ;; (values #f
+	   ;;         (LAP)
+	   ;;         (INST-EA (@PCR ,(allocate-unsigned-quad-label value))))
 	   (let ((temp (temporary-reference)))
 	     (values temp (LAP (MOV Q ,temp ,operand)) temp)))
 	  (else
 	   (error "Unsigned immediate value too large:" value)))))
+
+(define (allocate-data-label datum block-name offset alignment data)
+  (let* ((block
+	  (or (find-extra-code-block block-name)
+	      (let ((block
+		     (declare-extra-code-block! block-name 'ANYWHERE '())))
+		(add-extra-code!
+		 block
+		 (LAP (PADDING ,offset ,alignment ,padding-string)))
+		block)))
+	 (pairs (extra-code-block/xtra block))
+	 (place (assoc datum pairs)))
+    (if place
+	(cdr place)
+	(let ((label (generate-label block-name)))
+	  (set-extra-code-block/xtra!
+	   block
+	   (cons (cons datum label) pairs))
+	  (add-extra-code! block (LAP (LABEL ,label) ,@data))
+	  label))))
+
+(define (allocate-unsigned-quad-label quad)
+  (allocate-data-label quad 'QUADS 0 8 (LAP (QUAD U ,quad))))
+
+(define (allocate-signed-quad-label quad)
+  (allocate-data-label quad 'QUADS 0 8 (LAP (QUAD S ,quad))))
 
 (define (target-register target)
   (delete-dead-registers!)
@@ -461,6 +428,48 @@ USA.
 
 (define-integrable (allocate-indirection-register! register)
   (load-alias-register! register 'GENERAL))
+
+(define (binary-register-operation operate commutative? type move
+				   target source1 source2)
+  (let* ((worst-case
+	  (lambda (target source1 source2)
+	    (LAP ,@(move target source1)
+		 ,@(operate target source2))))
+	 (new-target-alias!
+	  (lambda ()
+	    (let ((source1 (standard-register-reference source1 type #f))
+		  (source2 (standard-register-reference source2 type #f)))
+	      (delete-dead-registers!)
+	      (worst-case
+	       (register-reference
+		(or (register-alias target type)
+		    (allocate-alias-register! target type)))
+	       source1
+	       source2)))))
+    (cond ((not (pseudo-register? target))
+	   (if (not (eq? (register-type target) type))
+	       (error "binary-register-operation: Wrong type register"
+		      target
+		      type)
+	       (worst-case (register-reference target)
+			   (standard-register-reference source1 type #f)
+			   (standard-register-reference source2 type #f))))
+	  ((register-copy-if-available source1 type target)
+	   => (lambda (get-alias-ref)
+		(if (= source2 source1)
+		    (let ((ref (get-alias-ref)))
+		      (operate ref ref))
+		    (let ((source2
+			   (standard-register-reference source2 type #f)))
+		      (operate (get-alias-ref) source2)))))
+	  ((not commutative?)
+	   (new-target-alias!))
+	  ((register-copy-if-available source2 type target)
+	   => (lambda (get-alias-ref)
+		(let ((source1 (standard-register-reference source1 type #f)))
+		  (operate (get-alias-ref) source1))))
+	  (else
+	   (new-target-alias!)))))
 
 (define (with-indexed-address base* index* scale b-offset protect recvr)
   (let* ((base (allocate-indirection-register! base*))
diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm
index c28e8c63f..f6aa1e57d 100644
--- a/src/compiler/machines/x86-64/machin.scm
+++ b/src/compiler/machines/x86-64/machin.scm
@@ -169,48 +169,26 @@ USA.
 (define r14 14)
 (define r15 15)
 
-;;; x87 floating-point stack locations, allocated as if registers.
-
-(define fr0 16)
-(define fr1 17)
-(define fr2 18)
-(define fr3 19)
-(define fr4 20)
-(define fr5 21)
-(define fr6 22)
-(define fr7 23)
-
-;;; 64-bit media registers (deprecated).
-
-(define mmx0 24)
-(define mmx1 25)
-(define mmx2 26)
-(define mmx3 27)
-(define mmx4 28)
-(define mmx5 29)
-(define mmx6 30)
-(define mmx7 31)
-
 ;;; 128-bit media registers.
 
-(define xmm0 32)
-(define xmm1 33)
-(define xmm2 34)
-(define xmm3 35)
-(define xmm4 36)
-(define xmm5 37)
-(define xmm6 38)
-(define xmm7 39)
-(define xmm8 40)
-(define xmm9 41)
-(define xmm10 42)
-(define xmm11 43)
-(define xmm12 44)
-(define xmm13 45)
-(define xmm14 46)
-(define xmm15 47)
+(define xmm0 16)
+(define xmm1 17)
+(define xmm2 18)
+(define xmm3 19)
+(define xmm4 20)
+(define xmm5 21)
+(define xmm6 22)
+(define xmm7 23)
+(define xmm8 24)
+(define xmm9 25)
+(define xmm10 26)
+(define xmm11 27)
+(define xmm12 28)
+(define xmm13 29)
+(define xmm14 30)
+(define xmm15 31)
 
-(define number-of-machine-registers 16)
+(define number-of-machine-registers 32)
 (define number-of-temporary-registers 256)
 
 (define-integrable regnum:stack-pointer rsp)
@@ -233,12 +211,8 @@ USA.
 	 value-class=address)
 	((<= r8 register r15)
 	 value-class=object)
-	((<= fr0 register fr7)
-	 value-class=float)
-	((<= mmx0 register mmx7)
-	 (error "MMX media registers not allocated:" register))
 	((<= xmm0 register xmm15)
-	 (error "XMM media registers not allocated:" register))
+	 value-class=float)
 	(else
 	 (error "Invalid machine register:" register))))
 
@@ -392,64 +366,54 @@ USA.
       (error "Unknown register type" locative)))
 
 (define (rtl:constant-cost expression)
-  ;; i486 clock count for instruction to construct/fetch into register.
-  (let ((if-integer
-	 (lambda (value)
-	   value			; ignored
-	   ;; Can this be done in fewer bytes for suitably small values?
-	   1))				; MOV immediate
-	(get-pc-cost
-	 (+ 3				; CALL
-	    4))				; POP
-	(based-reference-cost
-	 1)				; MOV r/m
-	(address-offset-cost
-	 1))				; LEA instruction
-
-    (define (if-synthesized-constant type datum)
-      (if-integer (make-non-pointer-literal type datum)))
-
+  ;; Counts derived from the AMD64 Software Optimization Guide, Rev
+  ;; 3.06, from September 2005.  Scaled by two because LEA costs 1/2!
+  ;; This is pretty silly, but probably better than using i486 clock
+  ;; counts.
+  (let ((cost:lea 1)
+	(cost:mov-mem 6)
+	(cost:mov-imm 2)
+	(cost:or 2))
     (case (rtl:expression-type expression)
       ((CONSTANT)
        (let ((value (rtl:constant-value expression)))
 	 (if (non-pointer-object? value)
-	     (if-synthesized-constant (object-type value)
-				      (careful-object-datum value))
-	     (+ get-pc-cost based-reference-cost))))
+	     cost:mov-imm
+	     cost:mov-mem)))
       ((MACHINE-CONSTANT)
-       (if-integer (rtl:machine-constant-value expression)))
-      ((ENTRY:PROCEDURE
-	ENTRY:CONTINUATION)
-       (+ get-pc-cost address-offset-cost))
-      ((ASSIGNMENT-CACHE
-	VARIABLE-CACHE)
-       (+ get-pc-cost based-reference-cost))
-      ((OFFSET-ADDRESS
-	BYTE-OFFSET-ADDRESS
-	FLOAT-OFFSET-ADDRESS)
-       address-offset-cost)
+       cost:mov-imm)
+      ((ENTRY:PROCEDURE ENTRY:CONTINUATION)
+       (+ cost:mov-imm cost:lea cost:or))
+      ((OFFSET-ADDRESS BYTE-OFFSET-ADDRESS FLOAT-OFFSET-ADDRESS)
+       (receive (offset-selector scale)
+	   (case (rtl:expression-type expression)
+	     ((OFFSET-ADDRESS)
+	      (values rtl:offset-address-offset address-units-per-object))
+	     ((BYTE-OFFSET-ADDRESS)
+	      (values rtl:byte-offset-address-offset 1))
+	     ((FLOAT-OFFSET-ADDRESS)
+	      (values rtl:float-offset-address-offset
+		      address-units-per-float)))
+	 (let ((offset (offset-selector expression)))
+	   (if (and (rtl:machine-constant? offset)
+		    (not
+		     (fits-in-signed-long?
+		      (* scale (rtl:machine-constant-value offset)))))
+	       (+ cost:mov-imm cost:lea)
+	       cost:lea))))
       ((CONS-POINTER)
        (and (rtl:machine-constant? (rtl:cons-pointer-type expression))
 	    (rtl:machine-constant? (rtl:cons-pointer-datum expression))
-	    (if-synthesized-constant
-	     (rtl:machine-constant-value (rtl:cons-pointer-type expression))
-	     (rtl:machine-constant-value
-	      (rtl:cons-pointer-datum expression)))))
-      (else
-       false))))
+	    cost:mov-imm))
+      (else #f))))
 
 (define compiler:open-code-floating-point-arithmetic?
-  false)
+  #t)
 
 (define compiler:primitives-with-no-open-coding
   '(DIVIDE-FIXNUM
     &/
-    FLOATING-VECTOR-CONS FLOATING-VECTOR-LENGTH
-    FLOATING-VECTOR-REF FLOATING-VECTOR-SET! FLONUM-ABS
-    FLONUM-ACOS FLONUM-ADD FLONUM-ASIN FLONUM-ATAN FLONUM-ATAN2
-    FLONUM-CEILING FLONUM-COS FLONUM-DIVIDE FLONUM-EQUAL?
-    FLONUM-EXP FLONUM-FLOOR FLONUM-GREATER? FLONUM-LESS?
-    FLONUM-LOG FLONUM-MULTIPLY FLONUM-NEGATE FLONUM-NEGATIVE?
-    FLONUM-POSITIVE? FLONUM-ROUND FLONUM-SIN FLONUM-SQRT
-    FLONUM-SUBTRACT FLONUM-TAN FLONUM-TRUNCATE FLONUM-ZERO?
-    FLONUM? GCD-FIXNUM STRING-ALLOCATE VECTOR-CONS))
\ No newline at end of file
+    FLOATING-VECTOR-CONS FLONUM-ACOS FLONUM-ASIN FLONUM-ATAN
+    FLONUM-ATAN2 FLONUM-CEILING FLONUM-COS FLONUM-EXP FLONUM-FLOOR
+    FLONUM-LOG FLONUM-ROUND FLONUM-SIN FLONUM-TAN FLONUM-TRUNCATE
+    GCD-FIXNUM STRING-ALLOCATE VECTOR-CONS))
\ No newline at end of file
diff --git a/src/compiler/machines/x86-64/rules1.scm b/src/compiler/machines/x86-64/rules1.scm
index d65fa052e..d3b265155 100644
--- a/src/compiler/machines/x86-64/rules1.scm
+++ b/src/compiler/machines/x86-64/rules1.scm
@@ -372,6 +372,7 @@ USA.
 			       ,(if signed?
 				    (INST-EA (& ,n))
 				    (INST-EA (&U ,n))))
+			  ;++ Check that SCALE is a valid SIB scale.
 			  (LEA Q ,target (@RI ,source ,temp ,scale)))))))))))
 
 (define-integrable (load-displaced-register target source n scale)
@@ -394,28 +395,64 @@ USA.
 
 (define (load-pc-relative-address/typed target type label)
   ;++ This is pretty horrid, especially since it happens for every
-  ;++ continuation pushed!  Neither alternative is much good.
-  ;; Twenty bytes.
+  ;++ continuation pushed!  None of the alternatives is much good.
+  ;; Twenty bytes, but only three instructions and no extra memory.
   (let ((temp (temporary-register-reference)))
     (LAP (MOV Q ,temp (&U ,(make-non-pointer-literal type 0)))
 	 (LEA Q ,target (@PCR ,label))
 	 (OR Q ,target ,temp)))
   #|
-  ;; Nineteen bytes.
+  ;; Nineteen bytes, but rather complicated (and needs syntax for an
+  ;; addressing mode not presently supported).
+  (cond ((zero? type)
+	 (LAP (LEA Q ,target (@PCR ,label))))
+	((zero? (remainder type 8))
+	 (receive (type-divisor scale scale-log)
+	     (cond ((not (zero? (remainder type #x10))) (values 8 8 3))
+		   ((not (zero? (remainder type #x20))) (values #x10 4 2))
+		   ((not (zero? (remainder type #x40))) (values #x20 2 1))
+		   (else (error "Type too large:" type)))
+	   (let ((offset (quotient type type-divisor)))
+	     (LAP (LEA Q ,target (@PCR ,label))
+		  (LEA Q ,target (@OI ,offset ,target ,scale))
+		  (ROR Q ,target (&U ,scale-log))))))
+	(else ...))
+  |#
+  #|
+  ;; This would be brilliant, except that it needs (PC * 2^6)-relative
+  ;; addressing, rather than PC-relative addressing.
+  (let* ((reference-point (generate-label 'PC))
+	 (offset
+	  `(+ ,type
+	      (* ,(expt 2 scheme-type-width) (- ,label ,reference-point)))))
+    (LAP (LEA Q ,target (@PCO ,offset))
+	 (LABEL ,reference-point)
+	 (ROR Q ,target (&U ,scheme-type-width))))
+  |#
+  #|
+  ;; Nineteen bytes and no temporaries, but four instructions.
   (LAP (LEA Q ,target (@PCR ,label))
        (SHL Q ,target (&U ,scheme-type-width))
        (OR Q ,target (&U ,type))
        (ROR Q ,target (&U ,scheme-type-width)))
   |#
-  ;++ This doesn't work because CONSTANT->LABEL will give us a label
-  ;++ for the Scheme number object, not for the machine bit string.
   #|
-  ;; Seventeen bytes -- but we need the label to work.
+  ;; Seventeen bytes, but this requires reading eight bytes of memory.
   (let ((temp (temporary-register-reference))
 	(literal (make-non-pointer-literal type 0)))
-    (LAP (MOV Q ,temp (@PCR ,(constant->label literal)))
+    (LAP (MOV Q ,temp (@PCR ,(allocate-unsigned-quad-label literal)))
 	 (LEA Q ,target (@PCR ,label))
 	 (OR Q ,target ,temp)))
+  |#
+  #|
+  ;; Fourteen bytes and no temporaries, but this requires reading
+  ;; eight bytes of memory.
+  (let* ((reference-point (generate-label 'REFERENCE-POINT))
+	 (expression
+	  `(+ ,(make-non-pointer-literal type 0) (- ,label ,reference-point))))
+    (LAP (LABEL ,reference-point)
+	 (LEA Q ,target (@PCR ,reference-point))
+	 (ADD Q ,target (@PCR ,(allocate-unsigned-quad-label expression)))))
   |#)
 
 (define (load-char-into-register type source target)
@@ -464,17 +501,18 @@ USA.
 				   (rtl:offset-address-offset base))))
        expression))
 
-(define (with-decoded-detagged-offset expression recvr)
+(define (with-decoded-detagged-offset expression receiver)
   (let ((base (rtl:offset-base expression)))
     (let ((base* (rtl:offset-address-base base))
 	  (index (rtl:offset-address-offset base)))
-      (recvr (rtl:register-number (if (rtl:register? base*)
-				      base*
-				      (rtl:object->address-expression base*)))
-	     (rtl:register-number (if (rtl:register? index)
-				      index
-				      (rtl:object->datum-expression index)))
-	     (rtl:machine-constant-value (rtl:offset-offset expression))))))
+      (receiver
+       (rtl:register-number (if (rtl:register? base*)
+				base*
+				(rtl:object->address-expression base*)))
+       (rtl:register-number (if (rtl:register? index)
+				index
+				(rtl:object->datum-expression index)))
+       (rtl:machine-constant-value (rtl:offset-offset expression))))))
 
 ;;;; Improved string references
 
diff --git a/src/compiler/machines/x86-64/rulfix.scm b/src/compiler/machines/x86-64/rulfix.scm
index 9ca807e42..863d17d7b 100644
--- a/src/compiler/machines/x86-64/rulfix.scm
+++ b/src/compiler/machines/x86-64/rulfix.scm
@@ -316,54 +316,14 @@ USA.
 		   FIXNUM-AND
 		   FIXNUM-OR
 		   FIXNUM-XOR)))
-	     
+	     
 (define ((fixnum-2-args/standard commutative? operate) target source1
 						       source2 overflow?)
   overflow?				; ignored
-  (two-arg-register-operation operate
-			      commutative?
-			      target
-			      source1
-			      source2))
-
-(define (two-arg-register-operation operate commutative?
-				    target source1 source2)
-  (let* ((worst-case
-	  (lambda (target source1 source2)
-	    (LAP (MOV Q ,target ,source1)
-		 ,@(operate target source2))))
-	 (new-target-alias!
-	  (lambda ()
-	    (let ((source1 (any-reference source1))
-		  (source2 (any-reference source2)))
-	      (delete-dead-registers!)
-	      (worst-case (target-register-reference target)
-			  source1
-			  source2)))))
-    (cond ((not (pseudo-register? target))
-	   (if (not (eq? (register-type target) 'GENERAL))
-	       (error "two-arg-register-operation: Wrong type register"
-		      target 'GENERAL)
-	       (worst-case (register-reference target)
-			   (any-reference source1)
-			   (any-reference source2))))
-	  ((register-copy-if-available source1 'GENERAL target)
-	   =>
-	   (lambda (get-alias-ref)
-	     (if (= source2 source1)
-		 (let ((ref (get-alias-ref)))
-		   (operate ref ref))
-		 (let ((source2 (any-reference source2)))
-		   (operate (get-alias-ref) source2)))))
-	  ((not commutative?)
-	   (new-target-alias!))
-	  ((register-copy-if-available source2 'GENERAL target)
-	   =>
-	   (lambda (get-alias-ref)
-	     (let ((source1 (any-reference source1)))
-	       (operate (get-alias-ref) source1))))
-	  (else
-	   (new-target-alias!)))))
+  (binary-register-operation operate commutative? 'GENERAL
+			     (lambda (target source)
+			       (LAP (MOV Q ,target ,source)))
+			     target source1 source2))
 
 (define (fixnum-2-args/register*constant operator target
 					 source constant overflow?)
@@ -508,11 +468,10 @@ USA.
     (lambda (target source1 source2 overflow?)
       overflow?				; ignored
       (require-register! rcx)
-      (two-arg-register-operation operate
-				  #f
-				  target
-				  source1
-				  source2))))
+      (binary-register-operation operate #f 'GENERAL
+				 (lambda (target source)
+				   (LAP (MOV Q ,target ,source)))
+				 target source1 source2))))
 
 (define (do-division target source1 source2 result-reg)
   (prefix-instructions! (load-machine-register! source1 rax))
diff --git a/src/compiler/machines/x86-64/rulflo.scm b/src/compiler/machines/x86-64/rulflo.scm
index 885624014..e5ca59fb8 100644
--- a/src/compiler/machines/x86-64/rulflo.scm
+++ b/src/compiler/machines/x86-64/rulflo.scm
@@ -27,805 +27,369 @@ USA.
 ;;; package: (compiler lap-syntaxer)
 
 (declare (usual-integrations))
-
-#|
 
-;; ****
-;; Missing: 2 argument operations and predicates with non-trivial
-;; constant arguments.
-;; Also missing with (OBJECT->FLOAT (REGISTER ...)) operands.
-;; ****
+(define (flonum-source! source)
+  (or (register-alias source 'FLOAT)
+      (load-alias-register! source 'FLOAT)))
 
-(define (flonum-source! register)
-  (floreg->sti (load-alias-register! register 'FLOAT)))
+(define-integrable (flonum-source-reference! source)
+  (register-reference (flonum-source! source)))
 
-(define (flonum-target! pseudo-register)
+(define (flonum-target! target)
   (delete-dead-registers!)
-  (floreg->sti (allocate-alias-register! pseudo-register 'FLOAT)))
+  (or (register-alias target 'FLOAT)
+      (allocate-alias-register! target 'FLOAT)))
+
+(define-integrable (flonum-target-reference! target)
+  (register-reference (flonum-target! target)))
+
+;;; FLONUM-DATA-OFFSET is the number of bytes after the location
+;;; addressed by a FLONUM-tagged pointer before the actual flonum data
+;;; begin.
 
-(define (flonum-temporary!)
-  (allocate-temporary-register! 'FLOAT))
+(define-integrable flonum-data-offset address-units-per-object)
 
 (define-rule statement
-  ;; convert a floating-point number to a flonum object
-  (ASSIGN (REGISTER (? target))
-	  (FLOAT->OBJECT (REGISTER (? source))))
-  (let* ((source (register-alias source 'FLOAT))
-	 (target (target-register-reference target)))
-    (LAP (MOV W (@R ,regnum:free-pointer)
-	      (&U ,(make-non-pointer-literal
-		    (ucode-type manifest-nm-vector)
-		    2)))
-	 ,@(if (not source)
-	       ;; Value is in memory home
-	       (let ((off (pseudo-register-offset source))
-		     (temp (temporary-register-reference)))
-		 (LAP (MOV W ,target
-			   ,(offset-reference regnum:regs-pointer off))
-		      (MOV W ,temp
-			   ,(offset-reference regnum:regs-pointer (1+ off)))
-		      (MOV W (@RO B ,regnum:free-pointer 4) ,target)
-		      (MOV W (@RO B ,regnum:free-pointer 8) ,temp)))
-	       (store-float (floreg->sti source)
-			    (INST-EA (@RO B ,regnum:free-pointer 4))))
-	 (LEA ,target
-	      (@RO UW ,regnum:free-pointer
-		   ,(make-non-pointer-literal (ucode-type flonum) 0)))
-	 (ADD W (R ,regnum:free-pointer) (& 12)))))
-
-#|
+  (ASSIGN (REGISTER (? target)) (FLOAT->OBJECT (REGISTER (? source))))
+  (let* ((source (flonum-source-reference! source))
+         (target (target-register-reference target)))
+    (LAP ,@(with-unsigned-immediate-operand
+               (make-non-pointer-literal (ucode-type MANIFEST-NM-VECTOR) 1)
+             (lambda (operand)
+               (LAP (MOV Q (@R ,regnum:free-pointer) ,operand))))
+         (MOVF S D (@RO ,regnum:free-pointer ,flonum-data-offset) ,source)
+         (MOV Q ,target (&U ,(make-non-pointer-literal (ucode-type FLONUM) 0)))
+         (OR Q ,target (R ,regnum:free-pointer))
+         (LEA Q (R ,regnum:free-pointer)
+              (@RO ,regnum:free-pointer
+                   ,(+ flonum-data-offset address-units-per-float))))))
+
 (define-rule statement
-  ;; convert a flonum object to a floating-point number
   (ASSIGN (REGISTER (? target)) (OBJECT->FLOAT (REGISTER (? source))))
   (let* ((source (move-to-temporary-register! source 'GENERAL))
-	 (target (flonum-target! target)))
+         (target (flonum-target-reference! target)))
     (LAP ,@(object->address (register-reference source))
-	 ,@(load-float (INST-EA (@RO B ,source 4)) target))))
-|#
-
-(define-rule statement
-  ;; Convert a flonum object to a floating-point number.  Unlike the
-  ;; version above which has an implicits OBJECT->ADDRESS, this one
-  ;; uses the addressing mode to remove the type-code.  Saves a cycle
-  ;; and maybe a register spill if SOURCE is live after instruction.
-  (ASSIGN (REGISTER (? target)) (OBJECT->FLOAT (REGISTER (? source))))
-  (let* ((source (source-register source))
-	 (target (flonum-target! target)))
-    (object->float source target)))
-
-(define (object->float source-register target)
-  (let ((untagging+offset
-	 (- 4 (make-non-pointer-literal (ucode-type flonum) 0))))
-    (load-float (INST-EA (@RO W ,source-register ,untagging+offset)) target)))
+         (MOVF S D ,target (@RO ,source ,flonum-data-offset)))))
 
-;;;; Floating-point vector support.
-
 (define-rule statement
   (ASSIGN (REGISTER (? target)) (? expression rtl:simple-float-offset?))
   (let* ((source (float-offset->reference! expression))
-	 (target (flonum-target! target)))
-    (load-float source target)))
+         (target (flonum-target-reference! target)))
+    (LAP (MOVF S D ,target ,source))))
 
 (define-rule statement
   (ASSIGN (? expression rtl:simple-float-offset?) (REGISTER (? source)))
-  (let ((source (flonum-source! source))
-	(target (float-offset->reference! expression)))
-    (store-float source target)))
+  (let ((source (flonum-source-reference! source))
+        (target (float-offset->reference! expression)))
+    (LAP (MOVF S D ,target ,source))))
 
 (define-rule statement
-  (ASSIGN (REGISTER (? target))
-	  (? expression rtl:detagged-float-offset?))
+  (ASSIGN (REGISTER (? target)) (? expression rtl:detagged-float-offset?))
   (with-detagged-float-location expression
-    (lambda (temp)
-      (load-float temp target))))
+    (lambda (source)
+      (LAP (MOVF S D ,(flonum-target-reference! target) ,source)))))
 
 (define-rule statement
-  (ASSIGN (? expression rtl:detagged-float-offset?)
-	  (REGISTER (? source)))
+  (ASSIGN (? expression rtl:detagged-float-offset?) (REGISTER (? source)))
   (with-detagged-float-location expression
-    (lambda (temp)
-      (store-float (flonum-source! source) temp))))
+    (lambda (target)
+      (LAP (MOVF S D ,target ,(flonum-source-reference! source))))))
 
-(define (with-detagged-float-location rtl-expression recvr)
-  ;; Never needs to protect a register because it is a float register!
+(define (with-detagged-float-location rtl-expression receiver)
   (with-decoded-detagged-float-offset rtl-expression
-    (lambda (base index w-offset)
-      (with-indexed-address base index 8 (* 4 w-offset) false recvr))))
+    (lambda (base float-index object-offset)
+      (with-indexed-address base float-index address-units-per-float
+          (* address-units-per-object object-offset)
+          ;; No general registers to protect -- the target and source
+          ;; will always be float registers.
+          #f
+        receiver))))
+
+;;; These are nearly identical copies of RTL:DETAGGED-OFFSET? and
+;;; WITH-DECODED-DETAGGED-OFFSET, with FLOAT-OFFSET substituted for
+;;; OFFSET.  It is unfortunate that the RTL doesn't have a clearer
+;;; abstraction of offsets and addresses with arbitrary data.
 
 (define (rtl:detagged-float-offset? expression)
   (and (rtl:float-offset? expression)
-       (let ((base (rtl:float-offset-base expression))
-	     (offset (rtl:float-offset-offset expression)))
-	 (and (rtl:offset-address? base)
-	      (rtl:machine-constant? (rtl:offset-address-offset base))
-	      (rtl:detagged-index? (rtl:offset-address-base base)
-				   offset)))
+       (rtl:machine-constant? (rtl:float-offset-offset expression))
+       (let ((base (rtl:float-offset-base expression)))
+         (and (rtl:float-offset-address? base)
+              (rtl:detagged-index? (rtl:float-offset-address-base base)
+                                   (rtl:float-offset-address-offset base))))
        expression))
 
-(define (with-decoded-detagged-float-offset expression recvr)
-  (let ((base (rtl:float-offset-base expression))
-	(index (rtl:float-offset-offset expression)))
-    (let ((base* (rtl:offset-address-base base)))
-      (recvr (rtl:register-number (if (rtl:register? base*)
-				      base*
-				      (rtl:object->address-expression base*)))
-	     (rtl:register-number (if (rtl:register? index)
-				      index
-				      (rtl:object->datum-expression index)))
-	     (rtl:machine-constant-value (rtl:offset-address-offset base))))))
-
-(define (load-float ea sti)
-  (LAP (FLD D ,ea)
-       (FSTP (ST ,(1+ sti)))))
-
-(define (store-float sti ea)
-  (if (zero? sti)
-      (LAP (FST D ,ea))
-      (LAP (FLD (ST ,sti))
-	   (FSTP D ,ea))))
+(define (with-decoded-detagged-float-offset expression receiver)
+  (let ((base (rtl:float-offset-base expression)))
+    (let ((base* (rtl:float-offset-address-base base))
+          (index (rtl:float-offset-address-offset base)))
+      (receiver
+       (rtl:register-number (if (rtl:register? base*)
+                                base*
+                                (rtl:object->address-expression base*)))
+       (rtl:register-number (if (rtl:register? index)
+                                index
+                                (rtl:object->datum-expression index)))
+       (rtl:machine-constant-value (rtl:float-offset-offset expression))))))
 
 ;;;; Flonum Arithmetic
 
 (define-rule statement
   (ASSIGN (REGISTER (? target))
-	  (FLONUM-1-ARG (? operation) (REGISTER (? source)) (? overflow?)))
-  overflow?				;ignore
-  ((flonum-1-arg/operator operation) target source))
-
-(define ((flonum-unary-operation/general operate) target source)
-  (define (default)
-    (let* ((source (flonum-source! source))
-	   (target (flonum-target! target)))
-      (operate target source)))
-  ;; Attempt to reuse source for target if it is in ST(0).
-  ;; Otherwise we will target ST(0) by sorting the machine registers.
-  (cond ((and (pseudo-register? target) (pseudo-register? source)
-	      (eqv? fr0 (pseudo-register-alias *register-map* 'FLOAT source)))
-	 (reuse-pseudo-register-alias
-	  source 'FLOAT
-	  (lambda (alias)
-	    (let* ((sti (floreg->sti alias)))
-	      (delete-register! alias)
-	      (delete-dead-registers!)
-	      (add-pseudo-register-alias! target alias)
-	      (operate sti sti)))
-	  default))
-	(else (default))))
-
-'(define ((flonum-unary-operation/general operate) target source)
-  (define (default)
-    (let* ((source (flonum-source! source))
-	   (target (flonum-target! target)))
-      (operate target source)))
-  ;; Attempt to reuse source for target.  This works well when the
-  ;; source is ST(0).  We try to arrange this by sorting the registers
-  ;; to give allocation preference to ST(0).
-  (cond ((pseudo-register? target)
-	 (reuse-pseudo-register-alias
-	  source 'FLOAT
-	  (lambda (alias)
-	    (let* ((sti (floreg->sti alias)))
-	      (delete-register! alias)
-	      (delete-dead-registers!)
-	      (add-pseudo-register-alias! target alias)
-	      (operate sti sti)))
-	  default))
-	(else (default))))
-
-'(define ((flonum-unary-operation/general operate) target source)
-  (define (default)
-    (let* ((source (flonum-source! source))
-	   (target (flonum-target! target)))
-      (operate target source)))
-  ;; Attempt to reuse source for target.  This works well when the
-  ;; source is ST(0).  We try to arrange this by sorting the registers
-  ;; to give allocation preference to ST(0).
-  (cond ((pseudo-register? target)
-	 (let ((alias
-		(and (dead-register? source)
-		     (pseudo-register-alias *register-map* 'FLOAT source))))
-	   (if alias
-	       (default)))
-	
-	(reuse-pseudo-register-alias
-	  source 'FLOAT
-	  (lambda (alias)
-	    (let* ((sti (floreg->sti alias)))
-		(delete-register! alias)
-		(delete-dead-registers!)
-		(add-pseudo-register-alias! target alias)
-		(operate sti sti)))
-	  default))
-	(else (default))))
-
-(define (flonum-1-arg/operator operation)
+          (FLONUM-1-ARG (? operator) (REGISTER (? source)) (? overflow?)))
+  overflow?                             ;ignore
+  (let* ((source (flonum-source-reference! source))
+         (target (flonum-target-reference! target)))
+    ((flonum-1-arg/operator operator) target source)))
+
+(define-integrable (flonum-1-arg/operator operation)
   (lookup-arithmetic-method operation flonum-methods/1-arg))
 
 (define flonum-methods/1-arg
   (list 'FLONUM-METHODS/1-ARG))
-
-;;; Notice the weird ,', syntax here.
-;;; If LAP changes, this may also have to change.
 
-(let-syntax
-    ((define-flonum-operation
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((primitive-name (cadr form))
-		(opcode (caddr form)))
-	    `(define-arithmetic-method ',primitive-name flonum-methods/1-arg
-	       (flonum-unary-operation/general
-		(lambda (target source)
-		  (if (and (zero? target) (zero? source))
-		      (LAP (,opcode))
-		      (LAP (FLD (ST ,', source))
-			   (,opcode)
-			   (FSTP (ST ,',(1+ target)))))))))))))
-  (define-flonum-operation FLONUM-NEGATE FCHS)
-  (define-flonum-operation FLONUM-ABS FABS)
-  ;; Disabled: FSIN and FCOS limited to pi * 2^62.
-  ;;(define-flonum-operation FLONUM-SIN FSIN)
-  ;;(define-flonum-operation FLONUM-COS FCOS)
-  (define-flonum-operation FLONUM-SQRT FSQRT)
-  (define-flonum-operation FLONUM-ROUND FRNDINT))
-
-;; These (and FLONUM-ROUND above) presume that the default rounding mode
-;; is round-to-nearest/even
-
-(define (define-rounding prim-name mode)
-  (define-arithmetic-method prim-name flonum-methods/1-arg
-    (flonum-unary-operation/general
-     (lambda (target source)
-       (let ((temp (temporary-register-reference)))
-	 (LAP (FSTCW (@R ,regnum:free-pointer))
-	      ,@(if (and (zero? target) (zero? source))
-		    (LAP)
-		    (LAP (FLD (ST ,source))))
-	      (MOV B ,temp (@RO B ,regnum:free-pointer 1))
-	      (OR B (@RO B ,regnum:free-pointer 1) (&U ,mode))
-	      (FNLDCW (@R ,regnum:free-pointer))
-	      (FRNDINT)
-	      (MOV B (@RO B ,regnum:free-pointer 1) ,temp)
-	      ,@(if (and (zero? target) (zero? source))
-		    (LAP)
-		    (LAP (FSTP (ST ,(1+ target)))))
-	      (FNLDCW (@R ,regnum:free-pointer))))))))
-
-(define-rounding 'FLONUM-CEILING #x08)
-(define-rounding 'FLONUM-FLOOR #x04)
-(define-rounding 'FLONUM-TRUNCATE #x0c)
-
-;; This is used in order to avoid using two stack locations for
-;; the remainder unary operations.
-
-(define ((flonum-unary-operation/stack-top operate) target source)
-  (define (finish source->top)
-    ;; Perhaps this can be improved?
-    (rtl-target:=machine-register! target fr0)
-    (LAP ,@source->top
-	 ,@(operate)))
-
-  (if (or (machine-register? source)
-	  (not (is-alias-for-register? fr0 source))
-	  (not (dead-register? source)))
-      (finish (load-machine-register! source fr0))
-      (begin
-	(delete-dead-registers!)
-	(finish (LAP)))))
-
-(define-arithmetic-method 'FLONUM-LOG flonum-methods/1-arg
-  (flonum-unary-operation/stack-top
-   (lambda ()
-     (LAP (FLDLN2)
-	  (FXCH (ST 0) (ST 1))
-	  (FYL2X)))))
-
-(define-arithmetic-method 'FLONUM-EXP flonum-methods/1-arg
-  (flonum-unary-operation/stack-top
-   (lambda ()
-     ;; Hair to avoid arithmetic for non-finite inputs: exp(-inf) = 0,
-     ;; but exp(x) = x for any other non-finite x.  We use the first
-     ;; free slot (1) to pick apart the double format to check for
-     ;; non-finite inputs, and (2) to avoid using two stack slots.
-     (let ((temp (temporary-register-reference))
-	   (infinity-or-nan (generate-label 'INFINITY-OR-NAN))
-	   (join (generate-label 'JOIN))
-	   (temp-pointer regnum:free-pointer))
-       (LAP (FST D (@R ,temp-pointer))
-	    (MOV W ,temp (@RO W ,temp-pointer 4))
-	    (AND W ,temp (&U #x7FFFFFFF))
-	    (CMP W ,temp (&U #x7FF00000))
-	    (JAE B (@PCR ,infinity-or-nan))
-	    ;; Compute 2^(x log_2 e) with F2XM1 and FSCALE.
-	    (FLDL2E)			;st0 = lg e, st1 = x
-	    (FMULP (ST 1) (ST 0))	;st0 = x lg e
-	    (FLD (ST 0))		;st0 = x lg e, st1 = x lg e
-	    (FRNDINT)			;st0 = I(x lg e), st1 = x lg e
-	    (FSUB (ST 1) (ST 0))	;st0 = I(x lg e), st1 = F(x lg e)
-	    (FSTP D (@R ,temp-pointer)) ;st0 = F(x lg e), save I(x lg e)
-	    (F2XM1)			;st0 = 2^F(x lg e) - 1
-	    (FLD1)			;st0 = 1, st1 = 2^F(x lg e) - 1
-	    (FADD)			;st0 = 2^F(x lg e)
-	    (FLD D (@R ,temp-pointer))	;st0 = I(x lg e), st1 = 2^F(x lg e)
-	    (FXCH (ST 0) (ST 1))	;st0 = 2^F(x lg e), st1 = I(x lg e)
-	    (FSCALE)			;st0 = 2^F(x lg e) * 2^I(x lg e),
-					;st1 = I(x lg e)
-	    (FSTP (ST 1))		;Drop st1, leaving in st0 the value
-	    (JMP B (@PCR ,join))	; 2^(F(x lg e) + I(x lg e)) = e^x.
-	  (LABEL ,infinity-or-nan)
-	    (CMP W (@RO W ,temp-pointer 4) (&U #xFFF00000))
-	    (JNE B (@PCR ,join))
-	    (CMP W (@RO W ,temp-pointer 0) (& 0))
-	    (JNE B (@PCR ,join))
-	    (FSTP (ST 0))		;Pop argument.
-	    (FLDZ)			;Return zero.
-	  (LABEL ,join))))))
+(define ((flonum-unary-operation/target-bits bit-string operate) target source)
+  (LAP (MOVF S D ,target (@PCR ,(allocate-double-float-bits-label bit-string)))
+       ,@(operate target source)))
+
+(define double-flobits:negative-zero
+  (let ((bit-string (make-bit-string 64 #f)))
+    (bit-string-set! bit-string 63)
+    bit-string))
+
+(define-arithmetic-method 'FLONUM-ABS flonum-methods/1-arg
+  (flonum-unary-operation/target-bits
+   (bit-string-not double-flobits:negative-zero)
+   (lambda (target source)
+     ;; No scalar version, but doing this packed is harmless.
+     (LAP (ANDF P D ,target ,source)))))
+
+(define-arithmetic-method 'FLONUM-NEGATE flonum-methods/1-arg
+  (flonum-unary-operation/target-bits
+   double-flobits:negative-zero
+   (lambda (target source)
+     ;; No scalar version, but doing this packed is harmless.
+     (LAP (XORF P D ,target ,source)))))
+
+(define-arithmetic-method 'FLONUM-SQRT flonum-methods/1-arg
+  (lambda (target source)
+    (LAP (SQRTF S D ,target ,source))))
 
-#|
-;; Disabled: FPTAN limited to pi * 2^62.
-(define-arithmetic-method 'FLONUM-TAN flonum-methods/1-arg
-  (flonum-unary-operation/stack-top
-   (lambda ()
-     (LAP (FPTAN)
-	  (FSTP (ST 0))			; FPOP
-	  ))))
-|#
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-2-ARGS (? operator)
+                         (REGISTER (? source1))
+                         (REGISTER (? source2))
+                         (? overflow?)))
+  overflow?                             ;ignore
+  ((flonum-2-args/operator operator) target source1 source2))
 
-(define-arithmetic-method 'FLONUM-ATAN flonum-methods/1-arg
-  (flonum-unary-operation/stack-top
-   (lambda ()
-     (LAP (FLD1)
-	  (FPATAN)))))
-
-;; For now, these preserve values in memory
-;; in order to avoid flushing a stack location.
-
-(define-arithmetic-method 'FLONUM-ACOS flonum-methods/1-arg
-  (flonum-unary-operation/stack-top
-   (lambda ()
-     (LAP (FST D (@R ,regnum:free-pointer))
-	  (FMUL (ST 0) (ST 0))
-	  (FLD1)
-	  (F%SUBP (ST 1) (ST 0))
-	  (FSQRT)
-	  (FLD D (@R ,regnum:free-pointer))
-	  (FPATAN)))))
-
-(define-arithmetic-method 'FLONUM-ASIN flonum-methods/1-arg
-  (flonum-unary-operation/stack-top
-   (lambda ()
-     (LAP (FST D (@R ,regnum:free-pointer))
-	  (FMUL (ST 0) (ST 0))
-	  (FLD1)
-	  (F%SUBP (ST 1) (ST 0))
-	  (FSQRT)
-	  (FLD D (@R ,regnum:free-pointer))
-	  (FXCH (ST 0) (ST 1))
-	  (FPATAN)))))
-
 (define-rule statement
   (ASSIGN (REGISTER (? target))
-	  (FLONUM-2-ARGS (? operation)
-			 (REGISTER (? source1))
-			 (REGISTER (? source2))
-			 (? overflow?)))
-  overflow?				;ignore
-  ((flonum-2-args/operator operation) target source1 source2))
-
-;; Binary instructions all use ST(0), and are of the forms
-;;   Fop ST(0),ST(i)
-;;   Fop ST(i),ST(0)
-;;   FopP ST(i),ST(0)
-;;   Fop ST(0),memory
-;;
-;; If possible, we like to target ST(0) since it is likely to be the
-;; source of a subsequent operation.  Failing that, it is good to
-;; reuse one of the source aliases.
-
-(define ((flonum-binary-operation operate) target source1 source2)
-  (define (default)
-    (let* ((sti1 (flonum-source! source1))
-	   (sti2 (flonum-source! source2)))
-      (operate (flonum-target! target) sti1 sti2)))
-  (define (try-reuse-1 if-cannot)
-    (reuse-pseudo-register-alias
-     source1 'FLOAT
-     (lambda (alias1)
-       (let* ((sti1 (floreg->sti alias1))
-	      (sti2 (if (= source1 source2)
-			sti1
-			(flonum-source! source2))))
-	 (delete-register! alias1)
-	 (delete-dead-registers!)
-	 (add-pseudo-register-alias! target alias1)
-	 (operate sti1 sti1 sti2)))
-     if-cannot))
-  (define (try-reuse-2 if-cannot)
-    (reuse-pseudo-register-alias
-     source2 'FLOAT
-     (lambda (alias2)
-       (let* ((sti2 (floreg->sti alias2))
-	      (sti1 (if (= source1 source2)
-			sti2
-			(flonum-source! source1))))
-	 (delete-register! alias2)
-	 (delete-dead-registers!)
-	 (add-pseudo-register-alias! target alias2)
-	 (operate sti2 sti1 sti2)))
-     if-cannot))
-  (cond ((pseudo-register? target)
-	 (if (is-alias-for-register? fr0 source1)
-	     (try-reuse-1 (lambda () (try-reuse-2 default)))
-	     (try-reuse-2 (lambda () (try-reuse-1 default)))))
-	((not (eq? (register-type target) 'FLOAT))
-	 (error "flonum-2-args: Wrong type register" target 'FLOAT))
-	(else (default))))
-
-(define (flonum-2-args/operator operation)
-  (lookup-arithmetic-method operation flonum-methods/2-args))
+          (FLONUM-2-ARGS (? operator)
+                         (REGISTER (? source))
+                         (OBJECT->FLOAT (CONSTANT (? constant)))
+                         (? overflow?)))
+  overflow?                             ;ignore
+  ((flonum-register*constant/operator operator) target source constant))
+
+(define-rule statement
+  (ASSIGN (REGISTER (? target))
+          (FLONUM-2-ARGS (? operator)
+                         (OBJECT->FLOAT (CONSTANT (? constant)))
+                         (REGISTER (? source))
+                         (? overflow?)))
+  overflow?                             ;ignore
+  ((flonum-constant*register/operator operator) target constant source))
 
 (define flonum-methods/2-args
   (list 'FLONUM-METHODS/2-ARGS))
 
-(define (flonum-1-arg%1/operator operation)
-  (lookup-arithmetic-method operation flonum-methods/1-arg%1))
+(define flonum-methods/register*constant
+  (list 'FLONUM-METHODS/REGISTER*CONSTANT))
 
-(define flonum-methods/1-arg%1
-  (list 'FLONUM-METHODS/1-ARG%1))
+(define flonum-methods/constant*register
+  (list 'FLONUM-METHODS/CONSTANT*REGISTER))
 
-(define (flonum-1%1-arg/operator operation)
-  (lookup-arithmetic-method operation flonum-methods/1%1-arg))
+(define-integrable (flonum-2-args/operator operator)
+  (lookup-arithmetic-method operator flonum-methods/2-args))
 
-(define flonum-methods/1%1-arg
-  (list 'FLONUM-METHODS/1%1-ARG))
+(define-integrable (flonum-register*constant/operator operator)
+  (lookup-arithmetic-method operator flonum-methods/register*constant))
 
-(define (binary-flonum-arithmetic? operation)
-  (memq operation '(FLONUM-ADD FLONUM-SUBTRACT FLONUM-MULTIPLY FLONUM-DIVIDE)))
+(define-integrable (flonum-constant*register/operator operator)
+  (lookup-arithmetic-method operator flonum-methods/constant*register))
 
-(let-syntax
-    ((define-flonum-operation
-       (sc-macro-transformer
-	(lambda (form environment)
-	  environment
-	  (let ((primitive-name (list-ref form 1))
-		(op1%2 (list-ref form 2))
-		(op1%2p (list-ref form 3))
-		(op2%1 (list-ref form 4))
-		(op2%1p (list-ref form 5)))
-	    `(begin
-	       (define-arithmetic-method ',primitive-name flonum-methods/2-args
-		 (flonum-binary-operation
-		  (lambda (target source1 source2)
-		    (cond ((= target source1)
-			   (cond ((zero? target)
-				  (LAP (,op1%2 (ST 0) (ST ,',source2))))
-				 ((zero? source2)
-				  (LAP (,op2%1 (ST ,',target) (ST 0))))
-				 (else
-				  (LAP (FLD (ST ,',source2))
-				       (,op2%1p (ST ,',(1+ target)) (ST 0))))))
-			  ((= target source2)
-			   (cond ((zero? target)
-				  (LAP (,op2%1 (ST 0) (ST ,',source1))))
-				 ((zero? source1)
-				  (LAP (,op1%2 (ST ,',target) (ST 0))))
-				 (else
-				  (LAP (FLD (ST ,',source1))
-				       (,op1%2p (ST ,',(1+ target)) (ST 0))))))
-			  (else
-			   (LAP (FLD (ST ,',source1))
-				(,op1%2 (ST 0) (ST ,',(1+ source2)))
-				(FSTP (ST ,',(1+ target)))))))))
-
-	       (define-arithmetic-method ',primitive-name
-		 flonum-methods/1%1-arg
-		 (flonum-unary-operation/general
-		  (lambda (target source)
-		    (if (= source target)
-			(LAP (FLD1)
-			     (,op1%2p (ST ,',(1+ target)) (ST 0)))
-			(LAP (FLD1)
-			     (,op1%2 (ST 0) (ST ,',(1+ source)))
-			     (FSTP (ST ,',(1+ target))))))))
-
-	       (define-arithmetic-method ',primitive-name
-		 flonum-methods/1-arg%1
-		 (flonum-unary-operation/general
-		  (lambda (target source)
-		    (if (= source target)
-			(LAP (FLD1)
-			     (,op2%1p (ST ,',(1+ target)) (ST 0)))
-			(LAP (FLD1)
-			     (,op2%1 (ST 0) (ST ,',(1+ source)))
-			     (FSTP (ST ,',(1+ target))))))))))))))
-
-  (define-flonum-operation FLONUM-ADD FADD FADDP FADD FADDP)
-  (define-flonum-operation FLONUM-SUBTRACT F%SUB F%SUBP F%SUBR F%SUBPR)
-  (define-flonum-operation FLONUM-MULTIPLY FMUL FMULP FMUL FMULP)
-  (define-flonum-operation FLONUM-DIVIDE F%DIV F%DIVP F%DIVR F%DIVPR))
-
-(define-arithmetic-method 'FLONUM-ATAN2 flonum-methods/2-args
-  (lambda (target source1 source2)
-    (if (and (not (machine-register? source1))
-	     (is-alias-for-register? fr0 source1)
-	     (dead-register? source1))
-	(let ((source2 (flonum-source! source2)))
-	  (delete-dead-registers!)
-	  (rtl-target:=machine-register! target fr0)
-	  (LAP (FLD (ST ,source2))
-	       (FPATAN)))
-	(begin
-	  (prefix-instructions! (load-machine-register! source1 fr0))
-	  (need-register! fr0)
-	  (let ((source2
-		 (if (= source2 source1) fr0 (flonum-source! source2))))
-	    (delete-dead-registers!)
-	    (rtl-target:=machine-register! target fr0)
-	    (LAP (FLD (ST ,source2))
-		 (FPATAN)))))))
-
-(define-arithmetic-method 'FLONUM-REMAINDER flonum-methods/2-args
-  (flonum-binary-operation
-   (lambda (target source1 source2)
-     (if (zero? source2)
-	 (LAP (FLD (ST ,source1))
-	      (FPREM1)
-	      (FSTP (ST ,(1+ target))))
-	 #|
-	 ;; This sequence is one cycle shorter than the one below,
-	 ;; but needs two spare stack locations instead of one.
-	 ;; Since FPREM1 is a variable, very slow instruction,
-	 ;; the difference in time will hardly be noticeable
-	 ;; but the availability of an extra "register" may be.
-	 (LAP (FLD (ST ,source2))
-	      (FLD (ST ,source1))
-	      (FPREM1)
-	      (FSTP (ST ,(+ target 2)))
-	      (FSTP (ST 0)))		; FPOP
-	 |#
-	 (LAP (FXCH (ST 0) (ST ,source2))
-	      (FLD (ST ,(if (zero? source1) source2 source1)))
-	      (FPREM1)
-	      (FSTP (ST ,(1+ (if (= target source2)
-				 0
-				 target))))
-	      (FXCH (ST 0) (ST ,source2)))))))
-
-(define-rule statement
-  (ASSIGN (REGISTER (? target))
-	  (FLONUM-2-ARGS FLONUM-SUBTRACT
-			 (OBJECT->FLOAT (CONSTANT 0.))
-			 (REGISTER (? source))
-			 (? overflow?)))
-  overflow?				;ignore
-  ((flonum-unary-operation/general
-    (lambda (target source)
-      (if (and (zero? target) (zero? source))
-	  (LAP (FCHS))
-	  (LAP (FLD (ST ,source))
-	       (FCHS)
-	       (FSTP (ST ,(1+ target)))))))
-   target source))
+(define ((flonum-2-args/standard commutative? operate) target source1 source2)
+  (binary-register-operation operate commutative? 'FLOAT
+                             (lambda (target source)
+                               (LAP (MOVF S D ,target ,source)))
+                             target source1 source2))
+
+(define ((flonum-register*constant/standard operate) target source constant)
+  (with-float-operand constant
+    (lambda (operand)
+      (operate
+       (register-reference (move-to-alias-register! source 'FLOAT target))
+       operand))))
+
+;++ Possible improvement, not currently easy with the generic register
+;++ allocator operations provided: if the constant is zero and we have
+;++ a temporary register available, we can zero that with XOR and use
+;++ it in the place of loading a PC-relative double in memory.
+
+(define ((flonum-constant*register/commutative operate) target constant source)
+  (with-float-operand constant
+    (lambda (operand)
+      (operate
+       (register-reference (move-to-alias-register! source 'FLOAT target))
+       operand))))
+
+(define ((flonum-constant*register/noncommutative operate)
+         target constant source)
+  (let* ((source (flonum-source-reference! source))
+         (target (flonum-target-reference! target)))
+    (with-float-operand constant
+      (lambda (operand)
+        (LAP (MOVF S D ,target ,operand)
+             ,@(operate target source))))))
 
-(define-rule statement
-  (ASSIGN (REGISTER (? target))
-	  (FLONUM-2-ARGS (? operation)
-			 (REGISTER (? source))
-			 (OBJECT->FLOAT (CONSTANT 1.))
-			 (? overflow?)))
-  (QUALIFIER (binary-flonum-arithmetic? operation))
-  overflow?				;ignore
-  ((flonum-1-arg%1/operator operation) target source))
-
-(define-rule statement
-  (ASSIGN (REGISTER (? target))
-	  (FLONUM-2-ARGS (? operation)
-			 (OBJECT->FLOAT (CONSTANT 1.))
-			 (REGISTER (? source))
-			 (? overflow?)))
-  (QUALIFIER (binary-flonum-arithmetic? operation))
-  overflow?				;ignore
-  ((flonum-1%1-arg/operator operation) target source))
+(let-syntax
+    ((binary-operation
+      (sc-macro-transformer
+       (lambda (form environment)
+         environment                    ;ignore
+         (let ((name (cadr form))
+               (op (caddr form))
+               (commutative? (cadddr form)))
+           `(let ((operate
+                   (lambda (target source)
+                     (LAP (,op S D ,',target ,',source)))))
+              (define-arithmetic-method ',name flonum-methods/2-args
+                (flonum-2-args/standard ,commutative? operate))
+              (define-arithmetic-method ',name flonum-methods/register*constant
+                (flonum-register*constant/standard operate))
+              (define-arithmetic-method ',name flonum-methods/constant*register
+                (,(if commutative?
+                      'flonum-constant*register/commutative
+                      'flonum-constant*register/noncommutative)
+                 operate))))))))
+  (binary-operation FLONUM-ADD ADDF #t)
+  (binary-operation FLONUM-DIVIDE DIVF #f)
+  (binary-operation FLONUM-MULTIPLY MULF #t)
+  (binary-operation FLONUM-SUBTRACT SUBF #f))
 
 ;;;; Flonum Predicates
 
-(define-rule predicate
-  (FLONUM-PRED-1-ARG (? predicate) (REGISTER (? source)))
-  (flonum-compare-zero predicate source))
-
-(define-rule predicate
-  (FLONUM-PRED-2-ARGS (? predicate)
-		      (REGISTER (? source1))
-		      (REGISTER (? source2)))
-  (let* ((st1 (flonum-source! source1))
-	 (st2 (flonum-source! source2)))
-    (cond ((zero? st1)
-	   (flonum-branch! predicate
-			   (LAP (FCOM (ST 0) (ST ,st2)))))
-	  ((zero? st2)
-	   (flonum-branch! (commute-flonum-predicate predicate)
-			   (LAP (FCOM (ST 0) (ST ,st1)))))
-	  (else
-	   (flonum-branch! predicate
-			   (LAP (FLD (ST ,st1))
-				(FCOMP (ST 0) (ST ,(1+ st2)))))))))
+(define double-flobits:zero
+  (make-bit-string 64 #f))
 
 (define-rule predicate
-  (FLONUM-PRED-2-ARGS (? predicate)
-		      (REGISTER (? source))
-		      (OBJECT->FLOAT (CONSTANT 0.)))
-  (flonum-compare-zero predicate source))
+  (FLONUM-PRED-1-ARG (? predicate) (REGISTER (? source)))
+  (flonum-branch!
+   predicate
+   (flonum-source-reference! source)
+   (INST-EA (@PCR ,(allocate-double-float-bits-label double-flobits:zero)))))
 
 (define-rule predicate
   (FLONUM-PRED-2-ARGS (? predicate)
-		      (OBJECT->FLOAT (CONSTANT 0.))
-		      (REGISTER (? source)))
-  (flonum-compare-zero (commute-flonum-predicate predicate) source))
+                      (REGISTER (? source1))
+                      (REGISTER (? source2)))
+  (flonum-branch! predicate
+                  (flonum-source-reference! source1)
+                  (flonum-source-reference! source2)))
 
 (define-rule predicate
   (FLONUM-PRED-2-ARGS (? predicate)
-		      (REGISTER (? source))
-		      (OBJECT->FLOAT (CONSTANT 1.)))
-  (flonum-compare-one predicate source))
+                      (REGISTER (? source))
+                      (OBJECT->FLOAT (CONSTANT (? constant))))
+  (with-float-operand constant
+    (lambda (operand)
+      (flonum-branch! predicate (flonum-source-reference! source) operand))))
 
 (define-rule predicate
   (FLONUM-PRED-2-ARGS (? predicate)
-		      (OBJECT->FLOAT (CONSTANT 1.))
-		      (REGISTER (? source)))
-  (flonum-compare-one (commute-flonum-predicate predicate) source))
-
-(define (flonum-compare-zero predicate source)
-  (let ((sti (flonum-source! source)))
-    (if (zero? sti)
-	(flonum-branch! predicate
-			(LAP (FTST)))
-	(flonum-branch! (commute-flonum-predicate predicate)
-			(LAP (FLDZ)
-			     (FCOMP (ST 0) (ST ,(1+ sti))))))))
-
-(define (flonum-compare-one predicate source)
-  (let ((sti (flonum-source! source)))
-    (flonum-branch! (commute-flonum-predicate predicate)
-		    (LAP (FLD1)
-			 (FCOMP (ST 0) (ST ,(1+ sti)))))))
-
-(define (commute-flonum-predicate pred)
-  (case pred
+                      (OBJECT->FLOAT (CONSTANT (? constant)))
+                      (REGISTER (? source)))
+  (with-float-operand constant
+    (lambda (operand)
+      (flonum-branch! (commute-flonum-predicate predicate)
+                      (flonum-source-reference! source)
+                      operand))))
+
+(define (commute-flonum-predicate predicate)
+  (case predicate
     ((FLONUM-EQUAL? FLONUM-ZERO?) 'FLONUM-EQUAL?)
     ((FLONUM-LESS? FLONUM-NEGATIVE?) 'FLONUM-GREATER?)
     ((FLONUM-GREATER? FLONUM-POSITIVE?) 'FLONUM-LESS?)
-    (else
-     (error "commute-flonum-predicate: Unknown predicate" pred))))
+    (else (error "commute-flonum-predicate: Unknown predicate" predicate))))
 
-(define (flonum-branch! predicate prefix)
+(define (flonum-branch! predicate source1 source2)
   (case predicate
     ((FLONUM-EQUAL? FLONUM-ZERO?)
      (set-current-branches! (lambda (label)
-			      (let ((unordered (generate-label 'UNORDERED)))
-				(LAP (JP (@PCR ,unordered))
-				     (JE (@PCR ,label))
-				     (LABEL ,unordered))))
-			    (lambda (label)
-			      (LAP (JNE (@PCR ,label))
-				   (JP (@PCR ,label))))))
+                              (let ((unordered (generate-label 'UNORDERED)))
+                                (LAP (JP (@PCR ,unordered))
+                                     (JE (@PCR ,label))
+                                     (LABEL ,unordered))))
+                            (lambda (label)
+                              (LAP (JNE (@PCR ,label))
+                                   (JP (@PCR ,label))))))
     ((FLONUM-LESS? FLONUM-NEGATIVE?)
      (set-current-branches! (lambda (label)
-			      (let ((unordered (generate-label 'UNORDERED)))
-				(LAP (JP (@PCR ,unordered))
-				     (JB (@PCR ,label))
-				     (LABEL ,unordered))))
-			    (lambda (label)
-			      (LAP (JAE (@PCR ,label))
-				   (JP (@PCR ,label))))))
+                              (let ((unordered (generate-label 'UNORDERED)))
+                                (LAP (JP (@PCR ,unordered))
+                                     (JB (@PCR ,label))
+                                     (LABEL ,unordered))))
+                            (lambda (label)
+                              (LAP (JAE (@PCR ,label))
+                                   (JP (@PCR ,label))))))
     ((FLONUM-GREATER? FLONUM-POSITIVE?)
      (set-current-branches! (lambda (label)
-			      (LAP (JA (@PCR ,label))))
-			    (lambda (label)
-			      (LAP (JBE (@PCR ,label))))))
+                              (LAP (JA (@PCR ,label))))
+                            (lambda (label)
+                              (LAP (JBE (@PCR ,label))))))
     (else
      (error "flonum-branch!: Unknown predicate" predicate)))
-  (flush-register! eax)
-  (LAP ,@prefix
-       (FSTSW (R ,eax))
-       (SAHF)))
-
-;; This is endianness dependent!
-
-(define (flonum-value->data-decl value)
-  (let ((high (make-bit-string 32 false))
-	(low (make-bit-string 32 false)))
-    (read-bits! value 32 high)
-    (read-bits! value 64 low)
-    (LAP ,@(lap:comment `(FLOAT ,value))
-	 (LONG U ,(bit-string->unsigned-integer high))
-	 (LONG U ,(bit-string->unsigned-integer low)))))
-
-(define (flo:32-bit-representation-exact? value)
-  ;; Returns unsigned long representation if 32 bit representation
-  ;; exists, i.e. if all `1' significant mantissa bits fit in the 32
-  ;; bit format and the exponent is within range.
-  (let ((mant-diff (make-bit-string (- 52 23) false)))
-    (read-bits! value (+ 32 0) mant-diff)
-    (and (bit-string-zero? mant-diff)
-	 (let ((expt64 (make-bit-string 11 false)))
-	   (read-bits! value (+ 32 52) expt64)
-	   (let ((expt (- (bit-string->unsigned-integer expt64) 1022)))
-	     (and (<= -127 expt 127)
-		  (let ((sign (make-bit-string 1  false))
-			(mant32 (make-bit-string 23 false)))
-		    (read-bits! value (+ 32 52 11) sign)
-		    (read-bits! value (+ 32 52 -23) mant32)
-		    (bit-string->unsigned-integer
-		     (bit-string-append
-		      (bit-string-append
-		       mant32
-		       (unsigned-integer->bit-string 8 (+ 126 expt)))
-		      sign)))))))))
-
-(define (flonum->label value block-name alignment offset data)
-  (let* ((block
-	  (or (find-extra-code-block block-name)
-	      (let ((block (declare-extra-code-block! block-name
-						      'ANYWHERE
-						      '())))
-		(add-extra-code!
-		 block
-		 (LAP (PADDING ,offset ,alignment ,padding-string)))
-		block)))
-	 (pairs (extra-code-block/xtra block))
-	 (place (assoc value pairs)))
-    (if place
-	(cdr place)
-	(let ((label (generate-label block-name)))
-	  (set-extra-code-block/xtra!
-	   block
-	   (cons (cons value label) pairs))
-	  (add-extra-code! block
-			   (LAP (LABEL ,label)
-				,@data))
-	  label))))
-
-(define (double-flonum->label fp-value)
-  (flonum->label fp-value 'DOUBLE-FLOATS 8 0
-		 (flonum-value->data-decl fp-value)))
-
-(define (single-flonum->label fp-value)
-  (flonum->label fp-value 'SINGLE-FLOATS 4 0
-		 (LAP ,@(lap:comment `(SINGLE-FLOAT ,fp-value))
-		      (LONG U ,(flo:32-bit-representation-exact? fp-value)))))
-				     
+  (LAP (UCOMISF D ,source1 ,source2)))
+                                   
 (define-rule statement
   (ASSIGN (REGISTER (? target)) (OBJECT->FLOAT (CONSTANT (? fp-value))))
   (cond ((not (flo:flonum? fp-value))
-	 (error "OBJECT->FLOAT: Not a floating-point value" fp-value))
-	((flo:= fp-value 0.0)
-	 (let ((target (flonum-target! target)))
-	   (LAP (FLDZ)
-		(FSTP (ST ,(1+ target))))))
-	((flo:= fp-value 1.0)
-	 (let ((target (flonum-target! target)))
-	   (LAP (FLD1)
-		(FSTP (ST ,(1+ target))))))
-	(compiler:cross-compiling?
-	 (let* ((temp (allocate-temporary-register! 'GENERAL))
-		(target (flonum-target! target)))
-	   (LAP ,@(load-constant (register-reference temp) fp-value)
-		,@(object->float temp target))))
-	(else
-	 (let ((target (flonum-target! target)))
-	   (with-pcr-float fp-value
-	      (lambda (ea size)
-		(LAP (FLD ,size ,ea)
-		     (FSTP (ST ,(1+ target))))))))))
-
-(define (with-pcr-float fp-value receiver)
-  (define (generate-ea label-expr size)
-    (with-pc
-     (lambda (pc-label pc-register)
-       (receiver (INST-EA (@RO W ,pc-register (- ,label-expr ,pc-label)))
-		 size))))
-  (if (flo:32-bit-representation-exact? fp-value)
-      (generate-ea (single-flonum->label fp-value) 'S)
-      (generate-ea (double-flonum->label fp-value) 'D)))
-|#
+         (error "OBJECT->FLOAT: Not a floating-point value" fp-value))
+        ((flo:= fp-value 0.0)
+         (let ((target (flonum-target-reference! target)))
+           (LAP (XORF P D ,target ,target))))
+        (else
+         (with-float-operand fp-value
+           (lambda (operand)
+             (LAP (MOVF S D ,(flonum-target-reference! target) ,operand)))))))
+
+(define (with-float-operand fp-value receiver)
+  (if (not (flo:flonum? fp-value))
+      (error "Invalid constant flonum operand:" fp-value))
+  (if compiler:cross-compiling?
+      (let ((temp (allocate-temporary-register! 'GENERAL)))
+        (LAP ,@(load-constant (register-reference temp) fp-value)
+             ,@(object->address (register-reference temp))
+             ,@(receiver (INST-EA (@RO ,temp ,flonum-data-offset)))))
+      (receiver (INST-EA (@PCR ,(allocate-double-float-label fp-value))))))
+
+(define (allocate-double-float-bits-label bit-string)
+  (allocate-data-label bit-string 'DOUBLE-FLOATS 0 8
+    (LAP (QUAD U ,(bit-string->unsigned-integer bit-string)))))
+
+(define (allocate-single-float-bits-label bit-string)
+  (allocate-data-label bit-string 'SINGLE-FLOATS 0 4
+    (LAP (LONG U ,(bit-string->unsigned-integer bit-string)))))
+
+(define (allocate-double-float-label flonum)
+  (allocate-double-float-bits-label
+   (let ((bit-string (make-bit-string 64 #f)))
+     ;; Skip the manifest preceding the flonum data.  Is there a
+     ;; better way to express this?
+     (let* ((bytes-per-object (vector-ref (gc-space-status) 0))
+            (bits-per-object (* 8 bytes-per-object))
+            (flonum-data-offset-in-bits bits-per-object))
+       (read-bits! flonum flonum-data-offset-in-bits bit-string))
+     bit-string)))
diff --git a/src/compiler/machines/x86-64/rulrew.scm b/src/compiler/machines/x86-64/rulrew.scm
index 70259dc8e..185d1bd4c 100644
--- a/src/compiler/machines/x86-64/rulrew.scm
+++ b/src/compiler/machines/x86-64/rulrew.scm
@@ -215,94 +215,24 @@ USA.
 
 (define-rule rewriting
   (OBJECT->FLOAT (REGISTER (? operand register-known-value)))
-  (QUALIFIER
-   (rtl:constant-flonum-test operand (lambda (v) v #T)))
+  ;; This is not quite what we want.  We really want to rewrite all
+  ;; OBJECT->FLOAT expressions with known constant operands, not just
+  ;; the nonzero ones, and then decide later whether to put it in
+  ;; memory based on whether there is a temporary register that we can
+  ;; zero with XOR.  By not rewriting this case when the constant is
+  ;; zero, using a temporary may cause some other register to be
+  ;; written to memory, which defeats the purpose of using XOR to
+  ;; avoid memory access.
+  (QUALIFIER (rtl:constant-flonum-test operand flo:nonzero?))
   (rtl:make-object->float operand))
 
 (define-rule rewriting
   (FLONUM-2-ARGS FLONUM-SUBTRACT
-		 (REGISTER (? operand-1 register-known-value))
-		 (? operand-2)
-		 (? overflow?))
-  (QUALIFIER (rtl:constant-flonum-test operand-1 flo:zero?))
-  (rtl:make-flonum-2-args 'FLONUM-SUBTRACT operand-1 operand-2 overflow?))
-
-(define-rule rewriting
-  (FLONUM-2-ARGS (? operation)
-		 (REGISTER (? operand-1 register-known-value))
-		 (? operand-2)
-		 (? overflow?))
-  (QUALIFIER
-   (and (memq operation
-	      '(FLONUM-ADD FLONUM-SUBTRACT FLONUM-MULTIPLY FLONUM-DIVIDE))
-	(rtl:constant-flonum-test operand-1 flo:one?)))
-  (rtl:make-flonum-2-args operation operand-1 operand-2 overflow?))
-
-(define-rule rewriting
-  (FLONUM-2-ARGS (? operation)
-		 (? operand-1)
-		 (REGISTER (? operand-2 register-known-value))
-		 (? overflow?))
-  (QUALIFIER
-   (and (memq operation
-	      '(FLONUM-ADD FLONUM-SUBTRACT FLONUM-MULTIPLY FLONUM-DIVIDE))
-	(rtl:constant-flonum-test operand-2 flo:one?)))
-  (rtl:make-flonum-2-args operation operand-1 operand-2 overflow?))
-
-(define-rule rewriting
-  (FLONUM-PRED-2-ARGS (? predicate)
-		      (? operand-1)
-		      (REGISTER (? operand-2 register-known-value)))
-  (QUALIFIER (rtl:constant-flonum-test operand-2 flo:zero?))
-  (list 'FLONUM-PRED-2-ARGS predicate operand-1 operand-2))
-
-(define-rule rewriting
-  (FLONUM-PRED-2-ARGS (? predicate)
-		      (REGISTER (? operand-1 register-known-value))
-		      (? operand-2))
-  (QUALIFIER (rtl:constant-flonum-test operand-1 flo:zero?))
-  (list 'FLONUM-PRED-2-ARGS predicate operand-1 operand-2))
-
-#|
-;; These don't work as written.  They are not simplified and are
-;; therefore passed whole to the back end, and there is no way to
-;; construct the graph at this level.
-
-;; acos (x) = atan ((sqrt (1 - x^2)) / x)
-
-(define-rule pre-cse-rewriting
-  (FLONUM-1-ARG FLONUM-ACOS (? operand) #f)
-  (rtl:make-flonum-2-args
-   'FLONUM-ATAN2
-   (rtl:make-flonum-1-arg
-    'FLONUM-SQRT
-    (rtl:make-flonum-2-args
-     'FLONUM-SUBTRACT
-     (rtl:make-object->float (rtl:make-constant 1.))
-     (rtl:make-flonum-2-args 'FLONUM-MULTIPLY operand operand false)
-     false)
-    false)
-   operand
-   false))
-
-;; asin (x) = atan (x / (sqrt (1 - x^2)))
-
-(define-rule pre-cse-rewriting
-  (FLONUM-1-ARG FLONUM-ASIN (? operand) #f)
-  (rtl:make-flonum-2-args
-   'FLONUM-ATAN2
-   operand
-   (rtl:make-flonum-1-arg
-    'FLONUM-SQRT
-    (rtl:make-flonum-2-args
-     'FLONUM-SUBTRACT
-     (rtl:make-object->float (rtl:make-constant 1.))
-     (rtl:make-flonum-2-args 'FLONUM-MULTIPLY operand operand false)
-     false)
-    false)
-   false))
-
-|#
+                 (REGISTER (? operand1 register-known-value))
+                 (? operand2)
+                 (? overflow?))
+  (QUALIFIER (rtl:constant-flonum-test operand1 flo:zero?))
+  (rtl:make-flonum-1-arg 'FLONUM-NEGATE operand2 overflow?))
 
 (define (rtl:constant-flonum-test expression predicate)
   (and (rtl:object->float? expression)
@@ -312,8 +242,8 @@ USA.
 		(and (flo:flonum? n)
 		     (predicate n)))))))
 
-(define (flo:one? value)
-  (flo:= value 1.))
+(define-integrable (flo:nonzero? value)
+  (not (flo:= value 0.)))
 
 ;;;; Indexed addressing modes
 
@@ -336,19 +266,6 @@ USA.
 		(MACHINE-CONSTANT (? value)))
   (QUALIFIER (and (rtl:float-offset-address? base)
 		  (rtl:simple-subexpressions? base)))
-  (if (zero? value)
-      (rtl:make-float-offset
-       (rtl:float-offset-address-base base)
-       (rtl:float-offset-address-offset base))
-      (rtl:make-float-offset base (rtl:make-machine-constant value))))
-
-(define-rule rewriting
-  (FLOAT-OFFSET (REGISTER (? base register-known-value))
-		(MACHINE-CONSTANT (? value)))
-  (QUALIFIER
-   (and (rtl:offset-address? base)
-	(rtl:simple-subexpressions? base)
-	(rtl:machine-constant? (rtl:offset-address-offset base))))   
   (rtl:make-float-offset base (rtl:make-machine-constant value)))
 
 ;; This is here to avoid generating things like
diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4
index d134362f5..52cf07995 100644
--- a/src/microcode/cmpauxmd/x86-64.m4
+++ b/src/microcode/cmpauxmd/x86-64.m4
@@ -170,7 +170,7 @@ ifdef(`SUPPRESS_LEADING_UNDERSCORE',
 ifdef(`WCC386R',
       `define(EFR,`$1_')',
       `define(EFR,`EPFR($1)')')
-
+
 define(hook_reference,`EFR(asm_$1)')
 
 define(define_data,`export_label(EVR($1))')
@@ -211,6 +211,10 @@ ifdef(`DASM',
       `define(allocate_space,`EVR($1) db $2 dup (0)')',
       `define(allocate_space,`EVR($1):
 	.space $2')')
+
+ifdef(`DASM',
+      `define(define_double,`EVR($1) dq $2')',
+      `define(define_double,`EVR($1):	.double $2')')
 
 ifdef(`DASM',
       `define(HEX, `0$1H')',
@@ -257,8 +261,8 @@ ifdef(`DASM',
       `define(LOF,`$1($2)')')
 
 ifdef(`DASM',
-      `define(DOF,`qword ptr $1[$2]')',
-      `define(DOF,`$1($2)')')
+      `define(QOF,`qword ptr $1[$2]')',
+      `define(QOF,`$1($2)')')
 
 ifdef(`DASM',
       `define(IDX,`dword ptr [$1] [$2]')',
@@ -290,11 +294,21 @@ define(TC_COMPILED_ENTRY,40)
 
 # TAG doesn't work due to m4 stupidity, so define these magic
 # constants here.  These are computed in terms of the parameters
-# above.
+# above, and ordered lexicographically.
 
+define(IMM_FALSE, `IMM(HEX(0000000000000000))')
+define(IMM_FIXNUM_0, `IMM(HEX(6800000000000000))')
+define(IMM_FLONUM_0, `IMM(HEX(1800000000000000))')
 define(IMM_MANIFEST_NM_VECTOR_1, `IMM(HEX(9c00000000000001))')
 define(IMM_TRUE, `IMM(HEX(2000000000000000))')
-define(IMM_FALSE, `IMM(HEX(0000000000000000))')
+
+# Flonums are represented by tagged pointers to the first of two
+# quadwords (sixteen bytes) in memory, the first of which is a
+# non-marked vector manifest of length 1, so that the GC will not
+# trace the other one, which is an IEEE 754 double-precision format
+# value.
+define(FLONUM_DATA_OFFSET,8)
+define(FLONUM_STORAGE_SIZE,16)
 
 define(REGBLOCK_VAL,16)
 define(REGBLOCK_COMPILER_TEMP,32)
@@ -333,6 +347,10 @@ allocate_quadword(C_Stack_Pointer)
 
 define_data(C_Frame_Pointer)
 allocate_quadword(C_Frame_Pointer)
+
+declare_alignment(8)
+define_double(flonum_zero,0.0)
+define_double(flonum_one,1.0)
 
 DECLARE_CODE_SEGMENT()
 declare_alignment(2)
@@ -421,12 +439,12 @@ ifdef(`WIN32',						# Register block = %rsi
 `	OP(mov,q)	TW(ABS(EVR(RegistersPtr)),regs)',
 `	OP(lea,q)	TW(ABS(EVR(Registers)),regs)')
 	OP(mov,q)	TW(ABS(EVR(Free)),rfree)	# Free pointer = %rdi
-	OP(mov,q)	TW(DOF(REGBLOCK_VAL(),regs),REG(rax)) # Value/dynamic link
+	OP(mov,q)	TW(QOF(REGBLOCK_VAL(),regs),REG(rax)) # Value/dynamic link
 	OP(mov,q)	TW(IMM(ADDRESS_MASK),rmask)	# = %rbp
 	OP(mov,q)	TW(ABS(EVR(stack_pointer)),REG(rsp))
 	OP(mov,q)	TW(REG(rax),REG(rcx))		# Preserve if used
 	OP(and,q)	TW(rmask,REG(rcx))		# Restore potential dynamic link
-	OP(mov,q)	TW(REG(rcx),DOF(REGBLOCK_DLINK(),regs))
+	OP(mov,q)	TW(REG(rcx),QOF(REGBLOCK_DLINK(),regs))
 	jmp		IJMP(REG(rdx))
 
 IF_WIN32(`
@@ -471,7 +489,7 @@ define_jump_indirection(interrupt_closure,18)
 define_jump_indirection(interrupt_continuation_2,3b)
 
 define_hook_label(interrupt_dlink)
-	OP(mov,q)	TW(DOF(REGBLOCK_DLINK(),regs),REG(rdx))
+	OP(mov,q)	TW(QOF(REGBLOCK_DLINK(),regs),REG(rdx))
 	OP(mov,b)	TW(IMM(HEX(19)),REG(al))
 	jmp	scheme_to_interface_call
 
@@ -551,21 +569,33 @@ asm_generic_fixnum_result:
 	OP(and,q)	TW(rmask,IND(REG(rsp)))
 	OP(or,b)	TW(IMM(TC_FIXNUM),REG(al))
 	OP(ror,q)	TW(IMM(TC_LENGTH),REG(rax))
-	OP(mov,q)	TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
+	OP(mov,q)	TW(REG(rax),QOF(REGBLOCK_VAL(),regs))
+	ret
+
+declare_alignment(2)
+asm_generic_flonum_result:
+	OP(and,q)	TW(rmask,IND(REG(rsp)))
+	OP(mov,q)	TW(IMM_MANIFEST_NM_VECTOR_1,REG(rcx))
+	OP(mov,q)	TW(REG(rcx),IND(rfree))
+	movsd		TW(REG(xmm0),QOF(FLONUM_DATA_OFFSET,rfree))
+	OP(mov,q)	TW(IMM_FLONUM_0,REG(rax))
+	OP(or,q)	TW(rfree,REG(rax))
+	OP(lea,q)	TW(QOF(FLONUM_STORAGE_SIZE,rfree),rfree)
+	OP(mov,q)	TW(REG(rax),QOF(REGBLOCK_VAL(),regs))
 	ret
 
 declare_alignment(2)
 asm_generic_return_sharp_t:
 	OP(and,q)	TW(rmask,IND(REG(rsp)))
 	OP(mov,q)	TW(IMM_TRUE,REG(rax))
-	OP(mov,q)	TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
+	OP(mov,q)	TW(REG(rax),QOF(REGBLOCK_VAL(),regs))
 	ret
 
 declare_alignment(2)
 asm_generic_return_sharp_f:
 	OP(and,q)	TW(rmask,IND(REG(rsp)))
 	OP(mov,q)	TW(IMM_FALSE,REG(rax))
-	OP(mov,q)	TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
+	OP(mov,q)	TW(REG(rax),QOF(REGBLOCK_VAL(),regs))
 	ret
 
 define(define_unary_operation,
@@ -575,8 +605,16 @@ define_hook_label(generic_$1)
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
+	je	asm_generic_$1_fix
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
 	jne	asm_generic_$1_fail
 
+asm_generic_$1_flo:
+	OP(and,q)	TW(rmask,REG(rdx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rdx)),REG(xmm0))
+	$4		TW(ABS(flonum_one),REG(xmm0))
+	jmp	asm_generic_flonum_result
+
 asm_generic_$1_fix:
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rax))
@@ -595,8 +633,17 @@ define_hook_label(generic_$1)
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
+	je	asm_generic_$1_fix
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
 	jne	asm_generic_$1_fail
 
+asm_generic_$1_flo:
+	OP(and,q)	TW(rmask,REG(rdx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rdx)),REG(xmm0))
+	ucomisd		TW(ABS(flonum_zero),REG(xmm0))
+	$3	asm_generic_return_sharp_t
+	jmp	asm_generic_return_sharp_f
+
 asm_generic_$1_fix:
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rax))
@@ -610,7 +657,7 @@ asm_generic_$1_fail:
 	jmp	scheme_to_interface')
 
 define(define_binary_operation,
-`define_binary_operation_with_fixup($1,$2,$3,
+`define_binary_operation_with_fixup($1,$2,$3,$4,
 	`OP(shl,q)	TW(IMM(TC_LENGTH),REG(rax))')')
 
 define(define_binary_operation_with_fixup,
@@ -625,12 +672,21 @@ define_hook_label(generic_$1)
 	OP(cmp,b)	TW(REG(al),REG(cl))
 	jne	asm_generic_$1_fail
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
+	je	asm_generic_$1_fix
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
 	jne	asm_generic_$1_fail
 
+asm_generic_$1_flo:
+	OP(and,q)	TW(rmask,REG(rdx))
+	OP(and,q)	TW(rmask,REG(rbx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rdx)),REG(xmm0))
+	$4		TW(QOF(FLONUM_DATA_OFFSET,REG(rbx)),REG(xmm0))
+	jmp	asm_generic_flonum_result
+
 asm_generic_$1_fix:
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(mov,q)	TW(REG(rbx),REG(rcx))
-	$4						# Set up rax.
+	$5						# Set up rax.
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rcx))
 	OP($3,q)	TW(REG(rcx),REG(rax))		# subq
 	jno	asm_generic_fixnum_result
@@ -653,8 +709,18 @@ define_hook_label(generic_$1)
 	OP(cmp,b)	TW(REG(al),REG(cl))
 	jne	asm_generic_$1_fail
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
+	je	asm_generic_$1_fix
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
 	jne	asm_generic_$1_fail
 
+asm_generic_$1_flo:
+	OP(and,q)	TW(rmask,REG(rdx))
+	OP(and,q)	TW(rmask,REG(rbx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rdx)),REG(xmm0))
+	ucomisd		TW(QOF(FLONUM_DATA_OFFSET,REG(rbx)),REG(xmm0))
+	$3	asm_generic_return_sharp_t
+	jmp	asm_generic_return_sharp_f
+
 asm_generic_$1_fix:
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rdx))
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rbx))
@@ -668,43 +734,122 @@ asm_generic_$1_fail:
 	OP(mov,b)	TW(IMM(HEX($2)),REG(al))
 	jmp	scheme_to_interface')
 
-#define_unary_operation(decrement,22,sub)
-#define_unary_operation(increment,26,add)
+# Division is hairy.  I'm not sure whether this will do the right
+# thing for infinities and NaNs.
+
+define_hook_label(generic_divide)
+	OP(pop,q)	REG(rdx)
+	OP(pop,q)	REG(rbx)
+	# We want to divide rdx by rbx.  First put the numerator's tag
+	# in al and the denominator's tag in cl.
+	OP(mov,q)	TW(REG(rdx),REG(rax))
+	OP(mov,q)	TW(REG(rbx),REG(rcx))
+	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))
+	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rcx))
+	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
+	je	asm_generic_divide_fix
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
+	jne	asm_generic_divide_fail
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(cl))
+	je	asm_generic_divide_flo_by_flo
+	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(cl))
+	jne	asm_generic_divide_fail
+
+asm_generic_divide_flo_by_fix:
+	# Numerator (rdx) is a flonum, denominator (rbx) is a fixnum.
+	OP(mov,q)	TW(REG(rbx),REG(rcx))
+	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rcx))
+	# Division by zero -- bail.
+	jz	asm_generic_divide_fail
+	OP(and,q)	TW(rmask,REG(rdx))
+	OP(sar,q)	TW(IMM(TC_LENGTH),REG(rcx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rdx)),REG(xmm0))
+	OP(cvtsi2sd,q)	TW(REG(rcx),REG(xmm1))
+	divsd		TW(REG(xmm1),REG(xmm0))
+	jmp	asm_generic_flonum_result
+
+asm_generic_divide_fix:
+	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(cl))
+	jne asm_generic_divide_fail
+
+asm_generic_divide_fix_by_flo:
+	# Numerator (rdx) is a fixnum, denominator (rbx) is a flonum.
+	OP(mov,q)	TW(REG(rbx),REG(rax))
+	OP(and,q)	TW(rmask,REG(rax))
+	OP(mov,q)	TW(REG(rdx),REG(rcx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rax)),REG(xmm1))
+	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rcx))
+	jz	asm_generic_divide_zero_by_flo
+	OP(sar,q)	TW(IMM(TC_LENGTH),REG(rcx))
+	OP(cvtsi2sd,q)	TW(REG(rcx),REG(xmm0))
+	divsd		TW(REG(xmm1),REG(xmm0))
+	jmp	asm_generic_flonum_result
+
+asm_generic_divide_zero_by_flo:
+	# rcx contains zero, representing a numerator exactly zero.
+	# Defer division of 0 by 0.0; otherwise, yield exactly zero.
+	OP(cvtsi2sd,q)	TW(REG(rcx),REG(xmm0))
+	ucomisd		TW(REG(xmm1),REG(xmm0))
+	je	asm_generic_divide_fail
+	OP(and,q)	TW(rmask,IND(REG(rsp)))
+	OP(mov,q)	TW(IMM_FIXNUM_0,REG(rax))
+	OP(mov,q)	TW(REG(rax),QOF(REGBLOCK_VAL(),regs))
+	ret
+
+asm_generic_divide_flo_by_flo:
+	# Numerator (rdx) and denominator (rbx) are both flonums.
+	OP(mov,q)	TW(REG(rdx),REG(rax))
+	OP(mov,q)	TW(REG(rbx),REG(rcx))
+	OP(and,q)	TW(rmask,REG(rax))
+	OP(and,q)	TW(rmask,REG(rcx))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rax)),REG(xmm0))
+	movsd		TW(QOF(FLONUM_DATA_OFFSET,REG(rcx)),REG(xmm1))
+	ucomisd		TW(ABS(flonum_zero),REG(xmm1))
+	je	asm_generic_divide_fail
+	divsd		TW(REG(xmm1),REG(xmm0))
+	jmp	asm_generic_flonum_result
+
+asm_generic_divide_fail:
+	OP(push,q)	REG(rbx)
+	OP(push,q)	REG(rdx)
+	OP(mov,b)	TW(IMM(HEX(23)),REG(al))
+	jmp	scheme_to_interface
+
+define_unary_operation(decrement,22,sub,subsd)
+define_unary_operation(increment,26,add,addsd)
 
-#define_unary_predicate(negative,2a,jl)
-#define_unary_predicate(positive,2c,jg)
-#define_unary_predicate(zero,2d,je)
+# define_unary_predicate(name,index,jcc)
+# define_unary_predicate(  $1,   $2, $3)
+define_unary_predicate(negative,2a,jl)
+define_unary_predicate(positive,2c,jg)
+define_unary_predicate(zero,2d,je)
 
-# define_binary_operation(name,index,op)
-# define_binary_operation(  $1,   $2,$3)
-#define_binary_operation(add,2b,add)
-#define_binary_operation(subtract,28,sub)
+# define_binary_operation(name,index,fxop,flop)
+# define_binary_operation(  $1,   $2,  $3,  $4)
+define_binary_operation(add,2b,add,addsd)
+define_binary_operation(subtract,28,sub,subsd)
 
 # No fixup -- leave it unshifted.
-#define_binary_operation_with_fixup(multiply,29,imul)
+define_binary_operation_with_fixup(multiply,29,imul,mulsd)
 
 # define_binary_predicate(name,index,jcc)
-#define_binary_predicate(equal,24,je)
-#define_binary_predicate(greater,25,jg)
-#define_binary_predicate(less,27,jl)
-
-# At the moment, there is no advantage to using the above code, and in
-# fact using it is a waste, since the compiler open-codes the fixnum
-# case already.  Later, the above code will also handle floating-point
-# arguments, which the compiler does not open-code.
-
-define_jump_indirection(generic_decrement,22)
-define_jump_indirection(generic_divide,23)
-define_jump_indirection(generic_equal,24)
-define_jump_indirection(generic_greater,25)
-define_jump_indirection(generic_increment,26)
-define_jump_indirection(generic_less,27)
-define_jump_indirection(generic_subtract,28)
-define_jump_indirection(generic_multiply,29)
-define_jump_indirection(generic_negative,2a)
-define_jump_indirection(generic_add,2b)
-define_jump_indirection(generic_positive,2c)
-define_jump_indirection(generic_zero,2d)
+# define_binary_predicate(  $1,   $2, $3)
+define_binary_predicate(equal,24,je)
+define_binary_predicate(greater,25,jg)
+define_binary_predicate(less,27,jl)
+
+#define_jump_indirection(generic_decrement,22)
+#define_jump_indirection(generic_divide,23)
+#define_jump_indirection(generic_equal,24)
+#define_jump_indirection(generic_greater,25)
+#define_jump_indirection(generic_increment,26)
+#define_jump_indirection(generic_less,27)
+#define_jump_indirection(generic_subtract,28)
+#define_jump_indirection(generic_multiply,29)
+#define_jump_indirection(generic_negative,2a)
+#define_jump_indirection(generic_add,2b)
+#define_jump_indirection(generic_positive,2c)
+#define_jump_indirection(generic_zero,2d)
 define_jump_indirection(generic_quotient,37)
 define_jump_indirection(generic_remainder,38)
 define_jump_indirection(generic_modulo,39)
-- 
2.25.1