From 1145cbc45354f5e2ca24f86ad525c92526419946 Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Thu, 5 Nov 2009 23:40:26 -0500
Subject: [PATCH] Simplify x86-64 utility interface, and nuke all x87 code from
 it.

The calling convention for microcode utilities now more closely
matches the standard calling convention for C on the AMD64, which
reduces the amount of code needed in scheme_to_interface.  The i386
hack to split the assembly hook addresses between negative addresses
and positive addresses is no more; most of what little benefit it
added on the i386, it ceased to add on x86-64, since no hooks fit in
positive byte offsets from the register block start any longer, and
only sixteen would fit in negative byte offsets.  Perhaps later it
would be worth deciding which sixteen deserve byte offsets (e.g.,
scheme-to-interface certainly would), but this is simpler for now.
Note that there is still a space advantage to invoking a hook (jump
to an offset from Registers) over invoking an interface (move its
number to AL, jump to scheme-to-interface), since invoking an
interface requires going through a hook anyway.

These change serves mainly to simplify the code, not to improve
performance, although shuffling the calling convention may improve
performance as a bonus.

The x87 code appears to have been causing problems, and will later
be replaced by the AMD64's 128-bit media (SSEn) instructions anyway.
The compiler still has some x87 vestiges left in it, but not in any
code that is hit, and nearly all of it is commented out now.

*** NOTE:  Since this changes the calling convention of utilities
from compiled code, it breaks all existing x86-64 .com files, which
you must recompile with the new compiler.  This means that you can't
straightforwardly rebuild Scheme the usual way from a prior x86-64
installation.  The easiest way to proceed is to start again from a
LIARC snapshot and rebootstrap the x86-64 code.  Expect more of this
to come; in particular, since we now have a reasonable number of
machine registers, it may be worthwhile to assign one to be the
return value register.
---
 src/compiler/machines/x86-64/lapgen.scm |  46 +-
 src/compiler/machines/x86-64/machin.scm |   1 -
 src/compiler/machines/x86-64/rules3.scm |  64 ++-
 src/compiler/machines/x86-64/rules4.scm |   9 +-
 src/compiler/machines/x86-64/rulflo.scm |   3 +
 src/microcode/cmpauxmd/x86-64.m4        | 580 ++++--------------------
 src/microcode/cmpintmd/x86-64.c         | 106 ++---
 src/microcode/cmpintmd/x86-64.h         | 115 +----
 8 files changed, 171 insertions(+), 753 deletions(-)

diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm
index 2d504ed61..a28795447 100644
--- a/src/compiler/machines/x86-64/lapgen.scm
+++ b/src/compiler/machines/x86-64/lapgen.scm
@@ -214,11 +214,11 @@ USA.
   ;; We don't have unsigned addressing modes.
   (byte-offset-reference register offset))
 
-;++ This computation is probably not quite right.
+;;; This returns an offset in objects, not bytes.
 
 (define-integrable (pseudo-register-offset register)
-  (+ (+ (* 16 address-units-per-object) (* 80 address-units-per-object))
-     (* 3 (register-renumber register))))
+  (+ (+ 16 80)				;Sixteen fixed, eighty hooks.
+     (register-renumber register)))
 
 (define-integrable (pseudo->machine-register source target)
   (memory->machine-register (pseudo-register-home source) target))
@@ -669,18 +669,10 @@ USA.
   (offset-reference regnum:regs-pointer
 		    register-block/environment-offset))
 
-(define reg:dynamic-link
-  (offset-reference regnum:regs-pointer
-		    register-block/dynamic-link-offset))
-
 (define reg:lexpr-primitive-arity
   (offset-reference regnum:regs-pointer
 		    register-block/lexpr-primitive-arity-offset))
 
-(define reg:utility-arg-4
-  (offset-reference regnum:regs-pointer
-		    register-block/utility-arg4-offset))
-
 (define reg:stack-guard
   (offset-reference regnum:regs-pointer
 		    register-block/stack-guard-offset))
@@ -726,34 +718,20 @@ USA.
   (LAP (MOV B (R ,rax) (& ,code))
        ,@(invoke-hook/call entry:compiler-scheme-to-interface/call)))
 
-;++ This uses a kludge to number entries by byte offsets from the
-;++ registers block, but that works only in the 32-bit i386 version;
-;++ for x86-64 version, all the entries' byte indices exceed the range
-;++ of signed bytes.  But this works for now.
-
 (define-syntax define-entries
   (sc-macro-transformer
    (lambda (form environment)
      environment
      `(BEGIN
-	,@(let loop
-	      ((names (cdddr form))
-	       (index (cadr form))
-	       (high (caddr form)))
+	,@(let loop ((names (cddr form)) (index (cadr form)))
 	    (if (pair? names)
-		(if (< index high)
-		    (cons `(DEFINE-INTEGRABLE
-			     ,(symbol-append 'ENTRY:COMPILER-
-					     (car names))
-			     (BYTE-OFFSET-REFERENCE REGNUM:REGS-POINTER
-						    ,index))
-			  (loop (cdr names) (+ index 8) high))
-		    (begin
-		      (warn "define-entries: Too many for byte offsets.")
-		      (loop names index (+ high 32000))))
+		(cons `(DEFINE-INTEGRABLE
+			   ,(symbol-append 'ENTRY:COMPILER- (car names))
+			 (BYTE-OFFSET-REFERENCE REGNUM:REGS-POINTER ,index))
+		      (loop (cdr names) (+ index 8)))
 		'()))))))
 
-(define-entries #x80 #x100		; (* 16 8)
+(define-entries #x80			; (* 16 8)
   scheme-to-interface			; Main entry point (only one necessary)
   scheme-to-interface/call		; Used by rules3&4, for convenience.
   trampoline-to-interface		; Used by trampolines, for convenience.
@@ -769,9 +747,6 @@ USA.
   link
   error
   primitive-error
-  short-primitive-apply)
-
-(define-entries #x-100 0
   &+
   &-
   &*
@@ -796,8 +771,7 @@ USA.
   shortcircuit-apply-size-6
   shortcircuit-apply-size-7
   shortcircuit-apply-size-8
-  interrupt-continuation-2
-  conditionally-serialize)
+  interrupt-continuation-2)
 
 ;; Operation tables
 
diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm
index f4a30ce61..c28e8c63f 100644
--- a/src/compiler/machines/x86-64/machin.scm
+++ b/src/compiler/machines/x86-64/machin.scm
@@ -248,7 +248,6 @@ USA.
 (define-integrable register-block/environment-offset 3)
 (define-integrable register-block/dynamic-link-offset 4) ; compiler temp
 (define-integrable register-block/lexpr-primitive-arity-offset 7)
-(define-integrable register-block/utility-arg4-offset 9) ; closure free
 (define-integrable register-block/stack-guard-offset 11)
 
 (define-integrable (fits-in-signed-byte? value)
diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm
index 1295ea959..608043ecb 100644
--- a/src/compiler/machines/x86-64/rules3.scm
+++ b/src/compiler/machines/x86-64/rules3.scm
@@ -64,7 +64,7 @@ USA.
   continuation
   (expect-no-exit-interrupt-checks)
   (LAP ,@(clear-map!)
-       (POP Q (R ,rcx))
+       (POP Q (R ,rbx))
        #|
        (MOV Q (R ,rdx) (&U ,frame-size))
        ,@(invoke-interface code:compiler-apply)
@@ -104,7 +104,7 @@ USA.
   continuation
   (expect-no-exit-interrupt-checks)
   (LAP ,@(clear-map!)
-       (LEA Q (R ,rcx) (@PCR ,label))
+       (LEA Q (R ,rbx) (@PCR ,label))
        (MOV Q (R ,rdx) (&U ,number-pushed))
        ,@(invoke-interface code:compiler-lexpr-apply)))
 
@@ -114,8 +114,8 @@ USA.
   ;; It expects the procedure at the top of the stack
   (expect-no-exit-interrupt-checks)
   (LAP ,@(clear-map!)
-       (POP Q (R ,rcx))
-       (AND Q (R ,rcx) (R ,regnum:datum-mask)) ; clear type code
+       (POP Q (R ,rbx))
+       (AND Q (R ,rbx) (R ,regnum:datum-mask)) ; clear type code
        (MOV Q (R ,rdx) (&U ,number-pushed))
        ,@(invoke-interface code:compiler-lexpr-apply)))
 
@@ -139,7 +139,7 @@ USA.
   continuation
   (expect-no-exit-interrupt-checks)
   (let* ((set-extension
-	  (interpreter-call-argument->machine-register! extension rcx))
+	  (interpreter-call-argument->machine-register! extension rbx))
 	 (set-address
 	  (begin (require-register! rdx)
 		 (load-pc-relative-address (INST-EA (R ,rdx))
@@ -148,7 +148,7 @@ USA.
     (LAP ,@set-extension
 	 ,@set-address
 	 ,@(clear-map!)
-	 (MOV Q (R ,rbx) (&U ,frame-size))
+	 (MOV Q (R ,rcx) (&U ,frame-size))
 	 ,@(invoke-interface code:compiler-cache-reference-apply))))
 
 (define-rule statement
@@ -157,13 +157,13 @@ USA.
   continuation
   (expect-no-entry-interrupt-checks)
   (let* ((set-environment
-	  (interpreter-call-argument->machine-register! environment rcx))
+	  (interpreter-call-argument->machine-register! environment rbx))
 	 (set-name (object->machine-register! name rdx)))
     (delete-dead-registers!)
     (LAP ,@set-environment
 	 ,@set-name
 	 ,@(clear-map!)
-	 (MOV Q (R ,rbx) (&U ,frame-size))
+	 (MOV Q (R ,rcx) (&U ,frame-size))
 	 ,@(invoke-interface code:compiler-lookup-apply))))
 
 (define-rule statement
@@ -171,9 +171,9 @@ USA.
   continuation				; ignored
   (if (eq? primitive compiled-error-procedure)
       (LAP ,@(clear-map!)
-	   (MOV Q (R ,rcx) (&U ,frame-size))
+	   (MOV Q (R ,rbx) (&U ,frame-size))
 	   ,@(invoke-hook entry:compiler-error))
-      (LAP ,@(object->machine-register! primitive rcx)
+      (LAP ,@(object->machine-register! primitive rbx)
 	   ,@(clear-map!)
 	   ,@(let ((arity (primitive-procedure-arity primitive)))
 	       (cond ((not (negative? arity))
@@ -621,11 +621,11 @@ USA.
 ;;; This is invoked by the top level of the LAP generator.
 
 (define (generate/quotation-header environment-label free-ref-label n-sections)
-  (LAP (MOV Q (R ,rcx) ,reg:environment)
-       (MOV Q (@PCR ,environment-label) (R ,rcx))
+  (LAP (MOV Q (R ,rax) ,reg:environment)
+       (MOV Q (@PCR ,environment-label) (R ,rax))
        (LEA Q (R ,rdx) (@PCR ,*block-label*))
-       (LEA Q (R ,rbx) (@PCR ,free-ref-label))
-       (MOV Q ,reg:utility-arg-4 (&U ,n-sections))
+       (LEA Q (R ,rcx) (@PCR ,free-ref-label))
+       (MOV Q (R ,r8) (&U ,n-sections))
        #|
        ,@(invoke-interface/call code:compiler-link)
        |#
@@ -639,10 +639,10 @@ USA.
 			      n-sections)
   (LAP (MOV Q (R ,rdx) (@PCR ,code-block-label))
        (AND Q (R ,rdx) (R ,regnum:datum-mask))
-       (LEA Q (R ,rbx) (@RO ,rdx ,free-ref-offset))
-       (MOV Q (R ,rcx) ,reg:environment)
-       (MOV Q (@RO ,rdx ,environment-offset) (R ,rcx))
-       (MOV Q ,reg:utility-arg-4 (&U ,n-sections))
+       (LEA Q (R ,rcx) (@RO ,rdx ,free-ref-offset))
+       (MOV Q (R ,rax) ,reg:environment)
+       (MOV Q (@RO ,rdx ,environment-offset) (R ,rax))
+       (MOV Q (R ,r8) (&U ,n-sections))
        #|
        ,@(invoke-interface/call code:compiler-link)
        |#
@@ -661,41 +661,39 @@ USA.
 	 (PUSH Q (& 0))
 	(LABEL ,loop)
 	 ;; Get index
-	 (MOV Q (R ,rcx) (@R ,rsp))
+	 (MOV Q (R ,rax) (@R ,rsp))
 	 ;; Get vector
 	 (MOV Q (R ,rdx) (@PCR ,vector-label))
 	 ;; Get n-sections for this cc-block
-	 (XOR Q (R ,rbx) (R ,rbx))
-	 (LEA Q (R ,rax) (@PCR ,bytes))
-	 (MOV B (R ,rbx) (@RI ,rax ,rcx 1))
+	 (XOR Q (R ,r8) (R ,r8))
+	 (LEA Q (R ,rcx) (@PCR ,bytes))
+	 (MOV B (R ,r8) (@RI ,rcx ,rax 1))
 	 ;; address of vector
 	 (AND Q (R ,rdx) (R ,regnum:datum-mask))
-	 ;; Store n-sections in arg
-	 (MOV Q ,reg:utility-arg-4 (R ,rbx))
 	 ;; vector-ref -> cc block
 	 (MOV Q
 	      (R ,rdx)
 	      (@ROI ,rdx ,address-units-per-object
-		    ,rcx ,address-units-per-object))
+		    ,rax ,address-units-per-object))
 	 ;; address of cc-block
 	 (AND Q (R ,rdx) (R ,regnum:datum-mask))
 	 ;; cc-block length
-	 (MOV Q (R ,rbx) (@R ,rdx))
+	 (MOV Q (R ,rcx) (@R ,rdx))
 	 ;; Get environment
-	 (MOV Q (R ,rcx) ,reg:environment)
+	 (MOV Q (R ,rax) ,reg:environment)
 	 ;; Eliminate length tags
-	 (AND Q (R ,rbx) (R ,regnum:datum-mask))
+	 (AND Q (R ,rcx) (R ,regnum:datum-mask))
 	 ;; Store environment
-	 (MOV Q (@RI ,rdx ,rbx ,address-units-per-object) (R ,rcx))
+	 (MOV Q (@RI ,rdx ,rcx ,address-units-per-object) (R ,rax))
 	 ;; Get NMV header
-	 (MOV Q (R ,rcx) (@RO ,rdx ,address-units-per-object))
+	 (MOV Q (R ,rax) (@RO ,rdx ,address-units-per-object))
 	 ;; Eliminate NMV tag
-	 (AND Q (R ,rcx) (R ,regnum:datum-mask))
+	 (AND Q (R ,rax) (R ,regnum:datum-mask))
 	 ;; Address of first free reference
 	 (LEA Q
-	      (R ,rbx)
+	      (R ,rcx)
 	      (@ROI ,rdx ,(* 2 address-units-per-object)
-		    ,rcx ,address-units-per-object))
+		    ,rax ,address-units-per-object))
 	 ;; Invoke linker
 	 ,@(invoke-hook/call entry:compiler-link)
 	 ,@(make-external-label (continuation-code-word false)
diff --git a/src/compiler/machines/x86-64/rules4.scm b/src/compiler/machines/x86-64/rules4.scm
index 785b55abc..1f7c06518 100644
--- a/src/compiler/machines/x86-64/rules4.scm
+++ b/src/compiler/machines/x86-64/rules4.scm
@@ -55,7 +55,7 @@ USA.
   cont					; ignored
   (let* ((set-extension
 	  (interpreter-call-argument->machine-register! extension rdx))
-	 (set-value (interpreter-call-argument->machine-register! value rbx)))
+	 (set-value (interpreter-call-argument->machine-register! value rcx)))
     (LAP ,@set-extension
 	 ,@set-value
 	 ,@(clear-map!)
@@ -110,7 +110,7 @@ USA.
 	  (interpreter-call-argument->machine-register! environment rdx)))
     (LAP ,@set-environment
 	 ,@(clear-map (clear-map!))
-	 ,@(load-constant (INST-EA (R ,rbx)) name)
+	 ,@(load-constant (INST-EA (R ,rcx)) name)
 	 ,@(invoke-interface/call code))))
 
 (define-rule statement
@@ -130,10 +130,9 @@ USA.
 (define (assignment-call code environment name value)
   (let* ((set-environment
 	  (interpreter-call-argument->machine-register! environment rdx))
-	 (set-value (interpreter-call-argument->machine-register! value rax)))
+	 (set-value (interpreter-call-argument->machine-register! value r8)))
     (LAP ,@set-environment
 	 ,@set-value
 	 ,@(clear-map!)
-	 (MOV Q ,reg:utility-arg-4 (R ,rax))
-	 ,@(load-constant (INST-EA (R ,rbx)) name)
+	 ,@(load-constant (INST-EA (R ,rcx)) name)
 	 ,@(invoke-interface/call code))))
\ No newline at end of file
diff --git a/src/compiler/machines/x86-64/rulflo.scm b/src/compiler/machines/x86-64/rulflo.scm
index 201a761e0..885624014 100644
--- a/src/compiler/machines/x86-64/rulflo.scm
+++ b/src/compiler/machines/x86-64/rulflo.scm
@@ -27,6 +27,8 @@ USA.
 ;;; package: (compiler lap-syntaxer)
 
 (declare (usual-integrations))
+
+#|
 
 ;; ****
 ;; Missing: 2 argument operations and predicates with non-trivial
@@ -826,3 +828,4 @@ USA.
   (if (flo:32-bit-representation-exact? fp-value)
       (generate-ea (single-flonum->label fp-value) 'S)
       (generate-ea (double-flonum->label fp-value) 'D)))
+|#
diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4
index 58d7faf23..d134362f5 100644
--- a/src/microcode/cmpauxmd/x86-64.m4
+++ b/src/microcode/cmpauxmd/x86-64.m4
@@ -71,7 +71,7 @@
 ###	them, they contain garbage.
 ###
 ### Compiled Scheme code uses the following register convention:
-###	- %rsp containts the Scheme stack pointer, not the C stack
+###	- %rsp contains the Scheme stack pointer, not the C stack
 ###	pointer.
 ###	- %rsi contains a pointer to the Scheme interpreter's "register"
 ###	block.  This block contains the compiler's copy of MemTop,
@@ -112,8 +112,6 @@
 ### TYPE_CODE_LENGTH
 ###	Normally defined to be 6.  Don't change this unless you know
 ###	what you're doing.
-### DISABLE_387
-###	If defined, do not generate 387 floating-point instructions.
 ### VALGRIND_MODE
 ###	If defined, modify code to make it work with valgrind.
 
@@ -123,14 +121,6 @@ ifdef(`WIN32',
       `define(IF_WIN32,`$1')',
       `define(IF_WIN32,`')')
 
-ifdef(`DISABLE_387',
-      `define(IF387,`')',
-      `define(IF387,`$1')')
-
-ifdef(`DISABLE_387',
-      `define(IFN387,`$1')',
-      `define(IFN387,`')')
-
 IF_WIN32(`define(DASM,1)')
 ifdef(`WCC386R',`define(WCC386,1)')
 
@@ -308,23 +298,7 @@ define(IMM_FALSE, `IMM(HEX(0000000000000000))')
 
 define(REGBLOCK_VAL,16)
 define(REGBLOCK_COMPILER_TEMP,32)
-define(REGBLOCK_LEXPR_ACTUALS,56)
-define(REGBLOCK_PRIMITIVE,64)
-define(REGBLOCK_CLOSURE_FREE,72)
-
 define(REGBLOCK_DLINK,REGBLOCK_COMPILER_TEMP)
-define(REGBLOCK_UTILITY_ARG4,REGBLOCK_CLOSURE_FREE)
-
-define(COMPILER_REGBLOCK_N_FIXED,16)
-define(COMPILER_REGBLOCK_N_HOOKS,80)
-define(COMPILER_REGBLOCK_N_TEMPS,256)
-define(COMPILER_FIXED_SIZE,1)
-define(COMPILER_HOOK_SIZE,1)
-define(COMPILER_TEMP_SIZE,2)
-define(REGBLOCK_SIZE_IN_OBJECTS,
-       eval((COMPILER_REGBLOCK_N_FIXED*COMPILER_FIXED_SIZE)
-	    +(COMPILER_REGBLOCK_N_HOOKS*COMPILER_HOOK_SIZE)
-	    +(COMPILER_REGBLOCK_N_TEMPS*COMPILER_TEMP_SIZE)))
 
 # Define the floating-point processor control word.  Always set
 # round-to-even and double precision.  Under Win32, mask all
@@ -351,187 +325,24 @@ use_external_data(EVR(utility_table))
 ifdef(`WIN32',`
 use_external_data(EVR(RegistersPtr))
 ',`
-define_data(Regstart)
-allocate_space(Regstart,256)
-
-define_data(Registers)
-allocate_space(Registers,eval(REGBLOCK_SIZE_IN_OBJECTS*8))
+use_external_data(EVR(Registers))
 ')
 
-define_data(i387_presence)
-allocate_quadword(i387_presence)
-
 define_data(C_Stack_Pointer)
 allocate_quadword(C_Stack_Pointer)
 
 define_data(C_Frame_Pointer)
 allocate_quadword(C_Frame_Pointer)
-
-# [TRC 20091025: CPUID is always supported.]
-# define_data(x86_64_cpuid_supported)
-# allocate_quadword(x86_64_cpuid_supported)
-
-# [TRC 20091025: The cache synchronization bug does not occur in any
-# x86-64 machines of which I am aware.]
-# define_data(x86_64_cpuid_needed)
-# allocate_quadword(x86_64_cpuid_needed)
 
 DECLARE_CODE_SEGMENT()
 declare_alignment(2)
 
-# [TRC 20091025: We need to check for MMX/SSEn instructions too.]
-
-define_c_label(x86_64_interface_initialize)
-	OP(push,q)	REG(rbp)
-	OP(mov,q)	TW(REG(rsp),REG(rbp))
-	OP(xor,q)	TW(REG(rax),REG(rax))		# No 387 available
-
-# [TRC 20091025: The AMD64 reference manual suggests using the CPUID
-# instruction to detect instruction subsets instead.]
-
-# Unfortunately, the `movl cr0,ecx' instruction is privileged.
-# Use the deprecated `smsw cx' instruction instead.
-
-IF387(`
-#	OP(mov,q)	TW(REG(cr0),REG(rcx))		# Test for 387 presence
-ifdef(`VALGRIND_MODE',`',`
-	smsw		REG(cx)
-	OP(mov,q)	TW(IMM(HEX(12)),REG(rdx))
-	OP(and,q)	TW(REG(rdx),REG(rcx))
-	OP(cmp,q)	TW(REG(rdx),REG(rcx))
-	jne	x86_64_initialize_no_fp
-')
-	OP(inc,q)	REG(rax)			# 387 available
-	OP(sub,q)	TW(IMM(8),REG(rsp))
-	fclex
-	fnstcw		WOF(-2,REG(rbp))
-	OP(and,w)	TW(IMM(HEX(f0e0)),WOF(-2,REG(rbp)))
-	OP(or,w)	TW(IMM(FP_CONTROL_WORD),WOF(-2,REG(rbp)))
-	fldcw		WOF(-2,REG(rbp))
-x86_64_initialize_no_fp:
-')
-	OP(mov,q)	TW(REG(rax),ABS(EVR(i387_presence)))
-
-# [TRC 20091025: CPUID is always supported.]
-
-# Do a bunch of hair to determine if we need to do cache synchronization.
-# See if the CPUID instruction is supported.
-
-#	OP(xor,q)	TW(REG(rax),REG(rax))
-#	OP(mov,q)	TW(REG(rax),ABS(EVR(x86_64_cpuid_supported)))
-#	OP(mov,q)	TW(REG(rax),ABS(EVR(x86_64_cpuid_needed)))
-
-# First test: can we toggle the AC bit?
-
-#	pushfd
-#	OP(pop,l)	REG(eax)
-#	OP(mov,l)	TW(REG(eax),REG(ecx))
-#	OP(xor,l)	TW(IMM(HEX(00040000)),REG(eax))
-#	OP(push,l)	REG(eax)
-#	popfd
-#	pushfd
-#	OP(pop,l)	REG(eax)
-
-# if AC bit can't be toggled, this is a 386 (and doesn't support CPUID).
-
-#	OP(xor,l)	TW(REG(ecx),REG(eax))
-#	jz		no_cpuid_instr
-#	OP(push,l)	REG(ecx)			# restore EFLAGS
-#	popfd
-
-# Now test to see if the ID bit can be toggled.
-
-#	OP(mov,l)	TW(REG(ecx),REG(eax))
-#	OP(xor,l)	TW(IMM(HEX(00200000)),REG(eax))
-#	OP(push,l)	REG(eax)
-#	popfd
-#	pushfd
-#	OP(pop,l)	REG(eax)
-
-# if ID bit can't be toggled, this is a 486 that doesn't support CPUID.
-
-#	OP(xor,l)	TW(REG(ecx),REG(eax))
-#	jz		no_cpuid_instr
-#	OP(push,l)	REG(ecx)			# restore EFLAGS
-#	popfd
-
-# Now we know that cpuid is supported.
-
-#	OP(mov,q)	TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_supported)))
-
-# Next, use the CPUID instruction to determine the processor type.
-
-#	OP(push,l)	REG(ebx)
-#	OP(xor,l)	TW(REG(eax),REG(eax))
-#	cpuid
-
-# Check that CPUID accepts argument 1.
-
-#	OP(cmp,l)	TW(IMM(HEX(00000001)),REG(eax))
-#	jl		done_setting_up_cpuid
-
-# Detect "GenuineIntel".
-
-#	OP(cmp,l)	TW(IMM(HEX(756e6547)),REG(ebx))
-#	jne		not_intel_cpu
-#	OP(cmp,l)	TW(IMM(HEX(49656e69)),REG(edx))
-#	jne		not_intel_cpu
-#	OP(cmp,l)	TW(IMM(HEX(6c65746e)),REG(ecx))
-#	jne		not_intel_cpu
-
-# For CPU families 4 (486), 5 (Pentium), or 6 (Pentium Pro, Pentium
-# II, Pentium III), don't use CPUID synchronization.
-
-#	OP(mov,l)	TW(IMM(HEX(01)),REG(eax))
-#	cpuid
-#	OP(shr,l)	TW(IMM(HEX(08)),REG(eax))
-#	OP(and,l)	TW(IMM(HEX(0000000F)),REG(eax))
-#	OP(cmp,l)	TW(IMM(HEX(4)),REG(eax))
-#	jl		done_setting_up_cpuid
-#	OP(cmp,l)	TW(IMM(HEX(6)),REG(eax))
-#	jg		done_setting_up_cpuid
+# C_to_interface passes control from C into Scheme.  To C it is a
+# unary procedure; its one argument is passed in rdi.  It saves the
+# state of the C world (the C frame pointer and stack pointer) and
+# then passes control to interface_to_scheme to set up the state of
+# the Scheme world.
 #
-#	jmp		cpuid_not_needed
-#
-#not_intel_cpu:
-
-# Detect "AuthenticAMD".
-
-#	OP(cmp,l)	TW(IMM(HEX(68747541)),REG(ebx))
-#	jne		not_amd_cpu
-#	OP(cmp,l)	TW(IMM(HEX(69746e65)),REG(edx))
-#	jne		not_amd_cpu
-#	OP(cmp,l)	TW(IMM(HEX(444d4163)),REG(ecx))
-#	jne		not_amd_cpu
-
-# Problem appears to exist only on Athlon models 1, 3, and 4.
-
-#	OP(mov,l)	TW(IMM(HEX(01)),REG(eax))
-#	cpuid
-
-#	OP(mov,l)	TW(REG(eax),REG(ecx))
-#	OP(shr,l)	TW(IMM(HEX(08)),REG(eax))
-#	OP(and,l)	TW(IMM(HEX(0000000F)),REG(eax))
-#	OP(cmp,l)	TW(IMM(HEX(6)),REG(eax))	# family 6 = Athlon
-#	jne		done_setting_up_cpuid
-
-#	OP(mov,l)	TW(REG(ecx),REG(eax))
-#	OP(shr,l)	TW(IMM(HEX(04)),REG(eax))
-#	OP(and,l)	TW(IMM(HEX(0000000F)),REG(eax))
-#	OP(cmp,l)	TW(IMM(HEX(6)),REG(eax))	# model 6 and up OK
-#	jge		done_setting_up_cpuid
-#	OP(cmp,l)	TW(IMM(HEX(2)),REG(eax))	# model 2 OK
-#	je		done_setting_up_cpuid
-
-#	OP(mov,l)	TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_needed)))
-
-#not_amd_cpu:
-#done_setting_up_cpuid:
-#	OP(pop,l)	REG(ebx)
-#no_cpuid_instr:
-	leave
-	ret
-
 # Note:  The AMD64 ABI mandates that on entry to a function, RSP - 8
 # must be a multiple of 0x10; that is, the stack must be 128-bit
 # aligned.  We push six quadwords onto the stack, but there is already
@@ -557,68 +368,46 @@ define_c_label(C_to_interface)
 
 define_hook_label(trampoline_to_interface)
 define_debugging_label(trampoline_to_interface)
-	OP(pop,q)	REG(rcx)			# trampoline storage
+	OP(pop,q)	REG(rbx)			# trampoline storage
 	jmp	scheme_to_interface
 
 define_hook_label(scheme_to_interface_call)
 define_debugging_label(scheme_to_interface_call)
-	OP(pop,q)	REG(rcx)			# arg1 = ret. add
-	OP(add,q)	TW(IMM(4),REG(rcx))		# Skip format info
+	OP(pop,q)	REG(rbx)			# arg1 = ret. add
+	OP(add,q)	TW(IMM(4),REG(rbx))		# Skip format info
 #	jmp	scheme_to_interface
 
+# scheme_to_interface passes control from compiled Scheme code to a
+# microcode utility.  The arguments for the utility go respectively in
+# rbx, rdx, rcx, and r8.  This loosely matches the AMD64 calling
+# convention, where arguments go respectively in rdi, rsi, rdx, rcx,
+# and r8.  The differences are that scheme_to_interface uses rdi as an
+# implicit first argument to the utility, and rsi is used in compiled
+# code for the registers block, since the compiler can't allocate it
+# as a general-purpose register because it doesn't admit byte-size
+# operations.  Moreover, Scheme uses rdi as the free pointer register,
+# which we have to save here in a location unknown to Scheme (the C
+# `Free' variable), so it can't be set by compiled code.
+
 define_hook_label(scheme_to_interface)
 define_debugging_label(scheme_to_interface)
-
-# These two moves must happen _before_ the ffree instructions below.
-# Otherwise recovery from SIGFPE there will fail.
 	OP(mov,q)	TW(REG(rsp),ABS(EVR(stack_pointer)))
 	OP(mov,q)	TW(rfree,ABS(EVR(Free)))
-
-# [TRC 20091025: I think this should be excised.]
-
-IF387(`
-	OP(cmp,q)	TW(IMM(0),ABS(EVR(i387_presence)))
-	je	scheme_to_interface_proceed
-	ffree	ST(0)					# Free floating "regs"
-	ffree	ST(1)
-	ffree	ST(2)
-	ffree	ST(3)
-	ffree	ST(4)
-	ffree	ST(5)
-	ffree	ST(6)
-	ffree	ST(7)
-scheme_to_interface_proceed:
-')
-
 	OP(mov,q)	TW(ABS(EVR(C_Stack_Pointer)),REG(rsp))
 	OP(mov,q)	TW(ABS(EVR(C_Frame_Pointer)),REG(rbp))
 
 	OP(sub,q)	TW(IMM(16),REG(rsp))	# alloc struct return
-
-	# Shuffle Scheme -> AMD64 calling conventions:
-	#   struct pointer -> rdi
-	#   rcx -> rsi
-	#   rdx -> rdx
-	#   rbx -> rcx
-	#   arg4 -> r8
-	# Parallel assignment problems:
-	#   arg4 depends on rsi: do arg4->r8 first
-	#   target depends on rcx (why?): use r11 as a temporary
-	# [TRC 20091025: Perhaps we can rearrange LIAR to generate
-	# arguments in the registers we want, to avoid this
-	# shuffling.]
-
-	OP(mov,q)	TW(REG(rcx),REG(r11))
-
-	OP(xor,q)	TW(REG(rcx),REG(rcx))
-	OP(mov,b)	TW(REG(al),REG(cl))
-	OP(lea,q)	TW(ABS(EVR(utility_table)),REG(rax))
-	OP(mov,q)	TW(SDX(,REG(rax),REG(rcx),8),REG(rax))
-
-	OP(mov,q)	TW(REG(rsp),REG(rdi))
-	OP(mov,q)	TW(DOF(REGBLOCK_UTILITY_ARG4(),regs),REG(r8))
-	OP(mov,q)	TW(REG(r11),REG(rsi))
-	OP(mov,q)	TW(REG(rbx),REG(rcx))
+	OP(mov,q)	TW(REG(rsp),REG(rdi))	# Structure is first argument.
+	OP(mov,q)	TW(REG(rbx),REG(rsi))	# rbx -> second argument.
+
+	# Find the utility.  rbx is now free as a temporary register
+	# to hold the utility table.  rax initially stores the utility
+	# number in its low eight bits and possibly garbage in the
+	# rest; mask it off and then use it as an index into the
+	# utility table, scaled by 8 (bytes per pointer).
+	OP(lea,q)	TW(ABS(EVR(utility_table)),REG(rbx))
+	OP(and,q)	TW(IMM(HEX(ff)),REG(rax))
+	OP(mov,q)	TW(SDX(,REG(rbx),REG(rax),8),REG(rax))
 
 	call		IJMP(REG(rax))
 
@@ -628,29 +417,12 @@ define_debugging_label(scheme_to_interface_return)
 	jmp		IJMP(REG(rax))		# Invoke handler
 
 define_c_label(interface_to_scheme)
-IF387(`
-	OP(cmp,q)	TW(IMM(0),ABS(EVR(i387_presence)))
-	je	interface_to_scheme_proceed
-	ffree	ST(0)					# Free floating "regs"
-	ffree	ST(1)
-	ffree	ST(2)
-	ffree	ST(3)
-	ffree	ST(4)
-	ffree	ST(5)
-	ffree	ST(6)
-	ffree	ST(7)
-interface_to_scheme_proceed:
-')
-							# Register block = %rsi
-							# Scheme offset in NT
-ifdef(`WIN32',
+ifdef(`WIN32',						# Register block = %rsi
 `	OP(mov,q)	TW(ABS(EVR(RegistersPtr)),regs)',
 `	OP(lea,q)	TW(ABS(EVR(Registers)),regs)')
-
 	OP(mov,q)	TW(ABS(EVR(Free)),rfree)	# Free pointer = %rdi
 	OP(mov,q)	TW(DOF(REGBLOCK_VAL(),regs),REG(rax)) # Value/dynamic link
 	OP(mov,q)	TW(IMM(ADDRESS_MASK),rmask)	# = %rbp
-
 	OP(mov,q)	TW(ABS(EVR(stack_pointer)),REG(rsp))
 	OP(mov,q)	TW(REG(rax),REG(rcx))		# Preserve if used
 	OP(and,q)	TW(rmask,REG(rcx))		# Restore potential dynamic link
@@ -665,19 +437,6 @@ define_code_label(EFR(callWinntExceptionTransferHook))
 ')
 
 define_c_label(interface_to_C)
-IF387(`
-	OP(cmp,q)	TW(IMM(0),ABS(EVR(i387_presence)))
-	je	interface_to_C_proceed
-	ffree	ST(0)					# Free floating "regs"
-	ffree	ST(1)
-	ffree	ST(2)
-	ffree	ST(3)
-	ffree	ST(4)
-	ffree	ST(5)
-	ffree	ST(6)
-	ffree	ST(7)
-interface_to_C_proceed:')
-
 	OP(mov,q)	TW(REG(rdx),REG(rax))		# Set up result
 	# We need a dummy register for the POP (which is three bytes
 	# shorter than ADD $8,RSP); since we're about to pop into r15
@@ -691,33 +450,6 @@ interface_to_C_proceed:')
 	leave
 	ret
 
-# [TRC 20091025: The cache synchronization bug does not occur in any
-# x86-64 machines of which I am aware.]
-
-#define_code_label(EFR(x86_64_cache_synchronize))
-#	OP(push,q)	REG(rbp)
-#	OP(mov,q)	TW(REG(rsp),REG(rbp))
-#	OP(push,q)	REG(rbx)
-#	OP(xor,q)	TW(REG(rax),REG(rax))
-#	cpuid
-#	OP(pop,q)	REG(rbx)
-#	leave
-#	ret
-
-### Run the CPUID instruction for serialization.
-
-#define_hook_label(serialize_cache)
-#	pushad
-#	OP(xor,q)	TW(REG(rax),REG(rax))
-#	cpuid
-#	popad
-#	ret
-
-### Stub to be used in place of above on machines that don't need it.
-
-#define_hook_label(dont_serialize_cache)
-#	ret
-
 ###	Assembly language hooks used to reduce code size.
 ###	There is no time advantage to using these over using
 ###	scheme_to_interface (or scheme_to_interface_call), but the
@@ -743,27 +475,6 @@ define_hook_label(interrupt_dlink)
 	OP(mov,b)	TW(IMM(HEX(19)),REG(al))
 	jmp	scheme_to_interface_call
 
-###
-###	This saves even more instructions than primitive_apply
-###	When the PC is not available.  Instead of jumping here,
-###	a call instruction is used, and the longword offset to
-###	the primitive object follows the call instruction.
-###	This code loads the primitive object and merges with
-###	apply_primitive
-###
-###     [TRC 20091025: But on the x86-64, we have RIP-relative
-###     addressing, so we don't need this.]
-###
-
-#declare_alignment(2)
-#define_hook_label(short_primitive_apply)
-#	OP(pop,l)	REG(edx)			# offset pointer
-#	OP(mov,l)	TW(IND(REG(edx)),REG(ecx))	# offset
-#							# Primitive object
-#	OP(mov,l)	TW(IDX(REG(edx),REG(ecx)),REG(ecx))
-#							# Merge
-#	jmp	hook_reference(primitive_apply)
-
 declare_alignment(2)
 define_jump_indirection(primitive_apply,12)
 
@@ -787,17 +498,16 @@ define_call_indirection(primitive_error,36)
 
 declare_alignment(2)
 define_hook_label(sc_apply)
-	OP(mov,q)	TW(REG(rcx),REG(rax))		# Copy for type code
-	OP(mov,q)	TW(REG(rcx),REG(rbx))		# Copy for address
+	OP(mov,q)	TW(REG(rbx),REG(rax))		# Copy for type code
+	OP(mov,q)	TW(REG(rbx),REG(rcx))		# Copy for address
 	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))	# Select type code
-	OP(and,q)	TW(rmask,REG(rbx))		# Select datum
+	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
 	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
 	jne	asm_sc_apply_generic
-	# [TRC 20091025: How big are the frame sizes?]
-	OP(movs,bq,x)	TW(BOF(-4,REG(rbx)),REG(rax))	# Extract frame size
+	OP(movs,bq,x)	TW(BOF(-4,REG(rcx)),REG(rax))	# Extract frame size
 	OP(cmp,q)	TW(REG(rax),REG(rdx))		# Compare to nargs+1
 	jne	asm_sc_apply_generic
-	jmp	IJMP(REG(rbx))				# Invoke
+	jmp	IJMP(REG(rcx))				# Invoke
 
 define_debugging_label(asm_sc_apply_generic)
 	OP(mov,q)	TW(IMM(HEX(14)),REG(rax))
@@ -806,16 +516,15 @@ define_debugging_label(asm_sc_apply_generic)
 define(define_apply_fixed_size,
 `declare_alignment(2)
 define_hook_label(sc_apply_size_$1)
-	OP(mov,q)	TW(REG(rcx),REG(rax))		# Copy for type code
-	OP(mov,q)	TW(REG(rcx),REG(rbx))		# Copy for address
+	OP(mov,q)	TW(REG(rbx),REG(rax))		# Copy for type code
+	OP(mov,q)	TW(REG(rbx),REG(rcx))		# Copy for address
 	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))	# Select type code
-	OP(and,q)	TW(rmask,REG(rbx))		# Select datum
+	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
 	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
 	jne	asm_sc_apply_generic_$1
-	# [TRC 20091025: How big are the frame sizes?]
-	OP(cmp,b)	TW(IMM($1),BOF(-4,REG(rbx)))	# Compare frame size
-	jne	asm_sc_apply_generic_$1	# to nargs+1
-	jmp	IJMP(REG(rbx))
+	OP(cmp,b)	TW(IMM($1),BOF(-4,REG(rcx)))	# Compare frame size
+	jne	asm_sc_apply_generic_$1			# to nargs+1
+	jmp	IJMP(REG(rcx))
 
 asm_sc_apply_generic_$1:
 	OP(mov,q)	TW(IMM($1),REG(rdx))
@@ -837,25 +546,6 @@ define_apply_fixed_size(8)
 ###	numeric types are much faster than the rare ones
 ###	(bignums, ratnums, recnums)
 
-IF387(`declare_alignment(2)
-asm_generic_flonum_result:
-	# The MOV instruction can take a 64-bit immediate operand only
-	# if the target is a register, so we store the manifest in rax
-	# before moving it to memory.
-	OP(mov,q)	TW(IMM_MANIFEST_NM_VECTOR_1,REG(rax))
-	OP(mov,q)	TW(REG(rax), IND(rfree))
-	# The OR instruction cannot take a 64-bit immediate either, so
-	# we need to store the tag in rax first, shift it up, and then
-	# OR the datum into it.
-	OP(mov,q)	TW(IMM(TC_FLONUM),REG(rax))
-	OP(shl,q)	TW(IMM(DATUM_LENGTH),REG(rax))
-	OP(or,q)	TW(rfree,REG(rax))
-	OP(fstp,l)	DOF(8,rfree)			# fstpd
-	OP(and,q)	TW(rmask,IND(REG(rsp)))
-	OP(add,q)	TW(IMM(16),rfree)
-	OP(mov,q)	TW(REG(rax),DOF(REGBLOCK_VAL(),regs))
-	ret
-
 declare_alignment(2)
 asm_generic_fixnum_result:
 	OP(and,q)	TW(rmask,IND(REG(rsp)))
@@ -876,7 +566,7 @@ asm_generic_return_sharp_f:
 	OP(and,q)	TW(rmask,IND(REG(rsp)))
 	OP(mov,q)	TW(IMM_FALSE,REG(rax))
 	OP(mov,q)	TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
-	ret')
+	ret
 
 define(define_unary_operation,
 `declare_alignment(2)
@@ -885,13 +575,7 @@ define_hook_label(generic_$1)
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
-	je	asm_generic_$1_fix
-	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
 	jne	asm_generic_$1_fail
-	OP(and,q)	TW(rmask,REG(rdx))
-	fld1
-	OP($4,l)	DOF(8,REG(rdx))
-	jmp	asm_generic_flonum_result
 
 asm_generic_$1_fix:
 	OP(mov,q)	TW(REG(rdx),REG(rax))
@@ -911,17 +595,7 @@ define_hook_label(generic_$1)
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
-	je	asm_generic_$1_fix
-	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
 	jne	asm_generic_$1_fail
-	OP(and,q)	TW(rmask,REG(rdx))
-	OP(fld,l)	DOF(8,REG(rdx))
-	ftst
-	fstsw	REG(ax)
-	fstp	ST(0)
-	sahf
-	$4	asm_generic_return_sharp_t
-	jmp	asm_generic_return_sharp_f
 
 asm_generic_$1_fix:
 	OP(mov,q)	TW(REG(rdx),REG(rax))
@@ -936,6 +610,10 @@ asm_generic_$1_fail:
 	jmp	scheme_to_interface')
 
 define(define_binary_operation,
+`define_binary_operation_with_fixup($1,$2,$3,
+	`OP(shl,q)	TW(IMM(TC_LENGTH),REG(rax))')')
+
+define(define_binary_operation_with_fixup,
 `declare_alignment(2)
 define_hook_label(generic_$1)
 	OP(pop,q)	REG(rdx)
@@ -947,93 +625,22 @@ define_hook_label(generic_$1)
 	OP(cmp,b)	TW(REG(al),REG(cl))
 	jne	asm_generic_$1_fail
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
-	je	asm_generic_$1_fix
-	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
-	je	asm_generic_$1_flo
-
-asm_generic_$1_fail:
-	OP(push,q)	REG(rbx)
-	OP(push,q)	REG(rdx)
-	OP(mov,b)	TW(IMM(HEX($2)),REG(al))
-	jmp	scheme_to_interface
+	jne	asm_generic_$1_fail
 
 asm_generic_$1_fix:
 	OP(mov,q)	TW(REG(rdx),REG(rax))
 	OP(mov,q)	TW(REG(rbx),REG(rcx))
-	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rax))
+	$4						# Set up rax.
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rcx))
-	$5
 	OP($3,q)	TW(REG(rcx),REG(rax))		# subq
-	jo	asm_generic_$1_fail
-	jmp	asm_generic_fixnum_result
-
-asm_generic_$1_flo:
-	OP(and,q)	TW(rmask,REG(rdx))
-	OP(and,q)	TW(rmask,REG(rbx))
-	OP(fld,l)	DOF(8,REG(rdx))			# fldd
-	OP($4,l)	DOF(8,REG(rbx))			# fsubl
-	jmp	asm_generic_flonum_result')
-
-IF387(`declare_alignment(2)
-define_hook_label(generic_divide)
-	OP(pop,q)	REG(rdx)
-	OP(pop,q)	REG(rbx)
-	OP(mov,q)	TW(REG(rdx),REG(rax))
-	OP(mov,q)	TW(REG(rbx),REG(rcx))
-	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))
-	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rcx))
-	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
-	je	asm_generic_divide_fix
-	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(al))
-	jne	asm_generic_divide_fail
-	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(cl))
-	je	asm_generic_divide_flo_flo
-	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(cl))
-	jne	asm_generic_divide_fail
-	OP(mov,q)	TW(REG(rbx),REG(rcx))
-	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rcx))
-	je	asm_generic_divide_fail
-	OP(and,q)	TW(rmask,REG(rdx))
-	OP(sar,q)	TW(IMM(TC_LENGTH),REG(rcx))
-	OP(fld,l)	DOF(8,REG(rdx))			# fldd
-	OP(mov,q)	TW(REG(rcx),IND(rfree))
-	OP(fidiv,l)	IND(rfree)
-	jmp	asm_generic_flonum_result
-
-asm_generic_divide_fix:
-	OP(cmp,b)	TW(IMM(TC_FLONUM),REG(cl))
-	jne	asm_generic_divide_fail
-	OP(mov,q)	TW(REG(rdx),REG(rcx))
-	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rcx))
-	je	asm_generic_divide_fail
-	OP(and,q)	TW(rmask,REG(rbx))
-	OP(sar,q)	TW(IMM(TC_LENGTH),REG(rcx))
-	OP(fld,l)	DOF(8,REG(rbx))			# fldd
-	OP(mov,q)	TW(REG(rcx),IND(rfree))
-	OP(fidivr,l)	IND(rfree)
-	jmp	asm_generic_flonum_result
-
-asm_generic_divide_flo_flo:
-	OP(mov,q)	TW(REG(rbx),REG(rcx))
-	OP(and,q)	TW(rmask,REG(rcx))
-	OP(fld,l)	DOF(8,REG(rcx))			# fldd
-	ftst
-	fstsw	REG(ax)
-	sahf
-	je	asm_generic_divide_by_zero
-	OP(and,q)	TW(rmask,REG(rdx))
-	OP(fdivr,l)	DOF(8,REG(rdx))
-	jmp	asm_generic_flonum_result	
-
-asm_generic_divide_by_zero:
-	fstp	ST(0)					# Pop second arg
-
-asm_generic_divide_fail:
+	jno	asm_generic_fixnum_result
+
+asm_generic_$1_fail:
 	OP(push,q)	REG(rbx)
 	OP(push,q)	REG(rdx)
-	OP(mov,b)	TW(IMM(HEX(23)),REG(al))
+	OP(mov,b)	TW(IMM(HEX($2)),REG(al))
 	jmp	scheme_to_interface')
-
+
 define(define_binary_predicate,
 `declare_alignment(2)
 define_hook_label(generic_$1)
@@ -1048,6 +655,7 @@ define_hook_label(generic_$1)
 	OP(cmp,b)	TW(IMM(TC_FIXNUM),REG(al))
 	jne	asm_generic_$1_fail
 
+asm_generic_$1_fix:
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rdx))
 	OP(shl,q)	TW(IMM(TC_LENGTH),REG(rbx))
 	OP(cmp,q)	TW(REG(rbx),REG(rdx))
@@ -1060,30 +668,32 @@ asm_generic_$1_fail:
 	OP(mov,b)	TW(IMM(HEX($2)),REG(al))
 	jmp	scheme_to_interface')
 
-IF387(`define_unary_operation(decrement,22,sub,fsubr)
-define_unary_operation(increment,26,add,fadd)
-
-define_unary_predicate(negative,2a,jl,jb)
-define_unary_predicate(positive,2c,jg,ja)
-define_unary_predicate(zero,2d,je,je)
-
-# define_binary_operation(name,index,fix*fix,flo*flo, fixup)
-# define_binary_operation(  $1,   $2,     $3,     $4, $5)
-# The fixup is optional; only multiplication needs it to shift the
-# result back down by six bits.
-define_binary_operation(add,2b,add,fadd)
-define_binary_operation(subtract,28,sub,fsub)
-define_binary_operation(multiply,29,imul,fmul,
-	`OP(shr,q)	TW(IMM(6),REG(rax))')
-# Divide needs to check for 0, so we cant really use the following
-# define_binary_operation(divide,23,NONE,fdiv)
-
-# define_binary_predicate(name,index,fix*fix,flo*flo)
-define_binary_predicate(equal,24,je,je)
-define_binary_predicate(greater,25,jg,ja)
-define_binary_predicate(less,27,jl,jb)')
-
-IFN387(`define_jump_indirection(generic_decrement,22)
+#define_unary_operation(decrement,22,sub)
+#define_unary_operation(increment,26,add)
+
+#define_unary_predicate(negative,2a,jl)
+#define_unary_predicate(positive,2c,jg)
+#define_unary_predicate(zero,2d,je)
+
+# define_binary_operation(name,index,op)
+# define_binary_operation(  $1,   $2,$3)
+#define_binary_operation(add,2b,add)
+#define_binary_operation(subtract,28,sub)
+
+# No fixup -- leave it unshifted.
+#define_binary_operation_with_fixup(multiply,29,imul)
+
+# define_binary_predicate(name,index,jcc)
+#define_binary_predicate(equal,24,je)
+#define_binary_predicate(greater,25,jg)
+#define_binary_predicate(less,27,jl)
+
+# At the moment, there is no advantage to using the above code, and in
+# fact using it is a waste, since the compiler open-codes the fixnum
+# case already.  Later, the above code will also handle floating-point
+# arguments, which the compiler does not open-code.
+
+define_jump_indirection(generic_decrement,22)
 define_jump_indirection(generic_divide,23)
 define_jump_indirection(generic_equal,24)
 define_jump_indirection(generic_greater,25)
@@ -1094,30 +704,10 @@ define_jump_indirection(generic_multiply,29)
 define_jump_indirection(generic_negative,2a)
 define_jump_indirection(generic_add,2b)
 define_jump_indirection(generic_positive,2c)
-define_jump_indirection(generic_zero,2d)')
-
-# These don't currently differ according to whether there
-# is a 387 or not.
-
+define_jump_indirection(generic_zero,2d)
 define_jump_indirection(generic_quotient,37)
 define_jump_indirection(generic_remainder,38)
 define_jump_indirection(generic_modulo,39)
-
-define_jump_indirection(nofp_decrement,22)
-define_jump_indirection(nofp_divide,23)
-define_jump_indirection(nofp_equal,24)
-define_jump_indirection(nofp_greater,25)
-define_jump_indirection(nofp_increment,26)
-define_jump_indirection(nofp_less,27)
-define_jump_indirection(nofp_subtract,28)
-define_jump_indirection(nofp_multiply,29)
-define_jump_indirection(nofp_negative,2a)
-define_jump_indirection(nofp_add,2b)
-define_jump_indirection(nofp_positive,2c)
-define_jump_indirection(nofp_zero,2d)
-define_jump_indirection(nofp_quotient,37)
-define_jump_indirection(nofp_remainder,38)
-define_jump_indirection(nofp_modulo,39)
 
 IFDASM(`end')
 
diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c
index a7cc7f0d4..3f364c144 100644
--- a/src/microcode/cmpintmd/x86-64.c
+++ b/src/microcode/cmpintmd/x86-64.c
@@ -198,10 +198,9 @@ store_trampoline_insns (insn_t * entry, byte_t code)
 {
   (*entry++) = 0xB0;		/* MOV AL,code */
   (*entry++) = code;
-  (*entry++) = 0xFF;		/* CALL /2 disp32(ESI) */
+  (*entry++) = 0xFF;		/* CALL /2 disp32(RSI) */
   (*entry++) = 0x96;
   (* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
-  X86_64_CACHE_SYNCHRONIZE ();
   return (false);
 }
 
@@ -223,7 +222,6 @@ x86_64_reset_hook (void)
 {
   int offset = (COMPILER_REGBLOCK_N_FIXED * (sizeof (SCHEME_OBJECT)));
   unsigned char * rsi_value = ((unsigned char *) Registers);
-  bool fp_support_present = (x86_64_interface_initialize ());
 
   /* These must match machines/x86-64/lapgen.scm */
 
@@ -249,78 +247,36 @@ x86_64_reset_hook (void)
   SETUP_REGISTER (asm_link);				/* 12 */
   SETUP_REGISTER (asm_error);				/* 13 */
   SETUP_REGISTER (asm_primitive_error);			/* 14 */
-  /* [TRC 20091025: This was an i386 hack for when the PC is not
-     available, which on x86-64 it always is. */
-  /* SETUP_REGISTER (asm_short_primitive_apply); */		/* 15 */
-
-  /* No more room for positive offsets without going to 32-bit
-     offsets!  */
-
-  /* This is a hack to make all the hooks be addressable with byte
-     offsets (instead of longword offsets).  The register block
-     extends to negative offsets as well, so all the following hooks
-     are accessed with negative offsets, and all fit in a byte.  */
-
-  /* [TRC 20091029: This hack doesn't work any longer; this code
-     should be cleaned up, since we must use longword offsets anyway.]
-     */
-
-  offset = -256;
-  if (fp_support_present)
-    {
-      SETUP_REGISTER (asm_generic_add);			/* -32 */
-      SETUP_REGISTER (asm_generic_subtract);		/* -31 */
-      SETUP_REGISTER (asm_generic_multiply);		/* -30 */
-      SETUP_REGISTER (asm_generic_divide);		/* -29 */
-      SETUP_REGISTER (asm_generic_equal);		/* -28 */
-      SETUP_REGISTER (asm_generic_less);		/* -27 */
-      SETUP_REGISTER (asm_generic_greater);		/* -26 */
-      SETUP_REGISTER (asm_generic_increment);		/* -25 */
-      SETUP_REGISTER (asm_generic_decrement);		/* -24 */
-      SETUP_REGISTER (asm_generic_zero);		/* -23 */
-      SETUP_REGISTER (asm_generic_positive);		/* -22 */
-      SETUP_REGISTER (asm_generic_negative);		/* -21 */
-      SETUP_REGISTER (asm_generic_quotient);		/* -20 */
-      SETUP_REGISTER (asm_generic_remainder);		/* -19 */
-      SETUP_REGISTER (asm_generic_modulo);		/* -18 */
-    }
-  else
-    {
-      SETUP_REGISTER (asm_nofp_add);			/* -32 */
-      SETUP_REGISTER (asm_nofp_subtract);		/* -31 */
-      SETUP_REGISTER (asm_nofp_multiply);		/* -30 */
-      SETUP_REGISTER (asm_nofp_divide);			/* -29 */
-      SETUP_REGISTER (asm_nofp_equal);			/* -28 */
-      SETUP_REGISTER (asm_nofp_less);			/* -27 */
-      SETUP_REGISTER (asm_nofp_greater);		/* -26 */
-      SETUP_REGISTER (asm_nofp_increment);		/* -25 */
-      SETUP_REGISTER (asm_nofp_decrement);		/* -24 */
-      SETUP_REGISTER (asm_nofp_zero);			/* -23 */
-      SETUP_REGISTER (asm_nofp_positive);		/* -22 */
-      SETUP_REGISTER (asm_nofp_negative);		/* -21 */
-      SETUP_REGISTER (asm_nofp_quotient);		/* -20 */
-      SETUP_REGISTER (asm_nofp_remainder);		/* -19 */
-      SETUP_REGISTER (asm_nofp_modulo);			/* -18 */
-    }
-
-  SETUP_REGISTER (asm_sc_apply);			/* -17 */
-  SETUP_REGISTER (asm_sc_apply_size_1);			/* -16 */
-  SETUP_REGISTER (asm_sc_apply_size_2);			/* -15 */
-  SETUP_REGISTER (asm_sc_apply_size_3);			/* -14 */
-  SETUP_REGISTER (asm_sc_apply_size_4);			/* -13 */
-  SETUP_REGISTER (asm_sc_apply_size_5);			/* -12 */
-  SETUP_REGISTER (asm_sc_apply_size_6);			/* -11 */
-  SETUP_REGISTER (asm_sc_apply_size_7);			/* -10 */
-  SETUP_REGISTER (asm_sc_apply_size_8);			/* -9 */
-  SETUP_REGISTER (asm_interrupt_continuation_2);	/* -8 */
-  /* [TRC 20091025: The cache synchronization bug does not occur in any
-      x86-64 machines of which I am aware.]
-
-  if (x86_64_cpuid_needed)
-    SETUP_REGISTER (asm_serialize_cache);		/\* -7 *\/
-  else
-    SETUP_REGISTER (asm_dont_serialize_cache);		/\* -7 *\/
-  */
+  SETUP_REGISTER (asm_generic_add);			/* 15 */
+  SETUP_REGISTER (asm_generic_subtract);		/* 16 */
+  SETUP_REGISTER (asm_generic_multiply);		/* 17 */
+  SETUP_REGISTER (asm_generic_divide);			/* 18 */
+  SETUP_REGISTER (asm_generic_equal);			/* 19 */
+  SETUP_REGISTER (asm_generic_less);			/* 20 */
+  SETUP_REGISTER (asm_generic_greater);			/* 21 */
+  SETUP_REGISTER (asm_generic_increment);		/* 22 */
+  SETUP_REGISTER (asm_generic_decrement);		/* 23 */
+  SETUP_REGISTER (asm_generic_zero);			/* 24 */
+  SETUP_REGISTER (asm_generic_positive);		/* 25 */
+  SETUP_REGISTER (asm_generic_negative);		/* 26 */
+  SETUP_REGISTER (asm_generic_quotient);		/* 27 */
+  SETUP_REGISTER (asm_generic_remainder);		/* 28 */
+  SETUP_REGISTER (asm_generic_modulo);			/* 29 */
+  SETUP_REGISTER (asm_sc_apply);			/* 30 */
+  SETUP_REGISTER (asm_sc_apply_size_1);			/* 31 */
+  SETUP_REGISTER (asm_sc_apply_size_2);			/* 32 */
+  SETUP_REGISTER (asm_sc_apply_size_3);			/* 33 */
+  SETUP_REGISTER (asm_sc_apply_size_4);			/* 34 */
+  SETUP_REGISTER (asm_sc_apply_size_5);			/* 35 */
+  SETUP_REGISTER (asm_sc_apply_size_6);			/* 36 */
+  SETUP_REGISTER (asm_sc_apply_size_7);			/* 37 */
+  SETUP_REGISTER (asm_sc_apply_size_8);			/* 38 */
+
+  /* Logically, this should be up by the other interrupt routines, but
+     I just wrote all those numbers above by hand and am too exhausted
+     by that gruelling effort to be inclined to go to the trouble to
+     renumber them now.  */
+  SETUP_REGISTER (asm_interrupt_continuation_2);	/* 39 */
 
 #ifdef _MACH_UNIX
   {
diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h
index 6076bfdf6..6b29d7db0 100644
--- a/src/microcode/cmpintmd/x86-64.h
+++ b/src/microcode/cmpintmd/x86-64.h
@@ -30,53 +30,6 @@ USA.
 
 /*
 
-Problems with the AMD x86-64 instruction set architecture
-====================================================
-
-1. Jumps are PC-relative.  There are absolute jumps, assuming the PC
-   is in a data location, or with immediate destinations that include
-   a segment descriptor (16 bits).  The short forms have a PC-relative
-   offset defined with respect to the immediately following
-   instruction.
-
-Problem: Closures and execute caches need their address in old space
-   in order to be relocated correctly.
-
-Fix:
-
-For execute caches we can define a new linker field, called
-load-relocation-address which on every GC/relocation stores the new
-address and the old contents into global variables and stores the new
-address in the field.  Alternatively the difference between the new
-address and the old contents can be stored into a single global
-variable, and this can be used, together with the new address of each
-cache, to find the old code.
-
-For closures the code that reads the header (manifest closure) can do
-the same.
-
-
-2. The CALL and JMP instructions do not accept 64-bit displacements.
-
-Problem: We want heaps bigger than 4 GB.
-
-Fix:
-
-Assemble more than one instruction for closure entry points, expanding
-them even more.  Yech.
-
-
-3. The stack pointer register (RSP) cannot be used as the base in
-   (base + displacement) addressing mode.
-
-Problem: Common operation in the compiler, which assumes direct access
-   to the stack.
-
-Fix: Use base + indexed mode, which allows specification of RSP as
-   base and nullification of the index (by using RSP again).  This is
-   one byte longer than otherwise, but...
-
-
 Register assignments
 ====================
 
@@ -92,10 +45,6 @@ RDI (7)		Free Pointer
 
 R8-R15		Unassigned
 
-The dynamic link and value "registers" are not processor registers.
-Slots in the register array must be reserved for them.
-[TRC 20091025: Later, we ought to use machine registers for these.]
-
 The Free Pointer is RDI because RDI is the implicit base register for
 the memory-to-memory move instructions, and the string store
 instruction.  Perhaps we can make use of it.
@@ -165,51 +114,17 @@ entry	0		MOV	AL,code		0xB0, code-byte
 	2		CALL	n(RSI)		0xFF 0x96 n-longword
 	8		<trampoline dependent storage>
 
-
-[TRC 20091027: The next two are wrong; need to update.]
-
-- GC & interrupt check at procedure/continuation entry:
-
-gc_lab	-7		CALL	n(RSI)		0xFF 0x56 n-byte
-	-4		<type/arity info>
-	-2		<gc offset>
-entry	0		CMP	RDI,(RSI)	0x48 0x39 0x3e
-	3		JAE	gc_lab		0x73 -12
-	5		<real code>
-
-
-- GC & interrupt check at closure entry:
-
-gc_lab	-11		ADD	(RSP),&offset	0x83 0x04 0x24 offset-byte
-  	-7		JMP	n(RSI)		0xFF 0x66 n-byte
-	-4		<type/arity info>
-	-2		<gc offset>
-entry	0		ADD	(RSP),&magic	0x81 0x04 0x24 magic-longword
-	7		CMP	RDI,(RSI)	0x39 0x3e
-	9		JAE	gc_lab		0x73 0xea (= -22)
-	11		<real code>
-
-The magic value depends on the closure because of canonicalization.
-
-The ADD instruction at offset -11 is not present for the 0th closure
-entry, since it is the canonical entry point.  Its format depends on
-the value of offset, since the sign-extending forms often suffice.
-
-offset = entry_number * entry_size
-magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction))
-
 */
 
 #define ASM_RESET_HOOK x86_64_reset_hook
-#define FPE_RESET_TRAPS x86_64_interface_initialize
 
 #define CMPINT_USE_STRUCS 1
 
 /* These next definitions must agree with "cmpauxmd/x86-64.m4", which is
    where the register block is allocated.  */
 #define COMPILER_REGBLOCK_N_FIXED 16
-/* Big enough to hold 80-bit floating-point value: */
-#define COMPILER_TEMP_SIZE 2
+/* Size in objects of the largest quantities that RTL registers can hold.  */
+#define COMPILER_TEMP_SIZE 1
 #define COMPILER_REGBLOCK_N_TEMPS 256
 #define COMPILER_REGBLOCK_N_HOOKS 80
 #define COMPILER_HOOK_SIZE 1
@@ -217,8 +132,6 @@ magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction))
 #define COMPILER_REGBLOCK_EXTRA_SIZE					\
   (COMPILER_REGBLOCK_N_HOOKS * COMPILER_HOOK_SIZE)
 
-#define REGBLOCK_ALLOCATED_BY_INTERFACE true
-
 typedef byte_t insn_t;
 
 /* Number of insn_t units preceding entry address in which header
@@ -228,8 +141,11 @@ typedef byte_t insn_t;
 #define CC_ENTRY_OFFSET_SIZE 2
 
 /* Number of insn_t units preceding entry header in which GC trap
-   instructions are stored.  */
-#define CC_ENTRY_GC_TRAP_SIZE 3
+   instructions are stored.  This is an approximation: it matches only
+   those non-closure procedures for which LIAR has generated interrupt
+   checks, in which case there is one CALL n(RSI), which is encoded as
+   #xff #x96 <n>, where n is a longword (32 bits).  */
+#define CC_ENTRY_GC_TRAP_SIZE 6
 
 #define EMBEDDED_CLOSURE_ADDRS_P 1
 
@@ -250,23 +166,6 @@ typedef byte_t insn_t;
 #define UUO_COUNT_TO_WORDS(nc) ((nc) * UUO_LINK_SIZE)
 
 #define READ_UUO_TARGET(a, r) read_uuo_target (a)
-
-#define FLUSH_I_CACHE() X86_64_CACHE_SYNCHRONIZE ()
-#define FLUSH_I_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE ()
-#define PUSH_D_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE ()
-
-/* [TRC 20091025: The cache synchronization bug does not occur in any
-    x86-64 machines of which I am aware.] */
-
-#define X86_64_CACHE_SYNCHRONIZE() do {} while (0)
-
-/*
-#define X86_64_CACHE_SYNCHRONIZE() do					\
-{									\
-  if (x86_64_cpuid_needed)						\
-    x86_64_cache_synchronize ();					\
-} while (false)
-*/
 
 #if defined(__WIN32__) && defined(__WATCOMC__)
 #  define ASM_ENTRY_POINT(name) (__cdecl name)
-- 
2.25.1