Simplify x86-64 utility interface, and nuke all x87 code from it.

author Taylor R Campbell <campbell@mumble.net>

Fri, 6 Nov 2009 04:40:26 +0000 (23:40 -0500)

committer Taylor R Campbell <campbell@mumble.net>

Fri, 6 Nov 2009 04:40:26 +0000 (23:40 -0500)
author Taylor R Campbell <campbell@mumble.net>
Fri, 6 Nov 2009 04:40:26 +0000 (23:40 -0500)
committer Taylor R Campbell <campbell@mumble.net>
Fri, 6 Nov 2009 04:40:26 +0000 (23:40 -0500)
diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm

index 2d504ed61b1bbb144cdc5613a9e8651ea240da26..a28795447f4246a1175b8cf79bcaef60515b1323 100644 (file)
--- a/src/compiler/machines/x86-64/lapgen.scm
+++ b/src/compiler/machines/x86-64/lapgen.scm
@@ -214,11 +214,11 @@ USA.
    ;; We don't have unsigned addressing modes.
    (byte-offset-reference register offset))
  \f
-;++ This computation is probably not quite right.
+;;; This returns an offset in objects, not bytes.
  
  (define-integrable (pseudo-register-offset register)
-  (+ (+ (* 16 address-units-per-object) (* 80 address-units-per-object))
-     (* 3 (register-renumber register))))
+  (+ (+ 16 80)                         ;Sixteen fixed, eighty hooks.
+     (register-renumber register)))
  
  (define-integrable (pseudo->machine-register source target)
    (memory->machine-register (pseudo-register-home source) target))
@@ -669,18 +669,10 @@ USA.
    (offset-reference regnum:regs-pointer
                     register-block/environment-offset))
  
-(define reg:dynamic-link
-  (offset-reference regnum:regs-pointer
-                   register-block/dynamic-link-offset))
-
  (define reg:lexpr-primitive-arity
    (offset-reference regnum:regs-pointer
                     register-block/lexpr-primitive-arity-offset))
  
-(define reg:utility-arg-4
-  (offset-reference regnum:regs-pointer
-                   register-block/utility-arg4-offset))
-
  (define reg:stack-guard
    (offset-reference regnum:regs-pointer
                     register-block/stack-guard-offset))
@@ -726,34 +718,20 @@ USA.
    (LAP (MOV B (R ,rax) (& ,code))
         ,@(invoke-hook/call entry:compiler-scheme-to-interface/call)))
  \f
-;++ This uses a kludge to number entries by byte offsets from the
-;++ registers block, but that works only in the 32-bit i386 version;
-;++ for x86-64 version, all the entries' byte indices exceed the range
-;++ of signed bytes.  But this works for now.
-
  (define-syntax define-entries
    (sc-macro-transformer
     (lambda (form environment)
       environment
       `(BEGIN
-       ,@(let loop
-             ((names (cdddr form))
-              (index (cadr form))
-              (high (caddr form)))
+       ,@(let loop ((names (cddr form)) (index (cadr form)))
             (if (pair? names)
-               (if (< index high)
-                   (cons `(DEFINE-INTEGRABLE
-                            ,(symbol-append 'ENTRY:COMPILER-
-                                            (car names))
-                            (BYTE-OFFSET-REFERENCE REGNUM:REGS-POINTER
-                                                   ,index))
-                         (loop (cdr names) (+ index 8) high))
-                   (begin
-                     (warn "define-entries: Too many for byte offsets.")
-                     (loop names index (+ high 32000))))
+               (cons `(DEFINE-INTEGRABLE
+                          ,(symbol-append 'ENTRY:COMPILER- (car names))
+                        (BYTE-OFFSET-REFERENCE REGNUM:REGS-POINTER ,index))
+                     (loop (cdr names) (+ index 8)))
                 '()))))))
  
-(define-entries #x80 #x100             ; (* 16 8)
+(define-entries #x80                   ; (* 16 8)
    scheme-to-interface                  ; Main entry point (only one necessary)
    scheme-to-interface/call             ; Used by rules3&4, for convenience.
    trampoline-to-interface              ; Used by trampolines, for convenience.
@@ -769,9 +747,6 @@ USA.
    link
    error
    primitive-error
-  short-primitive-apply)
-
-(define-entries #x-100 0
    &+
    &-
    &*
@@ -796,8 +771,7 @@ USA.
    shortcircuit-apply-size-6
    shortcircuit-apply-size-7
    shortcircuit-apply-size-8
-  interrupt-continuation-2
-  conditionally-serialize)
+  interrupt-continuation-2)
  \f
  ;; Operation tables
  
diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm

index f4a30ce612fd357fc69d0e808b22a2deb63c1ca0..c28e8c63f517529ae3e81c02de9bb4fd47a792af 100644 (file)
--- a/src/compiler/machines/x86-64/machin.scm
+++ b/src/compiler/machines/x86-64/machin.scm
@@ -248,7 +248,6 @@ USA.
  (define-integrable register-block/environment-offset 3)
  (define-integrable register-block/dynamic-link-offset 4) ; compiler temp
  (define-integrable register-block/lexpr-primitive-arity-offset 7)
-(define-integrable register-block/utility-arg4-offset 9) ; closure free
  (define-integrable register-block/stack-guard-offset 11)
  
  (define-integrable (fits-in-signed-byte? value)
diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm

index 1295ea959eea9749e72d41adcee388f3dd514982..608043ecb888dfa53a40d86e05f531271db71bb0 100644 (file)
--- a/src/compiler/machines/x86-64/rules3.scm
+++ b/src/compiler/machines/x86-64/rules3.scm
@@ -64,7 +64,7 @@ USA.
    continuation
    (expect-no-exit-interrupt-checks)
    (LAP ,@(clear-map!)
-       (POP Q (R ,rcx))
+       (POP Q (R ,rbx))
         #|
         (MOV Q (R ,rdx) (&U ,frame-size))
         ,@(invoke-interface code:compiler-apply)
@@ -104,7 +104,7 @@ USA.
    continuation
    (expect-no-exit-interrupt-checks)
    (LAP ,@(clear-map!)
-       (LEA Q (R ,rcx) (@PCR ,label))
+       (LEA Q (R ,rbx) (@PCR ,label))
         (MOV Q (R ,rdx) (&U ,number-pushed))
         ,@(invoke-interface code:compiler-lexpr-apply)))
  
@@ -114,8 +114,8 @@ USA.
    ;; It expects the procedure at the top of the stack
    (expect-no-exit-interrupt-checks)
    (LAP ,@(clear-map!)
-       (POP Q (R ,rcx))
-       (AND Q (R ,rcx) (R ,regnum:datum-mask)) ; clear type code
+       (POP Q (R ,rbx))
+       (AND Q (R ,rbx) (R ,regnum:datum-mask)) ; clear type code
         (MOV Q (R ,rdx) (&U ,number-pushed))
         ,@(invoke-interface code:compiler-lexpr-apply)))
  
@@ -139,7 +139,7 @@ USA.
    continuation
    (expect-no-exit-interrupt-checks)
    (let* ((set-extension
-         (interpreter-call-argument->machine-register! extension rcx))
+         (interpreter-call-argument->machine-register! extension rbx))
          (set-address
           (begin (require-register! rdx)
                  (load-pc-relative-address (INST-EA (R ,rdx))
@@ -148,7 +148,7 @@ USA.
      (LAP ,@set-extension
          ,@set-address
          ,@(clear-map!)
-        (MOV Q (R ,rbx) (&U ,frame-size))
+        (MOV Q (R ,rcx) (&U ,frame-size))
          ,@(invoke-interface code:compiler-cache-reference-apply))))
  
  (define-rule statement
@@ -157,13 +157,13 @@ USA.
    continuation
    (expect-no-entry-interrupt-checks)
    (let* ((set-environment
-         (interpreter-call-argument->machine-register! environment rcx))
+         (interpreter-call-argument->machine-register! environment rbx))
          (set-name (object->machine-register! name rdx)))
      (delete-dead-registers!)
      (LAP ,@set-environment
          ,@set-name
          ,@(clear-map!)
-        (MOV Q (R ,rbx) (&U ,frame-size))
+        (MOV Q (R ,rcx) (&U ,frame-size))
          ,@(invoke-interface code:compiler-lookup-apply))))
  \f
  (define-rule statement
@@ -171,9 +171,9 @@ USA.
    continuation                         ; ignored
    (if (eq? primitive compiled-error-procedure)
        (LAP ,@(clear-map!)
-          (MOV Q (R ,rcx) (&U ,frame-size))
+          (MOV Q (R ,rbx) (&U ,frame-size))
            ,@(invoke-hook entry:compiler-error))
-      (LAP ,@(object->machine-register! primitive rcx)
+      (LAP ,@(object->machine-register! primitive rbx)
            ,@(clear-map!)
            ,@(let ((arity (primitive-procedure-arity primitive)))
                (cond ((not (negative? arity))
@@ -621,11 +621,11 @@ USA.
  ;;; This is invoked by the top level of the LAP generator.
  
  (define (generate/quotation-header environment-label free-ref-label n-sections)
-  (LAP (MOV Q (R ,rcx) ,reg:environment)
-       (MOV Q (@PCR ,environment-label) (R ,rcx))
+  (LAP (MOV Q (R ,rax) ,reg:environment)
+       (MOV Q (@PCR ,environment-label) (R ,rax))
         (LEA Q (R ,rdx) (@PCR ,*block-label*))
-       (LEA Q (R ,rbx) (@PCR ,free-ref-label))
-       (MOV Q ,reg:utility-arg-4 (&U ,n-sections))
+       (LEA Q (R ,rcx) (@PCR ,free-ref-label))
+       (MOV Q (R ,r8) (&U ,n-sections))
         #|
         ,@(invoke-interface/call code:compiler-link)
         |#
@@ -639,10 +639,10 @@ USA.
                               n-sections)
    (LAP (MOV Q (R ,rdx) (@PCR ,code-block-label))
         (AND Q (R ,rdx) (R ,regnum:datum-mask))
-       (LEA Q (R ,rbx) (@RO ,rdx ,free-ref-offset))
-       (MOV Q (R ,rcx) ,reg:environment)
-       (MOV Q (@RO ,rdx ,environment-offset) (R ,rcx))
-       (MOV Q ,reg:utility-arg-4 (&U ,n-sections))
+       (LEA Q (R ,rcx) (@RO ,rdx ,free-ref-offset))
+       (MOV Q (R ,rax) ,reg:environment)
+       (MOV Q (@RO ,rdx ,environment-offset) (R ,rax))
+       (MOV Q (R ,r8) (&U ,n-sections))
         #|
         ,@(invoke-interface/call code:compiler-link)
         |#
@@ -661,41 +661,39 @@ USA.
          (PUSH Q (& 0))
         (LABEL ,loop)
          ;; Get index
-        (MOV Q (R ,rcx) (@R ,rsp))
+        (MOV Q (R ,rax) (@R ,rsp))
          ;; Get vector
          (MOV Q (R ,rdx) (@PCR ,vector-label))
          ;; Get n-sections for this cc-block
-        (XOR Q (R ,rbx) (R ,rbx))
-        (LEA Q (R ,rax) (@PCR ,bytes))
-        (MOV B (R ,rbx) (@RI ,rax ,rcx 1))
+        (XOR Q (R ,r8) (R ,r8))
+        (LEA Q (R ,rcx) (@PCR ,bytes))
+        (MOV B (R ,r8) (@RI ,rcx ,rax 1))
          ;; address of vector
          (AND Q (R ,rdx) (R ,regnum:datum-mask))
-        ;; Store n-sections in arg
-        (MOV Q ,reg:utility-arg-4 (R ,rbx))
          ;; vector-ref -> cc block
          (MOV Q
               (R ,rdx)
               (@ROI ,rdx ,address-units-per-object
-                   ,rcx ,address-units-per-object))
+                   ,rax ,address-units-per-object))
          ;; address of cc-block
          (AND Q (R ,rdx) (R ,regnum:datum-mask))
          ;; cc-block length
-        (MOV Q (R ,rbx) (@R ,rdx))
+        (MOV Q (R ,rcx) (@R ,rdx))
          ;; Get environment
-        (MOV Q (R ,rcx) ,reg:environment)
+        (MOV Q (R ,rax) ,reg:environment)
          ;; Eliminate length tags
-        (AND Q (R ,rbx) (R ,regnum:datum-mask))
+        (AND Q (R ,rcx) (R ,regnum:datum-mask))
          ;; Store environment
-        (MOV Q (@RI ,rdx ,rbx ,address-units-per-object) (R ,rcx))
+        (MOV Q (@RI ,rdx ,rcx ,address-units-per-object) (R ,rax))
          ;; Get NMV header
-        (MOV Q (R ,rcx) (@RO ,rdx ,address-units-per-object))
+        (MOV Q (R ,rax) (@RO ,rdx ,address-units-per-object))
          ;; Eliminate NMV tag
-        (AND Q (R ,rcx) (R ,regnum:datum-mask))
+        (AND Q (R ,rax) (R ,regnum:datum-mask))
          ;; Address of first free reference
          (LEA Q
-             (R ,rbx)
+             (R ,rcx)
               (@ROI ,rdx ,(* 2 address-units-per-object)
-                   ,rcx ,address-units-per-object))
+                   ,rax ,address-units-per-object))
          ;; Invoke linker
          ,@(invoke-hook/call entry:compiler-link)
          ,@(make-external-label (continuation-code-word false)
diff --git a/src/compiler/machines/x86-64/rules4.scm b/src/compiler/machines/x86-64/rules4.scm

index 785b55abc535491e5ba0a97ae8779b2059a089ea..1f7c06518e7454da184c2b2c8f6ab79850721547 100644 (file)
--- a/src/compiler/machines/x86-64/rules4.scm
+++ b/src/compiler/machines/x86-64/rules4.scm
@@ -55,7 +55,7 @@ USA.
    cont                                 ; ignored
    (let* ((set-extension
           (interpreter-call-argument->machine-register! extension rdx))
-        (set-value (interpreter-call-argument->machine-register! value rbx)))
+        (set-value (interpreter-call-argument->machine-register! value rcx)))
      (LAP ,@set-extension
          ,@set-value
          ,@(clear-map!)
@@ -110,7 +110,7 @@ USA.
           (interpreter-call-argument->machine-register! environment rdx)))
      (LAP ,@set-environment
          ,@(clear-map (clear-map!))
-        ,@(load-constant (INST-EA (R ,rbx)) name)
+        ,@(load-constant (INST-EA (R ,rcx)) name)
          ,@(invoke-interface/call code))))
  \f
  (define-rule statement
@@ -130,10 +130,9 @@ USA.
  (define (assignment-call code environment name value)
    (let* ((set-environment
           (interpreter-call-argument->machine-register! environment rdx))
-        (set-value (interpreter-call-argument->machine-register! value rax)))
+        (set-value (interpreter-call-argument->machine-register! value r8)))
      (LAP ,@set-environment
          ,@set-value
          ,@(clear-map!)
-        (MOV Q ,reg:utility-arg-4 (R ,rax))
-        ,@(load-constant (INST-EA (R ,rbx)) name)
+        ,@(load-constant (INST-EA (R ,rcx)) name)
          ,@(invoke-interface/call code))))
 \ No newline at end of file
diff --git a/src/compiler/machines/x86-64/rulflo.scm b/src/compiler/machines/x86-64/rulflo.scm

index 201a761e068d5a9ae35b8b0bf812a364640e9aa8..88562401484364f9e6574a967048c79d8e69953d 100644 (file)
--- a/src/compiler/machines/x86-64/rulflo.scm
+++ b/src/compiler/machines/x86-64/rulflo.scm
@@ -27,6 +27,8 @@ USA.
  ;;; package: (compiler lap-syntaxer)
  
  (declare (usual-integrations))
+
+#|
  \f
  ;; ****
  ;; Missing: 2 argument operations and predicates with non-trivial
@@ -826,3 +828,4 @@ USA.
    (if (flo:32-bit-representation-exact? fp-value)
        (generate-ea (single-flonum->label fp-value) 'S)
        (generate-ea (double-flonum->label fp-value) 'D)))
+|#
diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4

index 58d7faf23ae29655289720e72e530d943daf622c..d134362f5851f6815290d001aec3743c3ce43336 100644 (file)
--- a/src/microcode/cmpauxmd/x86-64.m4
+++ b/src/microcode/cmpauxmd/x86-64.m4
@@ -71,7 +71,7 @@
  ###    them, they contain garbage.
  ###
  ### Compiled Scheme code uses the following register convention:
-###    - %rsp containts the Scheme stack pointer, not the C stack
+###    - %rsp contains the Scheme stack pointer, not the C stack
  ###    pointer.
  ###    - %rsi contains a pointer to the Scheme interpreter's "register"
  ###    block.  This block contains the compiler's copy of MemTop,
@@ -112,8 +112,6 @@
  ### TYPE_CODE_LENGTH
  ###    Normally defined to be 6.  Don't change this unless you know
  ###    what you're doing.
-### DISABLE_387
-###    If defined, do not generate 387 floating-point instructions.
  ### VALGRIND_MODE
  ###    If defined, modify code to make it work with valgrind.
  \f
@@ -123,14 +121,6 @@ ifdef(`WIN32',
        `define(IF_WIN32,`$1')',
        `define(IF_WIN32,`')')
  
-ifdef(`DISABLE_387',
-      `define(IF387,`')',
-      `define(IF387,`$1')')
-
-ifdef(`DISABLE_387',
-      `define(IFN387,`$1')',
-      `define(IFN387,`')')
-
  IF_WIN32(`define(DASM,1)')
  ifdef(`WCC386R',`define(WCC386,1)')
  
@@ -308,23 +298,7 @@ define(IMM_FALSE, `IMM(HEX(0000000000000000))')
  
  define(REGBLOCK_VAL,16)
  define(REGBLOCK_COMPILER_TEMP,32)
-define(REGBLOCK_LEXPR_ACTUALS,56)
-define(REGBLOCK_PRIMITIVE,64)
-define(REGBLOCK_CLOSURE_FREE,72)
-
  define(REGBLOCK_DLINK,REGBLOCK_COMPILER_TEMP)
-define(REGBLOCK_UTILITY_ARG4,REGBLOCK_CLOSURE_FREE)
-
-define(COMPILER_REGBLOCK_N_FIXED,16)
-define(COMPILER_REGBLOCK_N_HOOKS,80)
-define(COMPILER_REGBLOCK_N_TEMPS,256)
-define(COMPILER_FIXED_SIZE,1)
-define(COMPILER_HOOK_SIZE,1)
-define(COMPILER_TEMP_SIZE,2)
-define(REGBLOCK_SIZE_IN_OBJECTS,
-       eval((COMPILER_REGBLOCK_N_FIXED*COMPILER_FIXED_SIZE)
-           +(COMPILER_REGBLOCK_N_HOOKS*COMPILER_HOOK_SIZE)
-           +(COMPILER_REGBLOCK_N_TEMPS*COMPILER_TEMP_SIZE)))
  
  # Define the floating-point processor control word.  Always set
  # round-to-even and double precision.  Under Win32, mask all
@@ -351,187 +325,24 @@ use_external_data(EVR(utility_table))
  ifdef(`WIN32',`
  use_external_data(EVR(RegistersPtr))
  ',`
-define_data(Regstart)
-allocate_space(Regstart,256)
-
-define_data(Registers)
-allocate_space(Registers,eval(REGBLOCK_SIZE_IN_OBJECTS*8))
+use_external_data(EVR(Registers))
  ')
  
-define_data(i387_presence)
-allocate_quadword(i387_presence)
-
  define_data(C_Stack_Pointer)
  allocate_quadword(C_Stack_Pointer)
  
  define_data(C_Frame_Pointer)
  allocate_quadword(C_Frame_Pointer)
-
-# [TRC 20091025: CPUID is always supported.]
-# define_data(x86_64_cpuid_supported)
-# allocate_quadword(x86_64_cpuid_supported)
-
-# [TRC 20091025: The cache synchronization bug does not occur in any
-# x86-64 machines of which I am aware.]
-# define_data(x86_64_cpuid_needed)
-# allocate_quadword(x86_64_cpuid_needed)
  \f
  DECLARE_CODE_SEGMENT()
  declare_alignment(2)
  
-# [TRC 20091025: We need to check for MMX/SSEn instructions too.]
-
-define_c_label(x86_64_interface_initialize)
-       OP(push,q)      REG(rbp)
-       OP(mov,q)       TW(REG(rsp),REG(rbp))
-       OP(xor,q)       TW(REG(rax),REG(rax))           # No 387 available
-
-# [TRC 20091025: The AMD64 reference manual suggests using the CPUID
-# instruction to detect instruction subsets instead.]
-
-# Unfortunately, the `movl cr0,ecx' instruction is privileged.
-# Use the deprecated `smsw cx' instruction instead.
-
-IF387(`
-#      OP(mov,q)       TW(REG(cr0),REG(rcx))           # Test for 387 presence
-ifdef(`VALGRIND_MODE',`',`
-       smsw            REG(cx)
-       OP(mov,q)       TW(IMM(HEX(12)),REG(rdx))
-       OP(and,q)       TW(REG(rdx),REG(rcx))
-       OP(cmp,q)       TW(REG(rdx),REG(rcx))
-       jne     x86_64_initialize_no_fp
-')
-       OP(inc,q)       REG(rax)                        # 387 available
-       OP(sub,q)       TW(IMM(8),REG(rsp))
-       fclex
-       fnstcw          WOF(-2,REG(rbp))
-       OP(and,w)       TW(IMM(HEX(f0e0)),WOF(-2,REG(rbp)))
-       OP(or,w)        TW(IMM(FP_CONTROL_WORD),WOF(-2,REG(rbp)))
-       fldcw           WOF(-2,REG(rbp))
-x86_64_initialize_no_fp:
-')
-       OP(mov,q)       TW(REG(rax),ABS(EVR(i387_presence)))
-
-# [TRC 20091025: CPUID is always supported.]
-
-# Do a bunch of hair to determine if we need to do cache synchronization.
-# See if the CPUID instruction is supported.
-
-#      OP(xor,q)       TW(REG(rax),REG(rax))
-#      OP(mov,q)       TW(REG(rax),ABS(EVR(x86_64_cpuid_supported)))
-#      OP(mov,q)       TW(REG(rax),ABS(EVR(x86_64_cpuid_needed)))
-
-# First test: can we toggle the AC bit?
-
-#      pushfd
-#      OP(pop,l)       REG(eax)
-#      OP(mov,l)       TW(REG(eax),REG(ecx))
-#      OP(xor,l)       TW(IMM(HEX(00040000)),REG(eax))
-#      OP(push,l)      REG(eax)
-#      popfd
-#      pushfd
-#      OP(pop,l)       REG(eax)
-
-# if AC bit can't be toggled, this is a 386 (and doesn't support CPUID).
-
-#      OP(xor,l)       TW(REG(ecx),REG(eax))
-#      jz              no_cpuid_instr
-#      OP(push,l)      REG(ecx)                        # restore EFLAGS
-#      popfd
-
-# Now test to see if the ID bit can be toggled.
-
-#      OP(mov,l)       TW(REG(ecx),REG(eax))
-#      OP(xor,l)       TW(IMM(HEX(00200000)),REG(eax))
-#      OP(push,l)      REG(eax)
-#      popfd
-#      pushfd
-#      OP(pop,l)       REG(eax)
-
-# if ID bit can't be toggled, this is a 486 that doesn't support CPUID.
-
-#      OP(xor,l)       TW(REG(ecx),REG(eax))
-#      jz              no_cpuid_instr
-#      OP(push,l)      REG(ecx)                        # restore EFLAGS
-#      popfd
-
-# Now we know that cpuid is supported.
-
-#      OP(mov,q)       TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_supported)))
-
-# Next, use the CPUID instruction to determine the processor type.
-
-#      OP(push,l)      REG(ebx)
-#      OP(xor,l)       TW(REG(eax),REG(eax))
-#      cpuid
-
-# Check that CPUID accepts argument 1.
-
-#      OP(cmp,l)       TW(IMM(HEX(00000001)),REG(eax))
-#      jl              done_setting_up_cpuid
-
-# Detect "GenuineIntel".
-
-#      OP(cmp,l)       TW(IMM(HEX(756e6547)),REG(ebx))
-#      jne             not_intel_cpu
-#      OP(cmp,l)       TW(IMM(HEX(49656e69)),REG(edx))
-#      jne             not_intel_cpu
-#      OP(cmp,l)       TW(IMM(HEX(6c65746e)),REG(ecx))
-#      jne             not_intel_cpu
-
-# For CPU families 4 (486), 5 (Pentium), or 6 (Pentium Pro, Pentium
-# II, Pentium III), don't use CPUID synchronization.
-
-#      OP(mov,l)       TW(IMM(HEX(01)),REG(eax))
-#      cpuid
-#      OP(shr,l)       TW(IMM(HEX(08)),REG(eax))
-#      OP(and,l)       TW(IMM(HEX(0000000F)),REG(eax))
-#      OP(cmp,l)       TW(IMM(HEX(4)),REG(eax))
-#      jl              done_setting_up_cpuid
-#      OP(cmp,l)       TW(IMM(HEX(6)),REG(eax))
-#      jg              done_setting_up_cpuid
+# C_to_interface passes control from C into Scheme.  To C it is a
+# unary procedure; its one argument is passed in rdi.  It saves the
+# state of the C world (the C frame pointer and stack pointer) and
+# then passes control to interface_to_scheme to set up the state of
+# the Scheme world.
  #
-#      jmp             cpuid_not_needed
-#
-#not_intel_cpu:
-
-# Detect "AuthenticAMD".
-
-#      OP(cmp,l)       TW(IMM(HEX(68747541)),REG(ebx))
-#      jne             not_amd_cpu
-#      OP(cmp,l)       TW(IMM(HEX(69746e65)),REG(edx))
-#      jne             not_amd_cpu
-#      OP(cmp,l)       TW(IMM(HEX(444d4163)),REG(ecx))
-#      jne             not_amd_cpu
-
-# Problem appears to exist only on Athlon models 1, 3, and 4.
-
-#      OP(mov,l)       TW(IMM(HEX(01)),REG(eax))
-#      cpuid
-
-#      OP(mov,l)       TW(REG(eax),REG(ecx))
-#      OP(shr,l)       TW(IMM(HEX(08)),REG(eax))
-#      OP(and,l)       TW(IMM(HEX(0000000F)),REG(eax))
-#      OP(cmp,l)       TW(IMM(HEX(6)),REG(eax))        # family 6 = Athlon
-#      jne             done_setting_up_cpuid
-
-#      OP(mov,l)       TW(REG(ecx),REG(eax))
-#      OP(shr,l)       TW(IMM(HEX(04)),REG(eax))
-#      OP(and,l)       TW(IMM(HEX(0000000F)),REG(eax))
-#      OP(cmp,l)       TW(IMM(HEX(6)),REG(eax))        # model 6 and up OK
-#      jge             done_setting_up_cpuid
-#      OP(cmp,l)       TW(IMM(HEX(2)),REG(eax))        # model 2 OK
-#      je              done_setting_up_cpuid
-
-#      OP(mov,l)       TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_needed)))
-
-#not_amd_cpu:
-#done_setting_up_cpuid:
-#      OP(pop,l)       REG(ebx)
-#no_cpuid_instr:
-       leave
-       ret
-
  # Note:  The AMD64 ABI mandates that on entry to a function, RSP - 8
  # must be a multiple of 0x10; that is, the stack must be 128-bit
  # aligned.  We push six quadwords onto the stack, but there is already
@@ -557,68 +368,46 @@ define_c_label(C_to_interface)
  
  define_hook_label(trampoline_to_interface)
  define_debugging_label(trampoline_to_interface)
-       OP(pop,q)       REG(rcx)                        # trampoline storage
+       OP(pop,q)       REG(rbx)                        # trampoline storage
         jmp     scheme_to_interface
  
  define_hook_label(scheme_to_interface_call)
  define_debugging_label(scheme_to_interface_call)
-       OP(pop,q)       REG(rcx)                        # arg1 = ret. add
-       OP(add,q)       TW(IMM(4),REG(rcx))             # Skip format info
+       OP(pop,q)       REG(rbx)                        # arg1 = ret. add
+       OP(add,q)       TW(IMM(4),REG(rbx))             # Skip format info
  #      jmp     scheme_to_interface
  \f
+# scheme_to_interface passes control from compiled Scheme code to a
+# microcode utility.  The arguments for the utility go respectively in
+# rbx, rdx, rcx, and r8.  This loosely matches the AMD64 calling
+# convention, where arguments go respectively in rdi, rsi, rdx, rcx,
+# and r8.  The differences are that scheme_to_interface uses rdi as an
+# implicit first argument to the utility, and rsi is used in compiled
+# code for the registers block, since the compiler can't allocate it
+# as a general-purpose register because it doesn't admit byte-size
+# operations.  Moreover, Scheme uses rdi as the free pointer register,
+# which we have to save here in a location unknown to Scheme (the C
+# `Free' variable), so it can't be set by compiled code.
+
  define_hook_label(scheme_to_interface)
  define_debugging_label(scheme_to_interface)
-
-# These two moves must happen _before_ the ffree instructions below.
-# Otherwise recovery from SIGFPE there will fail.
         OP(mov,q)       TW(REG(rsp),ABS(EVR(stack_pointer)))
         OP(mov,q)       TW(rfree,ABS(EVR(Free)))
-
-# [TRC 20091025: I think this should be excised.]
-
-IF387(`
-       OP(cmp,q)       TW(IMM(0),ABS(EVR(i387_presence)))
-       je      scheme_to_interface_proceed
-       ffree   ST(0)                                   # Free floating "regs"
-       ffree   ST(1)
-       ffree   ST(2)
-       ffree   ST(3)
-       ffree   ST(4)
-       ffree   ST(5)
-       ffree   ST(6)
-       ffree   ST(7)
-scheme_to_interface_proceed:
-')
-
         OP(mov,q)       TW(ABS(EVR(C_Stack_Pointer)),REG(rsp))
         OP(mov,q)       TW(ABS(EVR(C_Frame_Pointer)),REG(rbp))
  
         OP(sub,q)       TW(IMM(16),REG(rsp))    # alloc struct return
-
-       # Shuffle Scheme -> AMD64 calling conventions:
-       #   struct pointer -> rdi
-       #   rcx -> rsi
-       #   rdx -> rdx
-       #   rbx -> rcx
-       #   arg4 -> r8
-       # Parallel assignment problems:
-       #   arg4 depends on rsi: do arg4->r8 first
-       #   target depends on rcx (why?): use r11 as a temporary
-       # [TRC 20091025: Perhaps we can rearrange LIAR to generate
-       # arguments in the registers we want, to avoid this
-       # shuffling.]
-
-       OP(mov,q)       TW(REG(rcx),REG(r11))
-
-       OP(xor,q)       TW(REG(rcx),REG(rcx))
-       OP(mov,b)       TW(REG(al),REG(cl))
-       OP(lea,q)       TW(ABS(EVR(utility_table)),REG(rax))
-       OP(mov,q)       TW(SDX(,REG(rax),REG(rcx),8),REG(rax))
-
-       OP(mov,q)       TW(REG(rsp),REG(rdi))
-       OP(mov,q)       TW(DOF(REGBLOCK_UTILITY_ARG4(),regs),REG(r8))
-       OP(mov,q)       TW(REG(r11),REG(rsi))
-       OP(mov,q)       TW(REG(rbx),REG(rcx))
+       OP(mov,q)       TW(REG(rsp),REG(rdi))   # Structure is first argument.
+       OP(mov,q)       TW(REG(rbx),REG(rsi))   # rbx -> second argument.
+
+       # Find the utility.  rbx is now free as a temporary register
+       # to hold the utility table.  rax initially stores the utility
+       # number in its low eight bits and possibly garbage in the
+       # rest; mask it off and then use it as an index into the
+       # utility table, scaled by 8 (bytes per pointer).
+       OP(lea,q)       TW(ABS(EVR(utility_table)),REG(rbx))
+       OP(and,q)       TW(IMM(HEX(ff)),REG(rax))
+       OP(mov,q)       TW(SDX(,REG(rbx),REG(rax),8),REG(rax))
  
         call            IJMP(REG(rax))
  
@@ -628,29 +417,12 @@ define_debugging_label(scheme_to_interface_return)
         jmp             IJMP(REG(rax))          # Invoke handler
  
  define_c_label(interface_to_scheme)
-IF387(`
-       OP(cmp,q)       TW(IMM(0),ABS(EVR(i387_presence)))
-       je      interface_to_scheme_proceed
-       ffree   ST(0)                                   # Free floating "regs"
-       ffree   ST(1)
-       ffree   ST(2)
-       ffree   ST(3)
-       ffree   ST(4)
-       ffree   ST(5)
-       ffree   ST(6)
-       ffree   ST(7)
-interface_to_scheme_proceed:
-')
-                                                       # Register block = %rsi
-                                                       # Scheme offset in NT
-ifdef(`WIN32',
+ifdef(`WIN32',                                         # Register block = %rsi
  `      OP(mov,q)       TW(ABS(EVR(RegistersPtr)),regs)',
  `      OP(lea,q)       TW(ABS(EVR(Registers)),regs)')
-
         OP(mov,q)       TW(ABS(EVR(Free)),rfree)        # Free pointer = %rdi
         OP(mov,q)       TW(DOF(REGBLOCK_VAL(),regs),REG(rax)) # Value/dynamic link
         OP(mov,q)       TW(IMM(ADDRESS_MASK),rmask)     # = %rbp
-
         OP(mov,q)       TW(ABS(EVR(stack_pointer)),REG(rsp))
         OP(mov,q)       TW(REG(rax),REG(rcx))           # Preserve if used
         OP(and,q)       TW(rmask,REG(rcx))              # Restore potential dynamic link
@@ -665,19 +437,6 @@ define_code_label(EFR(callWinntExceptionTransferHook))
  ')
  
  define_c_label(interface_to_C)
-IF387(`
-       OP(cmp,q)       TW(IMM(0),ABS(EVR(i387_presence)))
-       je      interface_to_C_proceed
-       ffree   ST(0)                                   # Free floating "regs"
-       ffree   ST(1)
-       ffree   ST(2)
-       ffree   ST(3)
-       ffree   ST(4)
-       ffree   ST(5)
-       ffree   ST(6)
-       ffree   ST(7)
-interface_to_C_proceed:')
-
         OP(mov,q)       TW(REG(rdx),REG(rax))           # Set up result
         # We need a dummy register for the POP (which is three bytes
         # shorter than ADD $8,RSP); since we're about to pop into r15
@@ -691,33 +450,6 @@ interface_to_C_proceed:')
         leave
         ret
  \f
-# [TRC 20091025: The cache synchronization bug does not occur in any
-# x86-64 machines of which I am aware.]
-
-#define_code_label(EFR(x86_64_cache_synchronize))
-#      OP(push,q)      REG(rbp)
-#      OP(mov,q)       TW(REG(rsp),REG(rbp))
-#      OP(push,q)      REG(rbx)
-#      OP(xor,q)       TW(REG(rax),REG(rax))
-#      cpuid
-#      OP(pop,q)       REG(rbx)
-#      leave
-#      ret
-
-### Run the CPUID instruction for serialization.
-
-#define_hook_label(serialize_cache)
-#      pushad
-#      OP(xor,q)       TW(REG(rax),REG(rax))
-#      cpuid
-#      popad
-#      ret
-
-### Stub to be used in place of above on machines that don't need it.
-
-#define_hook_label(dont_serialize_cache)
-#      ret
-\f
  ###    Assembly language hooks used to reduce code size.
  ###    There is no time advantage to using these over using
  ###    scheme_to_interface (or scheme_to_interface_call), but the
@@ -743,27 +475,6 @@ define_hook_label(interrupt_dlink)
         OP(mov,b)       TW(IMM(HEX(19)),REG(al))
         jmp     scheme_to_interface_call
  
-###
-###    This saves even more instructions than primitive_apply
-###    When the PC is not available.  Instead of jumping here,
-###    a call instruction is used, and the longword offset to
-###    the primitive object follows the call instruction.
-###    This code loads the primitive object and merges with
-###    apply_primitive
-###
-###     [TRC 20091025: But on the x86-64, we have RIP-relative
-###     addressing, so we don't need this.]
-###
-
-#declare_alignment(2)
-#define_hook_label(short_primitive_apply)
-#      OP(pop,l)       REG(edx)                        # offset pointer
-#      OP(mov,l)       TW(IND(REG(edx)),REG(ecx))      # offset
-#                                                      # Primitive object
-#      OP(mov,l)       TW(IDX(REG(edx),REG(ecx)),REG(ecx))
-#                                                      # Merge
-#      jmp     hook_reference(primitive_apply)
-
  declare_alignment(2)
  define_jump_indirection(primitive_apply,12)
  
@@ -787,17 +498,16 @@ define_call_indirection(primitive_error,36)
  
  declare_alignment(2)
  define_hook_label(sc_apply)
-       OP(mov,q)       TW(REG(rcx),REG(rax))           # Copy for type code
-       OP(mov,q)       TW(REG(rcx),REG(rbx))           # Copy for address
+       OP(mov,q)       TW(REG(rbx),REG(rax))           # Copy for type code
+       OP(mov,q)       TW(REG(rbx),REG(rcx))           # Copy for address
         OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))  # Select type code
-       OP(and,q)       TW(rmask,REG(rbx))              # Select datum
+       OP(and,q)       TW(rmask,REG(rcx))              # Select datum
         OP(cmp,b)       TW(IMM(TC_COMPILED_ENTRY),REG(al))
         jne     asm_sc_apply_generic
-       # [TRC 20091025: How big are the frame sizes?]
-       OP(movs,bq,x)   TW(BOF(-4,REG(rbx)),REG(rax))   # Extract frame size
+       OP(movs,bq,x)   TW(BOF(-4,REG(rcx)),REG(rax))   # Extract frame size
         OP(cmp,q)       TW(REG(rax),REG(rdx))           # Compare to nargs+1
         jne     asm_sc_apply_generic
-       jmp     IJMP(REG(rbx))                          # Invoke
+       jmp     IJMP(REG(rcx))                          # Invoke
  
  define_debugging_label(asm_sc_apply_generic)
         OP(mov,q)       TW(IMM(HEX(14)),REG(rax))
@@ -806,16 +516,15 @@ define_debugging_label(asm_sc_apply_generic)
  define(define_apply_fixed_size,
  `declare_alignment(2)
  define_hook_label(sc_apply_size_$1)
-       OP(mov,q)       TW(REG(rcx),REG(rax))           # Copy for type code
-       OP(mov,q)       TW(REG(rcx),REG(rbx))           # Copy for address
+       OP(mov,q)       TW(REG(rbx),REG(rax))           # Copy for type code
+       OP(mov,q)       TW(REG(rbx),REG(rcx))           # Copy for address
         OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))  # Select type code
-       OP(and,q)       TW(rmask,REG(rbx))              # Select datum
+       OP(and,q)       TW(rmask,REG(rcx))              # Select datum
         OP(cmp,b)       TW(IMM(TC_COMPILED_ENTRY),REG(al))
         jne     asm_sc_apply_generic_$1
-       # [TRC 20091025: How big are the frame sizes?]
-       OP(cmp,b)       TW(IMM($1),BOF(-4,REG(rbx)))    # Compare frame size
-       jne     asm_sc_apply_generic_$1 # to nargs+1
-       jmp     IJMP(REG(rbx))
+       OP(cmp,b)       TW(IMM($1),BOF(-4,REG(rcx)))    # Compare frame size
+       jne     asm_sc_apply_generic_$1                 # to nargs+1
+       jmp     IJMP(REG(rcx))
  
  asm_sc_apply_generic_$1:
         OP(mov,q)       TW(IMM($1),REG(rdx))
@@ -837,25 +546,6 @@ define_apply_fixed_size(8)
  ###    numeric types are much faster than the rare ones
  ###    (bignums, ratnums, recnums)
  
-IF387(`declare_alignment(2)
-asm_generic_flonum_result:
-       # The MOV instruction can take a 64-bit immediate operand only
-       # if the target is a register, so we store the manifest in rax
-       # before moving it to memory.
-       OP(mov,q)       TW(IMM_MANIFEST_NM_VECTOR_1,REG(rax))
-       OP(mov,q)       TW(REG(rax), IND(rfree))
-       # The OR instruction cannot take a 64-bit immediate either, so
-       # we need to store the tag in rax first, shift it up, and then
-       # OR the datum into it.
-       OP(mov,q)       TW(IMM(TC_FLONUM),REG(rax))
-       OP(shl,q)       TW(IMM(DATUM_LENGTH),REG(rax))
-       OP(or,q)        TW(rfree,REG(rax))
-       OP(fstp,l)      DOF(8,rfree)                    # fstpd
-       OP(and,q)       TW(rmask,IND(REG(rsp)))
-       OP(add,q)       TW(IMM(16),rfree)
-       OP(mov,q)       TW(REG(rax),DOF(REGBLOCK_VAL(),regs))
-       ret
-
  declare_alignment(2)
  asm_generic_fixnum_result:
         OP(and,q)       TW(rmask,IND(REG(rsp)))
@@ -876,7 +566,7 @@ asm_generic_return_sharp_f:
         OP(and,q)       TW(rmask,IND(REG(rsp)))
         OP(mov,q)       TW(IMM_FALSE,REG(rax))
         OP(mov,q)       TW(REG(rax),LOF(REGBLOCK_VAL(),regs))
-       ret')
+       ret
  \f
  define(define_unary_operation,
  `declare_alignment(2)
@@ -885,13 +575,7 @@ define_hook_label(generic_$1)
         OP(mov,q)       TW(REG(rdx),REG(rax))
         OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
         OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
-       je      asm_generic_$1_fix
-       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
         jne     asm_generic_$1_fail
-       OP(and,q)       TW(rmask,REG(rdx))
-       fld1
-       OP($4,l)        DOF(8,REG(rdx))
-       jmp     asm_generic_flonum_result
  
  asm_generic_$1_fix:
         OP(mov,q)       TW(REG(rdx),REG(rax))
@@ -911,17 +595,7 @@ define_hook_label(generic_$1)
         OP(mov,q)       TW(REG(rdx),REG(rax))
         OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
         OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
-       je      asm_generic_$1_fix
-       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
         jne     asm_generic_$1_fail
-       OP(and,q)       TW(rmask,REG(rdx))
-       OP(fld,l)       DOF(8,REG(rdx))
-       ftst
-       fstsw   REG(ax)
-       fstp    ST(0)
-       sahf
-       $4      asm_generic_return_sharp_t
-       jmp     asm_generic_return_sharp_f
  
  asm_generic_$1_fix:
         OP(mov,q)       TW(REG(rdx),REG(rax))
@@ -936,6 +610,10 @@ asm_generic_$1_fail:
         jmp     scheme_to_interface')
  \f
  define(define_binary_operation,
+`define_binary_operation_with_fixup($1,$2,$3,
+       `OP(shl,q)      TW(IMM(TC_LENGTH),REG(rax))')')
+
+define(define_binary_operation_with_fixup,
  `declare_alignment(2)
  define_hook_label(generic_$1)
         OP(pop,q)       REG(rdx)
@@ -947,93 +625,22 @@ define_hook_label(generic_$1)
         OP(cmp,b)       TW(REG(al),REG(cl))
         jne     asm_generic_$1_fail
         OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
-       je      asm_generic_$1_fix
-       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
-       je      asm_generic_$1_flo
-
-asm_generic_$1_fail:
-       OP(push,q)      REG(rbx)
-       OP(push,q)      REG(rdx)
-       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
-       jmp     scheme_to_interface
+       jne     asm_generic_$1_fail
  
  asm_generic_$1_fix:
         OP(mov,q)       TW(REG(rdx),REG(rax))
         OP(mov,q)       TW(REG(rbx),REG(rcx))
-       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rax))
+       $4                                              # Set up rax.
         OP(shl,q)       TW(IMM(TC_LENGTH),REG(rcx))
-       $5
         OP($3,q)        TW(REG(rcx),REG(rax))           # subq
-       jo      asm_generic_$1_fail
-       jmp     asm_generic_fixnum_result
-
-asm_generic_$1_flo:
-       OP(and,q)       TW(rmask,REG(rdx))
-       OP(and,q)       TW(rmask,REG(rbx))
-       OP(fld,l)       DOF(8,REG(rdx))                 # fldd
-       OP($4,l)        DOF(8,REG(rbx))                 # fsubl
-       jmp     asm_generic_flonum_result')
-\f
-IF387(`declare_alignment(2)
-define_hook_label(generic_divide)
-       OP(pop,q)       REG(rdx)
-       OP(pop,q)       REG(rbx)
-       OP(mov,q)       TW(REG(rdx),REG(rax))
-       OP(mov,q)       TW(REG(rbx),REG(rcx))
-       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rax))
-       OP(shr,q)       TW(IMM(DATUM_LENGTH),REG(rcx))
-       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
-       je      asm_generic_divide_fix
-       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(al))
-       jne     asm_generic_divide_fail
-       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(cl))
-       je      asm_generic_divide_flo_flo
-       OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(cl))
-       jne     asm_generic_divide_fail
-       OP(mov,q)       TW(REG(rbx),REG(rcx))
-       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rcx))
-       je      asm_generic_divide_fail
-       OP(and,q)       TW(rmask,REG(rdx))
-       OP(sar,q)       TW(IMM(TC_LENGTH),REG(rcx))
-       OP(fld,l)       DOF(8,REG(rdx))                 # fldd
-       OP(mov,q)       TW(REG(rcx),IND(rfree))
-       OP(fidiv,l)     IND(rfree)
-       jmp     asm_generic_flonum_result
-
-asm_generic_divide_fix:
-       OP(cmp,b)       TW(IMM(TC_FLONUM),REG(cl))
-       jne     asm_generic_divide_fail
-       OP(mov,q)       TW(REG(rdx),REG(rcx))
-       OP(shl,q)       TW(IMM(TC_LENGTH),REG(rcx))
-       je      asm_generic_divide_fail
-       OP(and,q)       TW(rmask,REG(rbx))
-       OP(sar,q)       TW(IMM(TC_LENGTH),REG(rcx))
-       OP(fld,l)       DOF(8,REG(rbx))                 # fldd
-       OP(mov,q)       TW(REG(rcx),IND(rfree))
-       OP(fidivr,l)    IND(rfree)
-       jmp     asm_generic_flonum_result
-
-asm_generic_divide_flo_flo:
-       OP(mov,q)       TW(REG(rbx),REG(rcx))
-       OP(and,q)       TW(rmask,REG(rcx))
-       OP(fld,l)       DOF(8,REG(rcx))                 # fldd
-       ftst
-       fstsw   REG(ax)
-       sahf
-       je      asm_generic_divide_by_zero
-       OP(and,q)       TW(rmask,REG(rdx))
-       OP(fdivr,l)     DOF(8,REG(rdx))
-       jmp     asm_generic_flonum_result       
-
-asm_generic_divide_by_zero:
-       fstp    ST(0)                                   # Pop second arg
-
-asm_generic_divide_fail:
+       jno     asm_generic_fixnum_result
+
+asm_generic_$1_fail:
         OP(push,q)      REG(rbx)
         OP(push,q)      REG(rdx)
-       OP(mov,b)       TW(IMM(HEX(23)),REG(al))
+       OP(mov,b)       TW(IMM(HEX($2)),REG(al))
         jmp     scheme_to_interface')
-\f
+
  define(define_binary_predicate,
  `declare_alignment(2)
  define_hook_label(generic_$1)
@@ -1048,6 +655,7 @@ define_hook_label(generic_$1)
         OP(cmp,b)       TW(IMM(TC_FIXNUM),REG(al))
         jne     asm_generic_$1_fail
  
+asm_generic_$1_fix:
         OP(shl,q)       TW(IMM(TC_LENGTH),REG(rdx))
         OP(shl,q)       TW(IMM(TC_LENGTH),REG(rbx))
         OP(cmp,q)       TW(REG(rbx),REG(rdx))
@@ -1060,30 +668,32 @@ asm_generic_$1_fail:
         OP(mov,b)       TW(IMM(HEX($2)),REG(al))
         jmp     scheme_to_interface')
  \f
-IF387(`define_unary_operation(decrement,22,sub,fsubr)
-define_unary_operation(increment,26,add,fadd)
-
-define_unary_predicate(negative,2a,jl,jb)
-define_unary_predicate(positive,2c,jg,ja)
-define_unary_predicate(zero,2d,je,je)
-
-# define_binary_operation(name,index,fix*fix,flo*flo, fixup)
-# define_binary_operation(  $1,   $2,     $3,     $4, $5)
-# The fixup is optional; only multiplication needs it to shift the
-# result back down by six bits.
-define_binary_operation(add,2b,add,fadd)
-define_binary_operation(subtract,28,sub,fsub)
-define_binary_operation(multiply,29,imul,fmul,
-       `OP(shr,q)      TW(IMM(6),REG(rax))')
-# Divide needs to check for 0, so we cant really use the following
-# define_binary_operation(divide,23,NONE,fdiv)
-
-# define_binary_predicate(name,index,fix*fix,flo*flo)
-define_binary_predicate(equal,24,je,je)
-define_binary_predicate(greater,25,jg,ja)
-define_binary_predicate(less,27,jl,jb)')
-
-IFN387(`define_jump_indirection(generic_decrement,22)
+#define_unary_operation(decrement,22,sub)
+#define_unary_operation(increment,26,add)
+
+#define_unary_predicate(negative,2a,jl)
+#define_unary_predicate(positive,2c,jg)
+#define_unary_predicate(zero,2d,je)
+
+# define_binary_operation(name,index,op)
+# define_binary_operation(  $1,   $2,$3)
+#define_binary_operation(add,2b,add)
+#define_binary_operation(subtract,28,sub)
+
+# No fixup -- leave it unshifted.
+#define_binary_operation_with_fixup(multiply,29,imul)
+
+# define_binary_predicate(name,index,jcc)
+#define_binary_predicate(equal,24,je)
+#define_binary_predicate(greater,25,jg)
+#define_binary_predicate(less,27,jl)
+
+# At the moment, there is no advantage to using the above code, and in
+# fact using it is a waste, since the compiler open-codes the fixnum
+# case already.  Later, the above code will also handle floating-point
+# arguments, which the compiler does not open-code.
+
+define_jump_indirection(generic_decrement,22)
  define_jump_indirection(generic_divide,23)
  define_jump_indirection(generic_equal,24)
  define_jump_indirection(generic_greater,25)
@@ -1094,30 +704,10 @@ define_jump_indirection(generic_multiply,29)
  define_jump_indirection(generic_negative,2a)
  define_jump_indirection(generic_add,2b)
  define_jump_indirection(generic_positive,2c)
-define_jump_indirection(generic_zero,2d)')
-
-# These don't currently differ according to whether there
-# is a 387 or not.
-
+define_jump_indirection(generic_zero,2d)
  define_jump_indirection(generic_quotient,37)
  define_jump_indirection(generic_remainder,38)
  define_jump_indirection(generic_modulo,39)
-
-define_jump_indirection(nofp_decrement,22)
-define_jump_indirection(nofp_divide,23)
-define_jump_indirection(nofp_equal,24)
-define_jump_indirection(nofp_greater,25)
-define_jump_indirection(nofp_increment,26)
-define_jump_indirection(nofp_less,27)
-define_jump_indirection(nofp_subtract,28)
-define_jump_indirection(nofp_multiply,29)
-define_jump_indirection(nofp_negative,2a)
-define_jump_indirection(nofp_add,2b)
-define_jump_indirection(nofp_positive,2c)
-define_jump_indirection(nofp_zero,2d)
-define_jump_indirection(nofp_quotient,37)
-define_jump_indirection(nofp_remainder,38)
-define_jump_indirection(nofp_modulo,39)
  \f
  IFDASM(`end')
  
diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c

index a7cc7f0d4d2d610b5a858254f3b671e186d66e4a..3f364c1440476c7e3b0a140a89f4919075fa66ab 100644 (file)
--- a/src/microcode/cmpintmd/x86-64.c
+++ b/src/microcode/cmpintmd/x86-64.c
@@ -198,10 +198,9 @@ store_trampoline_insns (insn_t * entry, byte_t code)
  {
    (*entry++) = 0xB0;           /* MOV AL,code */
    (*entry++) = code;
-  (*entry++) = 0xFF;           /* CALL /2 disp32(ESI) */
+  (*entry++) = 0xFF;           /* CALL /2 disp32(RSI) */
    (*entry++) = 0x96;
    (* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
-  X86_64_CACHE_SYNCHRONIZE ();
    return (false);
  }
  \f
@@ -223,7 +222,6 @@ x86_64_reset_hook (void)
  {
    int offset = (COMPILER_REGBLOCK_N_FIXED * (sizeof (SCHEME_OBJECT)));
    unsigned char * rsi_value = ((unsigned char *) Registers);
-  bool fp_support_present = (x86_64_interface_initialize ());
  
    /* These must match machines/x86-64/lapgen.scm */
  
@@ -249,78 +247,36 @@ x86_64_reset_hook (void)
    SETUP_REGISTER (asm_link);                           /* 12 */
    SETUP_REGISTER (asm_error);                          /* 13 */
    SETUP_REGISTER (asm_primitive_error);                        /* 14 */
-  /* [TRC 20091025: This was an i386 hack for when the PC is not
-     available, which on x86-64 it always is. */
-  /* SETUP_REGISTER (asm_short_primitive_apply); */            /* 15 */
-
-  /* No more room for positive offsets without going to 32-bit
-     offsets!  */
-
-  /* This is a hack to make all the hooks be addressable with byte
-     offsets (instead of longword offsets).  The register block
-     extends to negative offsets as well, so all the following hooks
-     are accessed with negative offsets, and all fit in a byte.  */
-
-  /* [TRC 20091029: This hack doesn't work any longer; this code
-     should be cleaned up, since we must use longword offsets anyway.]
-     */
-
-  offset = -256;
-  if (fp_support_present)
-    {
-      SETUP_REGISTER (asm_generic_add);                        /* -32 */
-      SETUP_REGISTER (asm_generic_subtract);           /* -31 */
-      SETUP_REGISTER (asm_generic_multiply);           /* -30 */
-      SETUP_REGISTER (asm_generic_divide);             /* -29 */
-      SETUP_REGISTER (asm_generic_equal);              /* -28 */
-      SETUP_REGISTER (asm_generic_less);               /* -27 */
-      SETUP_REGISTER (asm_generic_greater);            /* -26 */
-      SETUP_REGISTER (asm_generic_increment);          /* -25 */
-      SETUP_REGISTER (asm_generic_decrement);          /* -24 */
-      SETUP_REGISTER (asm_generic_zero);               /* -23 */
-      SETUP_REGISTER (asm_generic_positive);           /* -22 */
-      SETUP_REGISTER (asm_generic_negative);           /* -21 */
-      SETUP_REGISTER (asm_generic_quotient);           /* -20 */
-      SETUP_REGISTER (asm_generic_remainder);          /* -19 */
-      SETUP_REGISTER (asm_generic_modulo);             /* -18 */
-    }
-  else
-    {
-      SETUP_REGISTER (asm_nofp_add);                   /* -32 */
-      SETUP_REGISTER (asm_nofp_subtract);              /* -31 */
-      SETUP_REGISTER (asm_nofp_multiply);              /* -30 */
-      SETUP_REGISTER (asm_nofp_divide);                        /* -29 */
-      SETUP_REGISTER (asm_nofp_equal);                 /* -28 */
-      SETUP_REGISTER (asm_nofp_less);                  /* -27 */
-      SETUP_REGISTER (asm_nofp_greater);               /* -26 */
-      SETUP_REGISTER (asm_nofp_increment);             /* -25 */
-      SETUP_REGISTER (asm_nofp_decrement);             /* -24 */
-      SETUP_REGISTER (asm_nofp_zero);                  /* -23 */
-      SETUP_REGISTER (asm_nofp_positive);              /* -22 */
-      SETUP_REGISTER (asm_nofp_negative);              /* -21 */
-      SETUP_REGISTER (asm_nofp_quotient);              /* -20 */
-      SETUP_REGISTER (asm_nofp_remainder);             /* -19 */
-      SETUP_REGISTER (asm_nofp_modulo);                        /* -18 */
-    }
-
-  SETUP_REGISTER (asm_sc_apply);                       /* -17 */
-  SETUP_REGISTER (asm_sc_apply_size_1);                        /* -16 */
-  SETUP_REGISTER (asm_sc_apply_size_2);                        /* -15 */
-  SETUP_REGISTER (asm_sc_apply_size_3);                        /* -14 */
-  SETUP_REGISTER (asm_sc_apply_size_4);                        /* -13 */
-  SETUP_REGISTER (asm_sc_apply_size_5);                        /* -12 */
-  SETUP_REGISTER (asm_sc_apply_size_6);                        /* -11 */
-  SETUP_REGISTER (asm_sc_apply_size_7);                        /* -10 */
-  SETUP_REGISTER (asm_sc_apply_size_8);                        /* -9 */
-  SETUP_REGISTER (asm_interrupt_continuation_2);       /* -8 */
-  /* [TRC 20091025: The cache synchronization bug does not occur in any
-      x86-64 machines of which I am aware.]
-
-  if (x86_64_cpuid_needed)
-    SETUP_REGISTER (asm_serialize_cache);              /\* -7 *\/
-  else
-    SETUP_REGISTER (asm_dont_serialize_cache);         /\* -7 *\/
-  */
+  SETUP_REGISTER (asm_generic_add);                    /* 15 */
+  SETUP_REGISTER (asm_generic_subtract);               /* 16 */
+  SETUP_REGISTER (asm_generic_multiply);               /* 17 */
+  SETUP_REGISTER (asm_generic_divide);                 /* 18 */
+  SETUP_REGISTER (asm_generic_equal);                  /* 19 */
+  SETUP_REGISTER (asm_generic_less);                   /* 20 */
+  SETUP_REGISTER (asm_generic_greater);                        /* 21 */
+  SETUP_REGISTER (asm_generic_increment);              /* 22 */
+  SETUP_REGISTER (asm_generic_decrement);              /* 23 */
+  SETUP_REGISTER (asm_generic_zero);                   /* 24 */
+  SETUP_REGISTER (asm_generic_positive);               /* 25 */
+  SETUP_REGISTER (asm_generic_negative);               /* 26 */
+  SETUP_REGISTER (asm_generic_quotient);               /* 27 */
+  SETUP_REGISTER (asm_generic_remainder);              /* 28 */
+  SETUP_REGISTER (asm_generic_modulo);                 /* 29 */
+  SETUP_REGISTER (asm_sc_apply);                       /* 30 */
+  SETUP_REGISTER (asm_sc_apply_size_1);                        /* 31 */
+  SETUP_REGISTER (asm_sc_apply_size_2);                        /* 32 */
+  SETUP_REGISTER (asm_sc_apply_size_3);                        /* 33 */
+  SETUP_REGISTER (asm_sc_apply_size_4);                        /* 34 */
+  SETUP_REGISTER (asm_sc_apply_size_5);                        /* 35 */
+  SETUP_REGISTER (asm_sc_apply_size_6);                        /* 36 */
+  SETUP_REGISTER (asm_sc_apply_size_7);                        /* 37 */
+  SETUP_REGISTER (asm_sc_apply_size_8);                        /* 38 */
+
+  /* Logically, this should be up by the other interrupt routines, but
+     I just wrote all those numbers above by hand and am too exhausted
+     by that gruelling effort to be inclined to go to the trouble to
+     renumber them now.  */
+  SETUP_REGISTER (asm_interrupt_continuation_2);       /* 39 */
  
  #ifdef _MACH_UNIX
    {
diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h

index 6076bfdf663ed9aedacf8ca428c15b7dfff8449c..6b29d7db00654d342c2657f4d104079577802443 100644 (file)
--- a/src/microcode/cmpintmd/x86-64.h
+++ b/src/microcode/cmpintmd/x86-64.h
@@ -30,53 +30,6 @@ USA.
  \f
  /*
  
-Problems with the AMD x86-64 instruction set architecture
-====================================================
-
-1. Jumps are PC-relative.  There are absolute jumps, assuming the PC
-   is in a data location, or with immediate destinations that include
-   a segment descriptor (16 bits).  The short forms have a PC-relative
-   offset defined with respect to the immediately following
-   instruction.
-
-Problem: Closures and execute caches need their address in old space
-   in order to be relocated correctly.
-
-Fix:
-
-For execute caches we can define a new linker field, called
-load-relocation-address which on every GC/relocation stores the new
-address and the old contents into global variables and stores the new
-address in the field.  Alternatively the difference between the new
-address and the old contents can be stored into a single global
-variable, and this can be used, together with the new address of each
-cache, to find the old code.
-
-For closures the code that reads the header (manifest closure) can do
-the same.
-
-
-2. The CALL and JMP instructions do not accept 64-bit displacements.
-
-Problem: We want heaps bigger than 4 GB.
-
-Fix:
-
-Assemble more than one instruction for closure entry points, expanding
-them even more.  Yech.
-
-
-3. The stack pointer register (RSP) cannot be used as the base in
-   (base + displacement) addressing mode.
-
-Problem: Common operation in the compiler, which assumes direct access
-   to the stack.
-
-Fix: Use base + indexed mode, which allows specification of RSP as
-   base and nullification of the index (by using RSP again).  This is
-   one byte longer than otherwise, but...
-
-
  Register assignments
  ====================
  
@@ -92,10 +45,6 @@ RDI (7)              Free Pointer
  
  R8-R15         Unassigned
  
-The dynamic link and value "registers" are not processor registers.
-Slots in the register array must be reserved for them.
-[TRC 20091025: Later, we ought to use machine registers for these.]
-
  The Free Pointer is RDI because RDI is the implicit base register for
  the memory-to-memory move instructions, and the string store
  instruction.  Perhaps we can make use of it.
@@ -165,51 +114,17 @@ entry     0               MOV     AL,code         0xB0, code-byte
         2               CALL    n(RSI)          0xFF 0x96 n-longword
         8               <trampoline dependent storage>
  
-
-[TRC 20091027: The next two are wrong; need to update.]
-
-- GC & interrupt check at procedure/continuation entry:
-
-gc_lab -7              CALL    n(RSI)          0xFF 0x56 n-byte
-       -4              <type/arity info>
-       -2              <gc offset>
-entry  0               CMP     RDI,(RSI)       0x48 0x39 0x3e
-       3               JAE     gc_lab          0x73 -12
-       5               <real code>
-
-
-- GC & interrupt check at closure entry:
-
-gc_lab -11             ADD     (RSP),&offset   0x83 0x04 0x24 offset-byte
-       -7              JMP     n(RSI)          0xFF 0x66 n-byte
-       -4              <type/arity info>
-       -2              <gc offset>
-entry  0               ADD     (RSP),&magic    0x81 0x04 0x24 magic-longword
-       7               CMP     RDI,(RSI)       0x39 0x3e
-       9               JAE     gc_lab          0x73 0xea (= -22)
-       11              <real code>
-
-The magic value depends on the closure because of canonicalization.
-
-The ADD instruction at offset -11 is not present for the 0th closure
-entry, since it is the canonical entry point.  Its format depends on
-the value of offset, since the sign-extending forms often suffice.
-
-offset = entry_number * entry_size
-magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction))
-
  */
  \f
  #define ASM_RESET_HOOK x86_64_reset_hook
-#define FPE_RESET_TRAPS x86_64_interface_initialize
  
  #define CMPINT_USE_STRUCS 1
  
  /* These next definitions must agree with "cmpauxmd/x86-64.m4", which is
     where the register block is allocated.  */
  #define COMPILER_REGBLOCK_N_FIXED 16
-/* Big enough to hold 80-bit floating-point value: */
-#define COMPILER_TEMP_SIZE 2
+/* Size in objects of the largest quantities that RTL registers can hold.  */
+#define COMPILER_TEMP_SIZE 1
  #define COMPILER_REGBLOCK_N_TEMPS 256
  #define COMPILER_REGBLOCK_N_HOOKS 80
  #define COMPILER_HOOK_SIZE 1
@@ -217,8 +132,6 @@ magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction))
  #define COMPILER_REGBLOCK_EXTRA_SIZE                                   \
    (COMPILER_REGBLOCK_N_HOOKS * COMPILER_HOOK_SIZE)
  
-#define REGBLOCK_ALLOCATED_BY_INTERFACE true
-
  typedef byte_t insn_t;
  
  /* Number of insn_t units preceding entry address in which header
@@ -228,8 +141,11 @@ typedef byte_t insn_t;
  #define CC_ENTRY_OFFSET_SIZE 2
  
  /* Number of insn_t units preceding entry header in which GC trap
-   instructions are stored.  */
-#define CC_ENTRY_GC_TRAP_SIZE 3
+   instructions are stored.  This is an approximation: it matches only
+   those non-closure procedures for which LIAR has generated interrupt
+   checks, in which case there is one CALL n(RSI), which is encoded as
+   #xff #x96 <n>, where n is a longword (32 bits).  */
+#define CC_ENTRY_GC_TRAP_SIZE 6
  \f
  #define EMBEDDED_CLOSURE_ADDRS_P 1
  
@@ -250,23 +166,6 @@ typedef byte_t insn_t;
  #define UUO_COUNT_TO_WORDS(nc) ((nc) * UUO_LINK_SIZE)
  
  #define READ_UUO_TARGET(a, r) read_uuo_target (a)
-
-#define FLUSH_I_CACHE() X86_64_CACHE_SYNCHRONIZE ()
-#define FLUSH_I_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE ()
-#define PUSH_D_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE ()
-
-/* [TRC 20091025: The cache synchronization bug does not occur in any
-    x86-64 machines of which I am aware.] */
-
-#define X86_64_CACHE_SYNCHRONIZE() do {} while (0)
-
-/*
-#define X86_64_CACHE_SYNCHRONIZE() do                                  \
-{                                                                      \
-  if (x86_64_cpuid_needed)                                             \
-    x86_64_cache_synchronize ();                                       \
-} while (false)
-*/
  \f
  #if defined(__WIN32__) && defined(__WATCOMC__)
  #  define ASM_ENTRY_POINT(name) (__cdecl name)
author	Taylor R Campbell <campbell@mumble.net>
	Fri, 6 Nov 2009 04:40:26 +0000 (23:40 -0500)
committer	Taylor R Campbell <campbell@mumble.net>
	Fri, 6 Nov 2009 04:40:26 +0000 (23:40 -0500)
src/compiler/machines/x86-64/lapgen.scm		patch \| blob \| history
src/compiler/machines/x86-64/machin.scm		patch \| blob \| history
src/compiler/machines/x86-64/rules3.scm		patch \| blob \| history
src/compiler/machines/x86-64/rules4.scm		patch \| blob \| history
src/compiler/machines/x86-64/rulflo.scm		patch \| blob \| history
src/microcode/cmpauxmd/x86-64.m4		patch \| blob \| history
src/microcode/cmpintmd/x86-64.c		patch \| blob \| history
src/microcode/cmpintmd/x86-64.h		patch \| blob \| history