From 987d9624b221bac31f6413ff3a6fe17b5a90a2fb Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Mon, 31 Dec 2018 21:08:22 +0000
Subject: [PATCH] Make entries point to _after_ the PC offset.

This saves a jump in closure headers, and makes non-closure entries
have a nice PC offset of 0 rather than an awkward PC offset of 8.
However, this causes all indirect calls to have an additional offset
of -8 in the addressing mode -- not clear yet how much this hurts.

WARNING: This changes the amd64 compiled code interface so that new
compiled code requires a new microcode and vice versa.  Further, you
must set compiler:cross-compiling? to #t to compile the system,
because compiled code block offsets are now in a different place
relative to compiled entries, so the native fasdumper of an old
microcode can't handle compiled entries produced by a new compiler.
---
 src/compiler/machines/x86-64/lapgen.scm |  14 +--
 src/compiler/machines/x86-64/machin.scm |  18 +--
 src/compiler/machines/x86-64/rules1.scm |  22 ++--
 src/compiler/machines/x86-64/rules3.scm | 149 +++++++++++-------------
 src/microcode/cmpauxmd/x86-64.m4        |  18 +--
 src/microcode/cmpintmd/x86-64.c         |  66 +++++------
 src/microcode/cmpintmd/x86-64.h         |  46 ++++----
 7 files changed, 163 insertions(+), 170 deletions(-)

diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm
index df1e76c6f..44241a11a 100644
--- a/src/compiler/machines/x86-64/lapgen.scm
+++ b/src/compiler/machines/x86-64/lapgen.scm
@@ -114,8 +114,8 @@ USA.
   (set! *external-labels* (cons label *external-labels*))
   (LAP (WORD U ,code)
        (BLOCK-OFFSET ,label)
-       (LABEL ,label)
-       (QUAD U 8)))
+       (QUAD U 0)
+       (LABEL ,label)))
 
 (define-integrable (make-code-word min max)
   (+ (* #x100 min) max))
@@ -218,11 +218,11 @@ USA.
   (move-to-alias-register! source (register-type target) target)
   (LAP))
 
-(define (load-pc-relative target label-expr offset)
-  (LAP (MOV Q ,target (@PCRO ,label-expr ,offset))))
+(define (load-pc-relative target label-expr)
+  (LAP (MOV Q ,target (@PCR ,label-expr))))
 
-(define (load-pc-relative-address target label-expr offset)
-  (LAP (LEA Q ,target (@PCRO ,label-expr ,offset))))
+(define (load-pc-relative-address target label-expr)
+  (LAP (LEA Q ,target (@PCR ,label-expr))))
 
 (define (compare/register*register reg1 reg2)
   (cond ((register-alias reg1 'GENERAL)
@@ -723,7 +723,7 @@ USA.
 
 (define (invoke-hook/reentry entry)
   (let ((label (generate-label 'HOOK-REENTRY)))
-    (LAP (LEA Q (R ,rbx) (@PCRO ,label 4)) ;Skip format word.
+    (LAP (LEA Q (R ,rbx) (@PCRO ,label 12)) ;Skip format word and PC offset.
 	 ,@(invoke-hook entry)
 	 (LABEL ,label))))
 
diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm
index 35e9bfbfd..96a1cfe92 100644
--- a/src/compiler/machines/x86-64/machin.scm
+++ b/src/compiler/machines/x86-64/machin.scm
@@ -88,14 +88,15 @@ USA.
 (define-integrable address-units-per-closure-manifest address-units-per-object)
 (define-integrable address-units-per-entry-format-code 4)
 (define-integrable address-units-per-closure-entry-count 4)
-(define-integrable address-units-per-closure-padding 4)
+(define-integrable address-units-per-closure-padding -4)
 
-;;; Just a 64-bit offset and four bytes of padding.
-(define-integrable address-units-per-closure-entry-instructions 12)
+(define-integrable address-units-per-closure-pc-offset 8)
+(define-integrable address-units-per-closure-entry-padding 4)
 
 (define-integrable address-units-per-closure-entry
   (+ address-units-per-entry-format-code
-     address-units-per-closure-entry-instructions))
+     address-units-per-closure-pc-offset
+     address-units-per-closure-entry-padding))
 
 ;;; Note:
 ;;;
@@ -119,7 +120,7 @@ USA.
 (define (closure-first-offset nentries entry)
   (if (zero? nentries)
       1
-      (* (- nentries entry) closure-entry-size)))
+      (* (- nentries entry 1) closure-entry-size)))
 
 ;;; Given the number of entry points in a closure, return the distance
 ;;; in objects from the address of the manifest closure to the address
@@ -128,9 +129,10 @@ USA.
 (define (closure-object-first-offset nentries)
   (if (zero? nentries)
       1					;One vector manifest.
-      ;; One object for the closure manifest, and one object for the
-      ;; leading entry count and the trailing padding.
-      (+ 2 (* nentries closure-entry-size))))
+      ;; One object for the closure manifest, half an object for the
+      ;; leading entry count, and minus half an object for the trailing
+      ;; non-padding.
+      (+ 1 (* nentries closure-entry-size))))
 
 ;;; Given the number of entries in a closure, and the indices of two
 ;;; entries, return the number of bytes separating the two entries.
diff --git a/src/compiler/machines/x86-64/rules1.scm b/src/compiler/machines/x86-64/rules1.scm
index 68614917f..899b89f1b 100644
--- a/src/compiler/machines/x86-64/rules1.scm
+++ b/src/compiler/machines/x86-64/rules1.scm
@@ -177,15 +177,14 @@ USA.
   (ASSIGN (REGISTER (? target)) (ENTRY:PROCEDURE (? label)))
   (load-pc-relative-address
    (target-register-reference target)
-   (rtl-procedure/external-label (label->object label))
-   0))
+   (rtl-procedure/external-label (label->object label))))
 
 (define-rule statement
   (ASSIGN (REGISTER (? target)) (ENTRY:CONTINUATION (? label)))
   (let* ((target (target-register-reference target))
 	 (get-pc (generate-label 'GET-PC)))
     (LAP (CALL (@PCR ,get-pc))
-	 (JMP (@PCRO ,label 8))
+	 (JMP (@PCR ,label))
 	(LABEL ,get-pc)
 	 (POP Q ,target))))
 
@@ -198,8 +197,7 @@ USA.
   (load-pc-relative-address/typed (target-register-reference target)
 				  type
 				  (rtl-procedure/external-label
-				   (label->object label))
-				  0))
+				   (label->object label))))
 
 (define-rule statement
   ;; This is an intermediate rule -- not intended to produce code.
@@ -210,7 +208,7 @@ USA.
   (let* ((target (target-register-reference target))
 	 (pushed (generate-label 'PUSHED)))
     (LAP (CALL (@PCR ,pushed))
-	 (JMP (@PCRO ,label 8))
+	 (JMP (@PCR ,label))
 	(LABEL ,pushed)
 	 (POP Q ,target)
 	 ,@(affix-type target type))))
@@ -222,21 +220,19 @@ USA.
   (assert (= type type-code:compiled-return))
   (let ((pushed (generate-label 'PUSHED)))
     (LAP (CALL (@PCR ,pushed))
-	 (JMP (@PCRO ,label 8))
+	 (JMP (@PCR ,label))
 	(LABEL ,pushed)
 	 ,@(affix-type (INST-EA (@R 4)) type))))
 
 (define-rule statement
   (ASSIGN (REGISTER (? target)) (VARIABLE-CACHE (? name)))
   (load-pc-relative (target-register-reference target)
-		    (free-reference-label name)
-		    0))
+		    (free-reference-label name)))
 
 (define-rule statement
   (ASSIGN (REGISTER (? target)) (ASSIGNMENT-CACHE (? name)))
   (load-pc-relative (target-register-reference target)
-		    (free-assignment-label name)
-		    0))
+		    (free-assignment-label name)))
 
 (define-rule statement
   (ASSIGN (REGISTER (? target)) (OBJECT->DATUM (CONSTANT (? constant))))
@@ -432,8 +428,8 @@ USA.
 	 (target (target-register-reference target)))
     (LAP (LEA Q ,target ,source))))  
 
-(define (load-pc-relative-address/typed target type label offset)
-  (LAP (LEA Q ,target (@PCRO ,label ,offset))
+(define (load-pc-relative-address/typed target type label)
+  (LAP (LEA Q ,target (@PCR ,label))
        ,@(affix-type target type))
   #|
   ;++ This is pretty horrid, especially since it happens for every
diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm
index 3193f00e6..c240ae356 100644
--- a/src/compiler/machines/x86-64/rules3.scm
+++ b/src/compiler/machines/x86-64/rules3.scm
@@ -62,67 +62,61 @@ USA.
   (INVOCATION:APPLY (? frame-size) (? continuation))
   continuation
   (expect-no-exit-interrupt-checks)
-  (let ((generic (generate-label 'GENERIC)))
-    (LAP ,@(clear-map!)
-	 (POP Q (R ,rbx))
-	 #|
-	 (MOV Q (R ,rdx) (&U ,frame-size))
-	 ,@(invoke-interface code:compiler-apply)
-	 |#
-	 #|
-	 ,@(case frame-size
-	     ((1) (invoke-hook entry:compiler-shortcircuit-apply-size-1))
-	     ((2) (invoke-hook entry:compiler-shortcircuit-apply-size-2))
-	     ((3) (invoke-hook entry:compiler-shortcircuit-apply-size-3))
-	     ((4) (invoke-hook entry:compiler-shortcircuit-apply-size-4))
-	     ((5) (invoke-hook entry:compiler-shortcircuit-apply-size-5))
-	     ((6) (invoke-hook entry:compiler-shortcircuit-apply-size-6))
-	     ((7) (invoke-hook entry:compiler-shortcircuit-apply-size-7))
-	     ((8) (invoke-hook entry:compiler-shortcircuit-apply-size-8))
-	     (else
-	      (LAP (MOV Q (R ,rdx) (&U ,frame-size))
-		   ,@(invoke-hook entry:compiler-shortcircuit-apply))))
-	 |#
-	 #|
-	 (POP Q (R ,rcx))		;Pop tagged entry into RCX.
-	 (MOV Q (R ,rax) (R ,rcx))	;Copy tagged entry into RAX.
-	 (SHR Q (R ,rax) (&U ,scheme-datum-width)) ;Select tag in RAX.
-	 (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;Select datum in RCX.
-	 (CMP B (R ,rax) (&U ,(ucode-type COMPILED-ENTRY))) ;Check tag.
-	 (JNE (@PCR ,generic))		;Bail if not compiled entry.
-	 (CMP B (@RO ,rcx -4) (&U ,frame-size))	;Check arity.
-	 (JNE (@PCR ,generic))		;Bail if not exact arity match.
-	 (MOV Q (R ,rax) (@R ,rcx))	;Load offset into RAX.
-	 (ADD Q (R ,rax) (R ,rcx))	;Add offset to entry address in RAX.
-	 (JMP (R ,rax))
-	(LABEL ,generic)
-	 ,@(invoke-hook entry:compiler-shortcircuit-apply)
-	 |#
-	 ,@(case frame-size
-	     ((1) (invoke-hook/subroutine entry:compiler-apply-setup-size-1))
-	     ((2) (invoke-hook/subroutine entry:compiler-apply-setup-size-2))
-	     ((3) (invoke-hook/subroutine entry:compiler-apply-setup-size-3))
-	     ((4) (invoke-hook/subroutine entry:compiler-apply-setup-size-4))
-	     ((5) (invoke-hook/subroutine entry:compiler-apply-setup-size-5))
-	     ((6) (invoke-hook/subroutine entry:compiler-apply-setup-size-6))
-	     ((7) (invoke-hook/subroutine entry:compiler-apply-setup-size-7))
-	     ((8) (invoke-hook/subroutine entry:compiler-apply-setup-size-8))
-	     (else
-	      (LAP (MOV Q (R ,rdx) (&U ,frame-size))
-		   ,@(invoke-hook/subroutine entry:compiler-apply-setup))))
-	 (JMP (R ,rax)))))
+  (LAP ,@(clear-map!)
+       (POP Q (R ,rbx))
+       #|
+       (MOV Q (R ,rdx) (&U ,frame-size))
+       ,@(invoke-interface code:compiler-apply)
+       |#
+       #|
+       ,@(case frame-size
+	   ((1) (invoke-hook entry:compiler-shortcircuit-apply-size-1))
+	   ((2) (invoke-hook entry:compiler-shortcircuit-apply-size-2))
+	   ((3) (invoke-hook entry:compiler-shortcircuit-apply-size-3))
+	   ((4) (invoke-hook entry:compiler-shortcircuit-apply-size-4))
+	   ((5) (invoke-hook entry:compiler-shortcircuit-apply-size-5))
+	   ((6) (invoke-hook entry:compiler-shortcircuit-apply-size-6))
+	   ((7) (invoke-hook entry:compiler-shortcircuit-apply-size-7))
+	   ((8) (invoke-hook entry:compiler-shortcircuit-apply-size-8))
+	   (else
+	    (LAP (MOV Q (R ,rdx) (&U ,frame-size))
+		 ,@(invoke-hook entry:compiler-shortcircuit-apply))))
+       |#
+       #|
+       (POP Q (R ,rcx))			;Pop tagged entry into RCX.
+       (MOV Q (R ,rax) (R ,rcx))	;Copy tagged entry into RAX.
+       (SHR Q (R ,rax) (&U ,scheme-datum-width)) ;Select tag in RAX.
+       (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;Select datum in RCX.
+       (CMP B (R ,rax) (&U ,(ucode-type COMPILED-ENTRY))) ;Check tag.
+       (JNE (@PCR ,generic))		;Bail if not compiled entry.
+       (CMP B (@RO ,rcx -4) (&U ,frame-size))	;Check arity.
+       (JNE (@PCR ,generic))		;Bail if not exact arity match.
+       (MOV Q (R ,rax) (@RO ,rcx -8))	;Load offset into RAX.
+       (ADD Q (R ,rax) (R ,rcx))	;Add offset to entry address in RAX.
+       (JMP (R ,rax))
+      (LABEL ,generic)
+       ,@(invoke-hook entry:compiler-shortcircuit-apply)
+       |#
+       ,@(case frame-size
+	   ((1) (invoke-hook/subroutine entry:compiler-apply-setup-size-1))
+	   ((2) (invoke-hook/subroutine entry:compiler-apply-setup-size-2))
+	   ((3) (invoke-hook/subroutine entry:compiler-apply-setup-size-3))
+	   ((4) (invoke-hook/subroutine entry:compiler-apply-setup-size-4))
+	   ((5) (invoke-hook/subroutine entry:compiler-apply-setup-size-5))
+	   ((6) (invoke-hook/subroutine entry:compiler-apply-setup-size-6))
+	   ((7) (invoke-hook/subroutine entry:compiler-apply-setup-size-7))
+	   ((8) (invoke-hook/subroutine entry:compiler-apply-setup-size-8))
+	   (else
+	    (LAP (MOV Q (R ,rdx) (&U ,frame-size))
+		 ,@(invoke-hook/subroutine entry:compiler-apply-setup))))
+       (JMP (R ,rax))))
 
 (define-rule statement
   (INVOCATION:JUMP (? frame-size) (? continuation) (? label))
   frame-size continuation
   (expect-no-exit-interrupt-checks)
   (LAP ,@(clear-map!)
-       ;; Every label for code we can jump to starts with a 64-bit
-       ;; offset to the actual code, always equal to 8.  We could
-       ;; invent the bookkeeping to map the external label to the
-       ;; actual code label, but that's more work than I want to do
-       ;; right now.
-       (JMP (@PCRO ,label 8))))
+       (JMP (@PCR ,label))))
 
 (define-rule statement
   (INVOCATION:COMPUTED-JUMP (? frame-size) (? continuation))
@@ -132,7 +126,7 @@ USA.
   (LAP ,@(clear-map!)
        (POP Q (R ,rcx))
        (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;clear type code
-       (MOV Q (R ,rax) (@R ,rcx))	;rax := PC offset
+       (MOV Q (R ,rax) (@RO ,rcx -8))	;rax := PC offset
        (ADD Q (R ,rax) (R ,rcx))	;rax := PC
        (JMP (R ,rax))))
 
@@ -180,8 +174,7 @@ USA.
 	 (set-address
 	  (begin (require-register! rdx)
 		 (load-pc-relative-address (INST-EA (R ,rdx))
-					   *block-label*
-					   0))))
+					   *block-label*))))
     (delete-dead-registers!)
     (LAP ,@set-extension
 	 ,@set-address
@@ -506,10 +499,11 @@ USA.
 	 (temp (temporary-register-reference))
 	 (data-offset address-units-per-closure-manifest)
 	 (format-offset (+ data-offset address-units-per-closure-entry-count))
-	 (pc-offset (+ format-offset address-units-per-entry-format-code))
+	 (offset-offset (+ format-offset address-units-per-entry-format-code))
+	 (entry-offset (+ offset-offset address-units-per-closure-pc-offset))
 	 (slots-offset
-	  (+ pc-offset
-	     address-units-per-closure-entry-instructions
+	  (+ entry-offset
+	     address-units-per-closure-entry-padding
 	     address-units-per-closure-padding))
 	 (free-offset
 	  (+ slots-offset (* (+ 1 size) address-units-per-object))))
@@ -519,7 +513,7 @@ USA.
 	 (MOV L (@RO ,regnum:free-pointer ,data-offset) (&U 1))
 	 ,@(generate-closure-entry procedure-label min max format-offset temp)
 	 ;; Load the address of the entry instruction into TARGET.
-	 (LEA Q ,target (@RO ,regnum:free-pointer ,pc-offset))
+	 (LEA Q ,target (@RO ,regnum:free-pointer ,entry-offset))
 	 ;; Bump FREE.
 	 ,@(with-signed-immediate-operand free-offset
 	     (lambda (addend)
@@ -545,8 +539,10 @@ USA.
     (let* ((data-offset address-units-per-closure-manifest)
 	   (first-format-offset
 	    (+ data-offset address-units-per-closure-entry-count))
-	   (first-pc-offset
+	   (first-offset-offset
 	    (+ first-format-offset address-units-per-entry-format-code))
+	   (first-entry-offset
+	    (+ first-offset-offset address-units-per-closure-pc-offset))
 	   (free-offset
 	    (+ first-format-offset
 	       (* nentries address-units-per-closure-entry)
@@ -555,7 +551,7 @@ USA.
 	   (MOV Q (@R ,regnum:free-pointer) ,temp)
 	   (MOV L (@RO ,regnum:free-pointer ,data-offset) (&U ,nentries))
 	   ,@(generate-entries entries first-format-offset)
-	   (LEA Q ,target (@RO ,regnum:free-pointer ,first-pc-offset))
+	   (LEA Q ,target (@RO ,regnum:free-pointer ,first-entry-offset))
 	   ,@(with-signed-immediate-operand free-offset
 	       (lambda (addend)
 		 (LAP (ADD Q (R ,regnum:free-pointer) ,addend))))
@@ -567,17 +563,16 @@ USA.
 
 (define (generate-closure-entry label min max offset temp)
   (let* ((procedure-label (rtl-procedure/external-label (label->object label)))
-	 (addr-offset (+ offset address-units-per-entry-format-code))
-	 (padding-offset (+ addr-offset 8)))
-    padding-offset
+	 (offset-offset (+ offset address-units-per-entry-format-code))
+	 (entry-offset (+ offset-offset address-units-per-closure-pc-offset)))
     (LAP (MOV L (@RO ,regnum:free-pointer ,offset)
-	      (&U ,(make-closure-code-longword min max addr-offset)))
-	 ;; Set temp := procedure-label + 8 - addr-offset.
-	 (LEA Q ,temp (@PCR (- (+ ,procedure-label 8) ,addr-offset)))
-	 ;; Set temp := procedure-label + 8 - addr-offset - free.
+	      (&U ,(make-closure-code-longword min max entry-offset)))
+	 ;; Set temp := procedure-label - entry-offset.
+	 (LEA Q ,temp (@PCR (- ,procedure-label ,entry-offset)))
+	 ;; Set temp := procedure-label - entry-offset - free.
 	 (SUB Q ,temp (R ,regnum:free-pointer))
-	 ;; Store temp = procedure-label + 8 - (free + addr-offset).
-	 (MOV Q (@RO ,regnum:free-pointer ,addr-offset) ,temp))))
+	 ;; Store temp = procedure-label - (free + entry-offset).
+	 (MOV Q (@RO ,regnum:free-pointer ,offset-offset) ,temp))))
 
 (define (generate/closure-header internal-label nentries)
   (let* ((rtl-proc (label->object internal-label))
@@ -591,13 +586,7 @@ USA.
 	   (MOV Q (R ,rax) (&U ,(make-non-pointer-literal type 0)))
 	   (OR Q (R ,rcx) (R ,rax))
 	   (PUSH Q (R ,rcx))
-	   ;; Jump past a bogus faux offset.  We need this because
-	   ;; INVOCATION:JUMP jumps to the label + 8, and at the moment
-	   ;; I haven't found a good way to make it skip the +8 part
-	   ;; for closures.
-	   (JMP (@PCRO ,internal-label 8))
-	   (LABEL ,internal-label)
-	   (QUAD U 8)))
+	   (LABEL ,internal-label)))
     (cond ((zero? nentries)
 	   (LAP (EQUATE ,external-label ,internal-label)
 		,@(simple-procedure-header
diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4
index 38dbabaf6..a5eb18e48 100644
--- a/src/microcode/cmpauxmd/x86-64.m4
+++ b/src/microcode/cmpauxmd/x86-64.m4
@@ -422,7 +422,7 @@ define_c_label(C_to_interface)
 
 define_hook_label(trampoline_to_interface)
 define_debugging_label(trampoline_to_interface)
-	OP(add,q)	TW(IMM(24),REG(rcx))		# trampoline storage
+	OP(add,q)	TW(IMM(16),REG(rcx))		# trampoline storage
 	OP(mov,q)	TW(REG(rcx),REG(rbx))		# argument in rbx
 	jmp	scheme_to_interface
 
@@ -603,11 +603,11 @@ define_hook_label(apply_setup)
 	# We now have a compiled entry, so it is safe to compute the
 	# PC.  Do that first, because it sets flags, which are used by
 	# the caller.
-	OP(mov,q)	TW(IND(REG(rcx)),REG(rax))	# rax := PC offset
+	OP(mov,q)	TW(BOF(-8,REG(rcx)),REG(rax))	# rax := PC offset
 	OP(add,q)	TW(REG(rcx),REG(rax))		# rax := PC
 	# Now check the frame size.  The caller will test the flags
 	# again for another conditional jump.
-	OP(movs,bq,x)	TW(BOF(-4,REG(rcx)),REG(r9))	# Extract frame size
+	OP(movs,bq,x)	TW(BOF(-12,REG(rcx)),REG(r9))	# Extract frame size
 	OP(cmp,q)	TW(REG(r9),REG(rdx))		# Compare to nargs+1
 	jne	asm_apply_setup_fail
 	ret
@@ -627,9 +627,9 @@ define_hook_label(apply_setup_size_$1)
 	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
 	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
 	jne	asm_apply_setup_size_$1_fail
-	OP(mov,q)	TW(IND(REG(rcx)),REG(rax))	# rax := PC offset
+	OP(mov,q)	TW(BOF(-8,REG(rcx)),REG(rax))	# rax := PC offset
 	OP(add,q)	TW(REG(rcx),REG(rax))		# rax := PC
-	OP(cmp,b)	TW(IMM($1),BOF(-4,REG(rcx)))	# Compare frame size
+	OP(cmp,b)	TW(IMM($1),BOF(-12,REG(rcx)))	# Compare frame size
 	jne	asm_apply_setup_size_$1_fail		# to nargs+1
 	ret
 
@@ -655,10 +655,10 @@ define_hook_label(sc_apply)
 	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
 	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
 	jne	asm_sc_apply_generic
-	OP(movs,bq,x)	TW(BOF(-4,REG(rcx)),REG(rax))	# Extract frame size
+	OP(movs,bq,x)	TW(BOF(-12,REG(rcx)),REG(rax))	# Extract frame size
 	OP(cmp,q)	TW(REG(rax),REG(rdx))		# Compare to nargs+1
 	jne	asm_sc_apply_generic
-	OP(mov,q)	TW(IND(REG(rcx)),REG(rax))	# rax := PC offset
+	OP(mov,q)	TW(BOF(-8,REG(rcx)),REG(rax))	# rax := PC offset
 	OP(add,q)	TW(REG(rcx),REG(rax))		# rax := PC
 	jmp	IJMP(REG(rax))			# Invoke entry
 
@@ -675,9 +675,9 @@ define_hook_label(sc_apply_size_$1)
 	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
 	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
 	jne	asm_sc_apply_generic_$1
-	OP(cmp,b)	TW(IMM($1),BOF(-4,REG(rcx)))	# Compare frame size
+	OP(cmp,b)	TW(IMM($1),BOF(-12,REG(rcx)))	# Compare frame size
 	jne	asm_sc_apply_generic_$1			# to nargs+1
-	OP(mov,q)	TW(IND(REG(rcx)),REG(rax))	# rax := PC offset
+	OP(mov,q)	TW(BOF(-8,REG(rcx)),REG(rax))	# rax := PC offset
 	OP(add,q)	TW(REG(rcx),REG(rax))		# rax := PC
 	jmp	IJMP(REG(rax))			# Invoke entry
 
diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c
index 26b3cd9e0..195d75d09 100644
--- a/src/microcode/cmpintmd/x86-64.c
+++ b/src/microcode/cmpintmd/x86-64.c
@@ -37,19 +37,19 @@ extern void * newspace_to_tospace (void *);
 bool
 read_cc_entry_type (cc_entry_type_t * cet, insn_t * address)
 {
-  return (decode_old_style_format_word (cet, (((uint16_t *) address) [-2])));
+  return (decode_old_style_format_word (cet, (((uint16_t *) address) [-6])));
 }
 
 bool
 write_cc_entry_type (cc_entry_type_t * cet, insn_t * address)
 {
-  return (encode_old_style_format_word (cet, ((uint16_t *) address) - 2));
+  return (encode_old_style_format_word (cet, (((uint16_t *) address) - 6)));
 }
 
 bool
 read_cc_entry_offset (cc_entry_offset_t * ceo, insn_t * address)
 {
-  uint16_t n = (((uint16_t *) address) [-1]);
+  uint16_t n = (((uint16_t *) address) [-5]);
   (ceo->offset) = (n >> 1);
   (ceo->continued_p) = ((n & 1) != 0);
   return (false);
@@ -60,7 +60,7 @@ write_cc_entry_offset (cc_entry_offset_t * ceo, insn_t * address)
 {
   if (! ((ceo->offset) < 0x4000))
     return (true);
-  (((uint16_t *) address) [-1])
+  (((uint16_t *) address) [-5])
     = (((ceo->offset) << 1) | ((ceo->continued_p) ? 1 : 0));
   return (false);
 }
@@ -69,11 +69,11 @@ insn_t *
 cc_return_address_to_entry_address (insn_t * pc)
 {
   if ((pc[0]) == 0xeb)		/* JMP rel8 */
-    return ((pc + 2) + (* ((int8_t *) &pc[1])) - 8);
+    return ((pc + 2) + (* ((int8_t *) &pc[1])));
   else if ((pc[0]) == 0xe9)	/* JMP rel32 */
-    return ((pc + 5) + (* ((int32_t *) &pc[1])) - 8);
+    return ((pc + 5) + (* ((int32_t *) &pc[1])));
   else
-    return (pc - 8);
+    return (pc);
 }
 
 /* Compiled closures */
@@ -110,7 +110,7 @@ read_compiled_closure_target (insn_t * start, reloc_ref_t * ref)
   /* If we're relocating, find where base was in the oldspace.  */
   if (ref)
     base += (ref->old_addr - ref->new_addr);
-  return (base + (* ((int64_t *) addr)) - 8);
+  return (base + (((int64_t *) addr)[-1]));
 }
 
 /* write_compiled_closure_target(target, start)
@@ -124,8 +124,8 @@ void
 write_compiled_closure_target (insn_t * target, insn_t * start)
 {
   insn_t * addr = (start + CC_ENTRY_HEADER_SIZE);
-  (* ((int64_t *) addr)) =
-    (target - ((insn_t *) (tospace_to_newspace (addr))) + 8);
+  (((int64_t *) addr)[-1]) =
+    (target - ((insn_t *) (tospace_to_newspace (addr))));
 }
 
 unsigned long
@@ -152,20 +152,20 @@ compiled_closure_entry (insn_t * start)
 insn_t *
 compiled_closure_next (insn_t * start)
 {
-  return (start + CC_ENTRY_HEADER_SIZE + 12);
+  return (start + CC_ENTRY_HEADER_SIZE + 4);
 }
 
 SCHEME_OBJECT *
 skip_compiled_closure_padding (insn_t * start)
 {
-  /* The padding is the same size as the entry header (format word).  */
-  return ((SCHEME_OBJECT *) (start + CC_ENTRY_HEADER_SIZE));
+  /* The last entry is _not_ padded, so undo the padding skip.  */
+  return ((SCHEME_OBJECT *) (start - 4));
 }
 
 SCHEME_OBJECT
 compiled_closure_entry_to_target (insn_t * entry)
 {
-  return (MAKE_CC_ENTRY (entry + (* ((int64_t *) entry)) - 8));
+  return (MAKE_CC_ENTRY (entry + (((int64_t *) entry)[-1])));
 }
 
 /* Execution caches (UUO links)
@@ -224,10 +224,9 @@ write_uuo_target (insn_t * target, SCHEME_OBJECT * saddr)
      But if the target is a compiled closure pointing into a block
      somewhere else, the block may not have been relocated yet and so
      we don't know where the PC will be in the newspace.  */
-  if ((* ((int64_t *) (newspace_to_tospace (target)))) == 8)
+  if ((((int64_t *) (newspace_to_tospace (target)))[-1]) == 0)
     {
-      insn_t * pc = (target + 8);
-      ptrdiff_t jmprel32_offset = (pc - (&addr[15]));
+      ptrdiff_t jmprel32_offset = (target - (&addr[15]));
       if ((INT32_MIN <= jmprel32_offset) && (jmprel32_offset <= INT32_MAX))
 	{
 	  (addr[10]) = 0xe9;	/* JMP rel32 */
@@ -237,21 +236,22 @@ write_uuo_target (insn_t * target, SCHEME_OBJECT * saddr)
 	{
 	  (addr[10]) = 0x48;	/* MOV RAX,imm64 */
 	  (addr[11]) = 0xb8;
-	  (* ((insn_t **) (&addr[12]))) = (target + 8);
+	  (* ((insn_t **) (&addr[12]))) = target;
 	  (addr[20]) = 0xff;	/* JMP RAX */
 	  (addr[21]) = 0xe0;
 	}
     }
   else
     {
-      (addr[10]) = 0x48;	/* MOV RAX,(RCX) */
+      (addr[10]) = 0x48;	/* MOV RAX,-8(RCX) */
       (addr[11]) = 0x8b;
-      (addr[12]) = 0x01;
-      (addr[13]) = 0x48;	/* ADD RAX,RCX */
-      (addr[14]) = 0x01;
-      (addr[15]) = 0xc8;
-      (addr[16]) = 0xff;	/* JMP RAX */
-      (addr[17]) = 0xe0;
+      (addr[12]) = 0x41;
+      (addr[13]) = 0xf8;
+      (addr[14]) = 0x48;	/* ADD RAX,RCX */
+      (addr[15]) = 0x01;
+      (addr[16]) = 0xc8;
+      (addr[17]) = 0xff;	/* JMP RAX */
+      (addr[18]) = 0xe0;
     }
 }
 
@@ -278,19 +278,19 @@ trampoline_entry_addr (SCHEME_OBJECT * block, unsigned long index)
 insn_t *
 trampoline_return_addr (SCHEME_OBJECT * block, unsigned long index)
 {
-  return ((trampoline_entry_addr (block, index)) + 8);
+  return (trampoline_entry_addr (block, index));
 }
 
 bool
 store_trampoline_insns (insn_t * entry, uint8_t code)
 {
-  (* ((int64_t *) (&entry[0]))) = 8;
-  (entry[8]) = 0x41;		/* MOVB R9,imm8 */
-  (entry[9]) = 0xb1;
-  (entry[10]) = code;
-  (entry[11]) = 0xff;		/* JMP r/m64 */
-  (entry[12]) = 0xa6;		/* disp32(RSI) */
-  (* ((uint32_t *) (&entry[13]))) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
+  (((int64_t *) entry)[-1]) = 0;
+  (entry[0]) = 0x41;		/* MOVB R9,imm8 */
+  (entry[1]) = 0xb1;
+  (entry[2]) = code;
+  (entry[3]) = 0xff;		/* JMP r/m64 */
+  (entry[4]) = 0xa6;		/* disp32(RSI) */
+  (* ((uint32_t *) (&entry[5]))) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
   return (false);
 }
 
diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h
index 579832eae..1d775e783 100644
--- a/src/microcode/cmpintmd/x86-64.h
+++ b/src/microcode/cmpintmd/x86-64.h
@@ -83,10 +83,10 @@ entry	8		symbol
 	2		zero
 	7		0x1A
 entry	8		MOV	RCX,imm64	48 b9 <addr64>  ; entry address
-	18		MOV	RAX,(RCX)	48 8b 01
-	21		ADD	RAX,RCX		48 01 c8
-	24		JMP	RAX		ff e0
-	26		<padding>
+	18		MOV	RAX,-8(RCX)	48 8b 41 f8
+	22		ADD	RAX,RCX		48 01 c8
+	25		JMP	RAX		ff e0
+	27		<padding>
 	32		<next cache>
 
 
@@ -99,27 +99,31 @@ nicely.
 	8		<entry count>
 	12		<type/arity info>       \__ format word
 	14		<gc offset>             /
-entry0	16		<offset>
-	24		<padding>
+	16		<pc offset>
+entry0	24		<padding>
 	28		<type/arity info>
 	30		<gc offset>
-entry1	32		...
-	...
-	16 + 16*n	<variables>
+	32		<pc offset>
+entry1	40		<padding>
+	44		<type/arity info>
+	46		<gc offset>
+	48		<pc offset>
+entry2	...
+	8 + 16*n	<variables>
 
 
 - Trampoline encoding:
 
-	-8		<padding>
-	-4		<type/arity info>
-	-2		<gc offset>
-entry	0		<offset>		08 00 00 00 00 00 00 00
-	8		MOVB	R9,code		41 b1 <code8>
-	11		JMP	n(RSI)		ff a6 <n32>
-	17		<padding>
-	24		<trampoline dependent storage>
+	-16		<padding>
+	-12		<type/arity info>
+	-10		<gc offset>
+	-8		<offset>		08 00 00 00 00 00 00 00
+entry	0		MOVB	R9,code		41 b1 <code8>
+	3		JMP	n(RSI)		ff a6 <n32>
+	9		<padding>
+	16		<trampoline dependent storage>
 
-  Distance from address in rcx to storage: 24.
+  Distance from address in rcx to storage: 16.
 
 */
 
@@ -143,9 +147,11 @@ typedef uint8_t insn_t;
 
 /* Number of insn_t units preceding entry address in which header
    (type and offset info) is stored.  */
-#define CC_ENTRY_HEADER_SIZE (CC_ENTRY_TYPE_SIZE + CC_ENTRY_OFFSET_SIZE)
+#define CC_ENTRY_HEADER_SIZE						\
+  (CC_ENTRY_TYPE_SIZE + CC_ENTRY_OFFSET_SIZE + CC_ENTRY_PC_OFFSET_SIZE)
 #define CC_ENTRY_TYPE_SIZE 2
 #define CC_ENTRY_OFFSET_SIZE 2
+#define CC_ENTRY_PC_OFFSET_SIZE 8
 
 /* Number of insn_t units preceding entry header in which GC trap
    instructions are stored.  This is an approximation: it matches only
@@ -157,7 +163,7 @@ typedef uint8_t insn_t;
 #define CC_ENTRY_GC_TRAP_SIZE 6
 
 #define CC_ENTRY_ADDRESS_PTR(e)		(e)
-#define CC_ENTRY_ADDRESS_PC(e)		((e) + (* ((const int64_t *) (e))))
+#define CC_ENTRY_ADDRESS_PC(e)		((e) + (((const int64_t *) (e))[-1]))
 
 #define CC_RETURN_ADDRESS_PTR(r)	0
 #define CC_RETURN_ADDRESS_PC(r)		(r)
-- 
2.25.1