From a6a461c06f036195c98bcf0f4f31bef7fe18859a Mon Sep 17 00:00:00 2001
From: Taylor R Campbell <campbell@mumble.net>
Date: Thu, 20 Dec 2018 04:58:07 +0000
Subject: [PATCH] Avoid CALL without RET for closure entries, hooks, and
 trampolines.

An unpaired CALL wrecks the CPU's return address branch target predictor.

This is an intermediate change en route to using paired CALL/RET for
continuation pushes and pop-returns in order to take advantage of the
CPU's branch target predictor.

WARNING: This changes the format of compiled closures, and as such,
new compiled code requires a new microcode and vice versa.
---
 src/compiler/machines/x86-64/lapgen.scm | 15 ++++++++++--
 src/compiler/machines/x86-64/machin.scm | 14 +++++++----
 src/compiler/machines/x86-64/rules3.scm | 32 ++++++++++++++++++-------
 src/microcode/cmpauxmd/x86-64.m4        |  2 ++
 src/microcode/cmpintmd/x86-64.c         | 13 ++++++----
 src/microcode/cmpintmd/x86-64.h         | 32 +++++++++++++++----------
 6 files changed, 77 insertions(+), 31 deletions(-)

diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm
index 102431f9a..39156b324 100644
--- a/src/compiler/machines/x86-64/lapgen.scm
+++ b/src/compiler/machines/x86-64/lapgen.scm
@@ -711,8 +711,19 @@ USA.
 (define-integrable (invoke-hook entry)
   (LAP (JMP ,entry)))
 
-(define-integrable (invoke-hook/call entry)
-  (LAP (CALL ,entry)))
+(define (invoke-hook/call entry)
+  (let* ((get-pc (generate-label 'GET-PC))
+	 (hook-context (generate-label 'HOOK-CONTEXT)))
+    (LAP (CALL (@PCR ,get-pc))
+	(LABEL ,get-pc)
+	 ;; ADD r/m64,imm8		48 83 04 24 xx
+	 ;; JMP r/m64			ff a6 yy yy yy yy
+	 ;; Register displacement for JMP is always >=0x80, so can't
+	 ;; fit in signed byte and thus must use 32-bit displacement.
+	 ;; Hence xx = 0x0b = 11.
+	 (ADD Q (@R ,rsp) (& #x0b))
+	 (JMP ,entry)
+	(LABEL ,hook-context))))
 
 (define-integrable (invoke-interface code)
   (LAP (MOV B (R ,rax) (& ,code))
diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm
index 5d4cf4141..c874b9657 100644
--- a/src/compiler/machines/x86-64/machin.scm
+++ b/src/compiler/machines/x86-64/machin.scm
@@ -83,7 +83,7 @@ USA.
 
 ;;; See microcode/cmpintmd/x86-64.h for a description of the layout.
 
-(define-integrable closure-entry-size 2)
+(define-integrable closure-entry-size 3) ;units of objects
 
 (define-integrable address-units-per-closure-manifest address-units-per-object)
 (define-integrable address-units-per-entry-format-code 4)
@@ -91,8 +91,12 @@ USA.
 (define-integrable address-units-per-closure-padding 4)
 
 ;;; (MOV Q (R ,rax) (&U <entry>))	48 B8 <eight-byte immediate>
-;;; (CALL (R ,rax))			FF D0
-(define-integrable address-units-per-closure-entry-instructions 12)
+;;; (CALL (@PCR CALL-OFFSET))		E8 00 00 00 00
+;;; (LABEL CALL-OFFSET)
+;;; (JMP (R ,rax))			FF E0
+;;; <padding>				xx xx xx
+(define-integrable address-units-per-closure-entry-call-offset 15)
+(define-integrable address-units-per-closure-entry-instructions 20)
 
 (define-integrable address-units-per-closure-entry
   (+ address-units-per-entry-format-code
@@ -100,8 +104,8 @@ USA.
 
 ;;; Note:
 ;;;
-;;; (= address-units-per-closure-entry #| 16 |#
-;;;    (* closure-entry-size #| 2 |# address-units-per-object #| 8 |#))
+;;; (= address-units-per-closure-entry #| 24 |#
+;;;    (* closure-entry-size #| 3 |# address-units-per-object #| 8 |#))
 
 ;;; Given the number of entries in a closure, and the index of an
 ;;; entry, return the number of words from that entry's closure
diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm
index a5d43d4da..71093be39 100644
--- a/src/compiler/machines/x86-64/rules3.scm
+++ b/src/compiler/machines/x86-64/rules3.scm
@@ -518,18 +518,34 @@ USA.
   (let* ((procedure-label (rtl-procedure/external-label (label->object label)))
 	 (MOV-offset (+ offset address-units-per-entry-format-code))
 	 (imm64-offset (+ MOV-offset 2))
-	 (CALL-offset (+ imm64-offset 8)))
+	 (CALL-offset (+ imm64-offset 8))
+	 (CALL-rel32-offset (+ CALL-offset 1))
+	 (JMP-offset (+ CALL-rel32-offset 4))
+	 (padding-offset (+ JMP-offset 2)))
+    CALL-rel32-offset JMP-offset padding-offset
     (LAP (MOV L (@RO ,regnum:free-pointer ,offset)
 	      (&U ,(make-closure-code-longword min max MOV-offset)))
 	 (LEA Q ,temp (@PCR ,procedure-label))
-	 ;; (MOV Q (R ,rax) (&U <procedure-label>))
-	 ;; The instruction sequence is really `48 b8', but this is a
-	 ;; stupid little-endian architecture.  I want my afternoon
-	 ;; back.
+	 ;; (MOV Q (R ,rax) (&U <procedure-label>))	48 b8
 	 (MOV W (@RO ,regnum:free-pointer ,MOV-offset) (&U #xB848))
 	 (MOV Q (@RO ,regnum:free-pointer ,imm64-offset) ,temp)
-	 ;; (CALL (R ,rax))
-	 (MOV W (@RO ,regnum:free-pointer ,CALL-offset) (&U #xD0FF)))))
+	 ;; (CALL (@PCO 0))				e8 00 00 00 00
+	 ;; (JMP (R ,rax))				ff e0
+	 ;; (PADDING 0 8 #*00000000)			00
+	 (MOV Q ,temp (&U #x00E0FF00000000E8))
+	 (MOV Q (@RO ,regnum:free-pointer ,CALL-offset) ,temp)
+#|
+	 ;; (CALL (@PCO 0))				e8 00 00 00 00
+	 (MOV B (@RO ,regnum:free-pointer ,CALL-offset) (&U #xE8))
+	 (MOV L (@RO ,regnum:free-pointer ,CALL-rel32-offset) (&U 0))
+	 ;; (JMP (R ,rax))				ff e0
+	 (MOV W (@RO ,regnum:free-pointer ,JMP-offset) (&U #xE0FF))
+	 #|
+	 ;; (PADDING 0 8 #*00000000)			00
+	 (MOV B (@RO ,regnum:free-pointer ,padding-offset) (&U #x00))
+	 |#
+|#
+	 )))
 
 (define (generate/closure-header internal-label nentries)
   (let* ((rtl-proc (label->object internal-label))
@@ -570,7 +586,7 @@ USA.
 
 (define-integrable (closure-entry-magic)
   (- (make-non-pointer-literal (ucode-type COMPILED-ENTRY) 0)
-     address-units-per-closure-entry-instructions))
+     address-units-per-closure-entry-call-offset))
 
 (define-integrable (make-closure-manifest size)
   (make-multiclosure-manifest 1 size))
diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4
index 7177e0b3a..8773125bd 100644
--- a/src/microcode/cmpauxmd/x86-64.m4
+++ b/src/microcode/cmpauxmd/x86-64.m4
@@ -417,6 +417,8 @@ define_c_label(C_to_interface)
 define_hook_label(trampoline_to_interface)
 define_debugging_label(trampoline_to_interface)
 	OP(pop,q)	REG(rbx)			# trampoline storage
+	# See x86-64.h for trampoline encoding layout.
+	OP(add,q)	TW(IMM(9),REG(rbx))		# adjust ptr
 	jmp	scheme_to_interface
 
 define_hook_label(scheme_to_interface_call)
diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c
index bdefc2971..a03457bc9 100644
--- a/src/microcode/cmpintmd/x86-64.c
+++ b/src/microcode/cmpintmd/x86-64.c
@@ -104,7 +104,7 @@ compiled_closure_entry (insn_t * start)
 insn_t *
 compiled_closure_next (insn_t * start)
 {
-  return (start + CC_ENTRY_HEADER_SIZE + 12);
+  return (start + CC_ENTRY_HEADER_SIZE + 20);
 }
 
 SCHEME_OBJECT *
@@ -175,7 +175,7 @@ write_uuo_target (insn_t * target, SCHEME_OBJECT * saddr)
 }
 
 #define BYTES_PER_TRAMPOLINE_ENTRY_PADDING 4
-#define OBJECTS_PER_TRAMPOLINE_ENTRY 2
+#define OBJECTS_PER_TRAMPOLINE_ENTRY 3
 
 #define RSI_TRAMPOLINE_TO_INTERFACE_OFFSET				\
   ((COMPILER_REGBLOCK_N_FIXED + (2 * COMPILER_HOOK_SIZE))		\
@@ -199,8 +199,13 @@ store_trampoline_insns (insn_t * entry, uint8_t code)
 {
   (*entry++) = 0xB0;		/* MOV AL,code */
   (*entry++) = code;
-  (*entry++) = 0xFF;		/* CALL /2 disp32(RSI) */
-  (*entry++) = 0x96;
+  (*entry++) = 0xE8;		/* CALL rel32 */
+  (*entry++) = 0x00;		/* zero displacement */
+  (*entry++) = 0x00;
+  (*entry++) = 0x00;
+  (*entry++) = 0x00;
+  (*entry++) = 0xFF;		/* JMP r/m64 */
+  (*entry++) = 0xA6;		/* disp32(RSI) */
   (* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
   return (false);
 }
diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h
index bed81db92..926a5c7ac 100644
--- a/src/microcode/cmpintmd/x86-64.h
+++ b/src/microcode/cmpintmd/x86-64.h
@@ -82,9 +82,8 @@ entry	8		symbol
 	0		16-bit arity
 	2		zero
 	7		0x1A
-entry	8		MOV RAX,imm64		0x48 0xB8
-	10		<address>
-	18		JMP (RAX)		0xFF 0xE0
+entry	8		MOV	RAX,imm64	48 b8 <addr64>
+	18		JMP	(RAX)		ff e0
 	19-23		<four bytes of padding>
 	24		<next cache>
 
@@ -98,12 +97,15 @@ nicely.
 	8		<entry count>
 	12		<type/arity info>       \__ format word
 	14		<gc offset>             /
-entry0	16		MOV RAX,imm64		0x48 0xB8
-	18		<address>
-	26		CALL (RAX)		0xFF 0xD0
-	28		<four bytes of padding or next format word>
+entry0	16		MOV	RAX,imm64	48 b8 <imm64>
+	26		CALL	[RIP+0]		e8 00 00 00 00
+	31		JMP	(RAX)		ff e0
+	33		<padding>		00 00 00
+	36		<type/arity info>
+	38		<gc offset>
+entry1	40		...
 	...
-	16*(n+1)	<variables>
+	16 + 24*n	<variables>
 
 
 - Trampoline encoding:
@@ -111,9 +113,13 @@ entry0	16		MOV RAX,imm64		0x48 0xB8
 	-8		<padding>
 	-4		<type/arity info>
 	-2		<gc offset>
-entry	0		MOV	AL,code		0xB0, code-byte
-	2		CALL	n(RSI)		0xFF 0x96 n-longword
-	8		<trampoline dependent storage>
+entry	0		MOV	AL,code		b0 <code8>
+	2		CALL	[RIP+0]		e8 00 00 00 00
+	7		JMP	n(RSI)		ff a6 <n32>
+	13		<padding>		00 00 00
+	16		<trampoline dependent storage>
+
+  Distance from address on stack to trampoline storage: 16 - 7 = 9.
 
 */
 
@@ -145,7 +151,9 @@ typedef uint8_t insn_t;
    instructions are stored.  This is an approximation: it matches only
    those non-closure procedures for which LIAR has generated interrupt
    checks, in which case there is one CALL n(RSI), which is encoded as
-   #xff #x96 <n>, where n is a longword (32 bits).  */
+   #xff #x96 <n>, where n is a longword (32 bits).
+
+   XXX Stop using CALL for this.  */
 #define CC_ENTRY_GC_TRAP_SIZE 6
 
 #define EMBEDDED_CLOSURE_ADDRS_P 1
-- 
2.25.1