From a6a461c06f036195c98bcf0f4f31bef7fe18859a Mon Sep 17 00:00:00 2001 From: Taylor R Campbell <campbell@mumble.net> Date: Thu, 20 Dec 2018 04:58:07 +0000 Subject: [PATCH] Avoid CALL without RET for closure entries, hooks, and trampolines. An unpaired CALL will wreck the CPU's return address branch target predictor. This is an intermediate change en route to using paired CALL/RET for continuation pushes and pop-returns in order to take advantage of the CPU's branch target predictor. WARNING: This changes the format of compiled closures, and as such, new compiled code requires a new microcode and vice versa. --- src/compiler/machines/x86-64/lapgen.scm | 15 ++++++++++-- src/compiler/machines/x86-64/machin.scm | 14 +++++++---- src/compiler/machines/x86-64/rules3.scm | 32 ++++++++++++++++++------- src/microcode/cmpauxmd/x86-64.m4 | 2 ++ src/microcode/cmpintmd/x86-64.c | 13 ++++++---- src/microcode/cmpintmd/x86-64.h | 32 +++++++++++++++---------- 6 files changed, 77 insertions(+), 31 deletions(-) diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm index 102431f9a..39156b324 100644 --- a/src/compiler/machines/x86-64/lapgen.scm +++ b/src/compiler/machines/x86-64/lapgen.scm @@ -711,8 +711,19 @@ USA. (define-integrable (invoke-hook entry) (LAP (JMP ,entry))) -(define-integrable (invoke-hook/call entry) - (LAP (CALL ,entry))) +(define (invoke-hook/call entry) + (let* ((get-pc (generate-label 'GET-PC)) + (hook-context (generate-label 'HOOK-CONTEXT))) + (LAP (CALL (@PCR ,get-pc)) + (LABEL ,get-pc) + ;; ADD r/m64,imm8 48 83 04 24 xx + ;; JMP r/m64 ff a6 yy yy yy yy + ;; Register displacement for JMP is always >=0x80, so can't + ;; fit in signed byte and thus must use 32-bit displacement. + ;; Hence xx = 0x0b = 11. 
+ (ADD Q (@R ,rsp) (& #x0b)) + (JMP ,entry) + (LABEL ,hook-context)))) (define-integrable (invoke-interface code) (LAP (MOV B (R ,rax) (& ,code)) diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm index 5d4cf4141..c874b9657 100644 --- a/src/compiler/machines/x86-64/machin.scm +++ b/src/compiler/machines/x86-64/machin.scm @@ -83,7 +83,7 @@ USA. ;;; See microcode/cmpintmd/x86-64.h for a description of the layout. -(define-integrable closure-entry-size 2) +(define-integrable closure-entry-size 3) ;units of objects (define-integrable address-units-per-closure-manifest address-units-per-object) (define-integrable address-units-per-entry-format-code 4) @@ -91,8 +91,12 @@ USA. (define-integrable address-units-per-closure-padding 4) ;;; (MOV Q (R ,rax) (&U <entry>)) 48 B8 <eight-byte immediate> -;;; (CALL (R ,rax)) FF D0 -(define-integrable address-units-per-closure-entry-instructions 12) +;;; (CALL (@PCR CALL-OFFSET)) E8 00 00 00 00 +;;; (LABEL CALL-OFFSET) +;;; (JMP (R ,rax)) FF E0 +;;; <padding> xx xx xx +(define-integrable address-units-per-closure-entry-call-offset 15) +(define-integrable address-units-per-closure-entry-instructions 20) (define-integrable address-units-per-closure-entry (+ address-units-per-entry-format-code @@ -100,8 +104,8 @@ USA. ;;; Note: ;;; -;;; (= address-units-per-closure-entry #| 16 |# -;;; (* closure-entry-size #| 2 |# address-units-per-object #| 8 |#)) +;;; (= address-units-per-closure-entry #| 24 |# +;;; (* closure-entry-size #| 3 |# address-units-per-object #| 8 |#)) ;;; Given the number of entries in a closure, and the index of an ;;; entry, return the number of words from that entry's closure diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm index a5d43d4da..71093be39 100644 --- a/src/compiler/machines/x86-64/rules3.scm +++ b/src/compiler/machines/x86-64/rules3.scm @@ -518,18 +518,34 @@ USA. 
(let* ((procedure-label (rtl-procedure/external-label (label->object label))) (MOV-offset (+ offset address-units-per-entry-format-code)) (imm64-offset (+ MOV-offset 2)) - (CALL-offset (+ imm64-offset 8))) + (CALL-offset (+ imm64-offset 8)) + (CALL-rel32-offset (+ CALL-offset 1)) + (JMP-offset (+ CALL-rel32-offset 4)) + (padding-offset (+ JMP-offset 2))) + CALL-rel32-offset JMP-offset padding-offset (LAP (MOV L (@RO ,regnum:free-pointer ,offset) (&U ,(make-closure-code-longword min max MOV-offset))) (LEA Q ,temp (@PCR ,procedure-label)) - ;; (MOV Q (R ,rax) (&U <procedure-label>)) - ;; The instruction sequence is really `48 b8', but this is a - ;; stupid little-endian architecture. I want my afternoon - ;; back. + ;; (MOV Q (R ,rax) (&U <procedure-label>)) 48 b8 (MOV W (@RO ,regnum:free-pointer ,MOV-offset) (&U #xB848)) (MOV Q (@RO ,regnum:free-pointer ,imm64-offset) ,temp) - ;; (CALL (R ,rax)) - (MOV W (@RO ,regnum:free-pointer ,CALL-offset) (&U #xD0FF))))) + ;; (CALL (@PCO 0)) e8 00 00 00 00 + ;; (JMP (R ,rax)) ff e0 + ;; (PADDING 0 8 #*00000000) 00 + (MOV Q ,temp (&U #x00E0FF00000000E8)) + (MOV Q (@RO ,regnum:free-pointer ,CALL-offset) ,temp) +#| + ;; (CALL (@PCO 0)) e8 00 00 00 00 + (MOV B (@RO ,regnum:free-pointer ,CALL-offset) (&U #xE8)) + (MOV Q (@RO ,regnum:free-pointer ,CALL-rel32-offset) (&U 0)) + ;; (JMP (R ,rax)) ff e0 + (MOV W (@RO ,regnum:free-pointer ,JMP-offset) (&U #xE0FF)) + #| + ;; (PADDING 0 8 #*00000000) 00 + (MOV B (@RO ,regnum:free-pointer ,PAD-offset) (&U #x00)) + |# +|# + ))) (define (generate/closure-header internal-label nentries) (let* ((rtl-proc (label->object internal-label)) @@ -570,7 +586,7 @@ USA. 
(define-integrable (closure-entry-magic) (- (make-non-pointer-literal (ucode-type COMPILED-ENTRY) 0) - address-units-per-closure-entry-instructions)) + address-units-per-closure-entry-call-offset)) (define-integrable (make-closure-manifest size) (make-multiclosure-manifest 1 size)) diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4 index 7177e0b3a..8773125bd 100644 --- a/src/microcode/cmpauxmd/x86-64.m4 +++ b/src/microcode/cmpauxmd/x86-64.m4 @@ -417,6 +417,8 @@ define_c_label(C_to_interface) define_hook_label(trampoline_to_interface) define_debugging_label(trampoline_to_interface) OP(pop,q) REG(rbx) # trampoline storage + # See x86-64.h for trampoline encoding layout. + OP(add,q) TW(IMM(9),REG(rbx)) # adjust ptr jmp scheme_to_interface define_hook_label(scheme_to_interface_call) diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c index bdefc2971..a03457bc9 100644 --- a/src/microcode/cmpintmd/x86-64.c +++ b/src/microcode/cmpintmd/x86-64.c @@ -104,7 +104,7 @@ compiled_closure_entry (insn_t * start) insn_t * compiled_closure_next (insn_t * start) { - return (start + CC_ENTRY_HEADER_SIZE + 12); + return (start + CC_ENTRY_HEADER_SIZE + 20); } SCHEME_OBJECT * @@ -175,7 +175,7 @@ write_uuo_target (insn_t * target, SCHEME_OBJECT * saddr) } #define BYTES_PER_TRAMPOLINE_ENTRY_PADDING 4 -#define OBJECTS_PER_TRAMPOLINE_ENTRY 2 +#define OBJECTS_PER_TRAMPOLINE_ENTRY 3 #define RSI_TRAMPOLINE_TO_INTERFACE_OFFSET \ ((COMPILER_REGBLOCK_N_FIXED + (2 * COMPILER_HOOK_SIZE)) \ @@ -199,8 +199,13 @@ store_trampoline_insns (insn_t * entry, uint8_t code) { (*entry++) = 0xB0; /* MOV AL,code */ (*entry++) = code; - (*entry++) = 0xFF; /* CALL /2 disp32(RSI) */ - (*entry++) = 0x96; + (*entry++) = 0xE8; /* CALL rel32 */ + (*entry++) = 0x00; /* zero displacement */ + (*entry++) = 0x00; + (*entry++) = 0x00; + (*entry++) = 0x00; + (*entry++) = 0xFF; /* JMP r/m64 */ + (*entry++) = 0xA6; /* disp32(RSI) */ (* ((uint32_t *) entry)) = 
RSI_TRAMPOLINE_TO_INTERFACE_OFFSET; return (false); } diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h index bed81db92..926a5c7ac 100644 --- a/src/microcode/cmpintmd/x86-64.h +++ b/src/microcode/cmpintmd/x86-64.h @@ -82,9 +82,8 @@ entry 8 symbol 0 16-bit arity 2 zero 7 0x1A -entry 8 MOV RAX,imm64 0x48 0xB8 - 10 <address> - 18 JMP (RAX) 0xFF 0xE0 +entry 8 MOV RAX,imm64 48 b8 <addr64> + 18 JMP (RAX) ff e0 19-23 <four bytes of padding> 24 <next cache> @@ -98,12 +97,15 @@ nicely. 8 <entry count> 12 <type/arity info> \__ format word 14 <gc offset> / -entry0 16 MOV RAX,imm64 0x48 0xB8 - 18 <address> - 26 CALL (RAX) 0xFF 0xD0 - 28 <four bytes of padding or next format word> +entry0 16 MOV RAX,imm64 48 b8 <imm64> + 26 CALL [RIP+0] e8 00 00 00 00 + 31 JMP (RAX) ff e0 + 33 <padding> 00 00 00 + 36 <type/arity info> + 38 <gc offset> +entry1 40 ... ... - 16*(n+1) <variables> + 16 + 24*n <variables> - Trampoline encoding: @@ -111,9 +113,13 @@ entry0 16 MOV RAX,imm64 0x48 0xB8 -8 <padding> -4 <type/arity info> -2 <gc offset> -entry 0 MOV AL,code 0xB0, code-byte - 2 CALL n(RSI) 0xFF 0x96 n-longword - 8 <trampoline dependent storage> +entry 0 MOV AL,code b0 <code8> + 2 CALL [RIP+0] e8 00 00 00 00 + 7 JMP n(RSI) ff a6 <n32> + 13 <padding> 00 00 00 + 16 <trampoline dependent storage> + + Distance from address on stack to trampoline storage: 16 - 7 = 9. */ @@ -145,7 +151,9 @@ typedef uint8_t insn_t; instructions are stored. This is an approximation: it matches only those non-closure procedures for which LIAR has generated interrupt checks, in which case there is one CALL n(RSI), which is encoded as - #xff #x96 <n>, where n is a longword (32 bits). */ + #xff #x96 <n>, where n is a longword (32 bits). + + XXX Stop using CALL for this. */ #define CC_ENTRY_GC_TRAP_SIZE 6 #define EMBEDDED_CLOSURE_ADDRS_P 1 -- 2.25.1