From 09b4d9204b0934205e576b9a615bb8037ea447c4 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Sat, 29 Dec 2018 04:12:28 +0000 Subject: [PATCH] Generate per-invocation jmp instructions. I hypothesize that this will help the CPU's branch target predictor be more precise than having a single jmp instruction inside an assembly hook that actually jumps to an unknown procedure. Empirically, this gives about 5x speed improvement for a microbenchmark involving unknown procedure calls: (define x (make-vector 1)) (define (test-01 x) (define (g) (vector-set! x 0 0)) (g) ((identity-procedure g))) (define (test-10 x) (define (g) (vector-set! x 0 0)) ((identity-procedure g)) (g)) (define (repeat n f x) (show-time (lambda () (do ((i 0 (fix:+ i 1))) ((fix:>= i n)) (f x))))) ; Before: (repeat 10000000 test-01 x) ;process time: 1420 (1370 RUN + 50 GC); real time: 1427 ; After: (repeat 10000000 test-01 x) ;process time: 290 (220 RUN + 70 GC); real time: 312 Caveat: This is on top of a bunch of other experiments. XXX Redo this in isolation. WARNING: This adds hooks to the amd64 compiled code interface, so new compiled code requires a new microcode. (However, a new microcode should handle old compiled code just fine.) --- src/compiler/machines/x86-64/lapgen.scm | 11 +++- src/compiler/machines/x86-64/rules3.scm | 70 ++++++++++++++++++------- src/microcode/cmpauxmd/x86-64.m4 | 57 ++++++++++++++++++++ src/microcode/cmpintmd/x86-64.c | 9 ++++ src/microcode/cmpintmd/x86-64.h | 9 ++++ 5 files changed, 137 insertions(+), 19 deletions(-) diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm index 6a1a63ff5..9d24fe7da 100644 --- a/src/compiler/machines/x86-64/lapgen.scm +++ b/src/compiler/machines/x86-64/lapgen.scm @@ -788,7 +788,16 @@ USA. 
shortcircuit-apply-size-7 shortcircuit-apply-size-8 interrupt-continuation-2 - fixnum-shift) + fixnum-shift + apply-setup + apply-setup-size-1 + apply-setup-size-2 + apply-setup-size-3 + apply-setup-size-4 + apply-setup-size-5 + apply-setup-size-6 + apply-setup-size-7 + apply-setup-size-8) ;; Operation tables diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm index 74c747be8..150900e23 100644 --- a/src/compiler/machines/x86-64/rules3.scm +++ b/src/compiler/machines/x86-64/rules3.scm @@ -62,24 +62,58 @@ USA. (INVOCATION:APPLY (? frame-size) (? continuation)) continuation (expect-no-exit-interrupt-checks) - (LAP ,@(clear-map!) - (POP Q (R ,rbx)) - #| - (MOV Q (R ,rdx) (&U ,frame-size)) - ,@(invoke-interface code:compiler-apply) - |# - ,@(case frame-size - ((1) (invoke-hook entry:compiler-shortcircuit-apply-size-1)) - ((2) (invoke-hook entry:compiler-shortcircuit-apply-size-2)) - ((3) (invoke-hook entry:compiler-shortcircuit-apply-size-3)) - ((4) (invoke-hook entry:compiler-shortcircuit-apply-size-4)) - ((5) (invoke-hook entry:compiler-shortcircuit-apply-size-5)) - ((6) (invoke-hook entry:compiler-shortcircuit-apply-size-6)) - ((7) (invoke-hook entry:compiler-shortcircuit-apply-size-7)) - ((8) (invoke-hook entry:compiler-shortcircuit-apply-size-8)) - (else - (LAP (MOV Q (R ,rdx) (&U ,frame-size)) - ,@(invoke-hook entry:compiler-shortcircuit-apply)))))) + (let ((generic (generate-label 'GENERIC))) + (LAP ,@(clear-map!) 
+ (POP Q (R ,rbx)) + #| + (MOV Q (R ,rdx) (&U ,frame-size)) + ,@(invoke-interface code:compiler-apply) + |# + #| + ,@(case frame-size + ((1) (invoke-hook entry:compiler-shortcircuit-apply-size-1)) + ((2) (invoke-hook entry:compiler-shortcircuit-apply-size-2)) + ((3) (invoke-hook entry:compiler-shortcircuit-apply-size-3)) + ((4) (invoke-hook entry:compiler-shortcircuit-apply-size-4)) + ((5) (invoke-hook entry:compiler-shortcircuit-apply-size-5)) + ((6) (invoke-hook entry:compiler-shortcircuit-apply-size-6)) + ((7) (invoke-hook entry:compiler-shortcircuit-apply-size-7)) + ((8) (invoke-hook entry:compiler-shortcircuit-apply-size-8)) + (else + (LAP (MOV Q (R ,rdx) (&U ,frame-size)) + ,@(invoke-hook entry:compiler-shortcircuit-apply)))) + |# + #| + (POP Q (R ,rcx)) ;Pop tagged entry into RCX. + (MOV Q (R ,rax) (R ,rcx)) ;Copy tagged entry into RAX. + (SHR Q (R ,rax) (&U ,scheme-datum-width)) ;Select tag in RAX. + (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;Select datum in RCX. + (CMP B (R ,rax) (&U ,(ucode-type COMPILED-ENTRY))) ;Check tag. + (JNE (@PCR ,generic)) ;Bail if not compiled entry. + (CMP B (@RO ,rcx -4) (&U ,frame-size)) ;Check arity. + (JNE (@PCR ,generic)) ;Bail if not exact arity match. + (MOV Q (R ,rax) (@R ,rcx)) ;Load offset into RAX. + (ADD Q (R ,rax) (R ,rcx)) ;Add offset to entry address in RAX. 
+         (JMP (R ,rax))
+         (LABEL ,generic)
+         ,@(invoke-hook entry:compiler-shortcircuit-apply)
+         |#
+         ,@(case frame-size
+             ((1) (LAP (CALL ,entry:compiler-apply-setup-size-1)))
+             ((2) (LAP (CALL ,entry:compiler-apply-setup-size-2)))
+             ((3) (LAP (CALL ,entry:compiler-apply-setup-size-3)))
+             ((4) (LAP (CALL ,entry:compiler-apply-setup-size-4)))
+             ((5) (LAP (CALL ,entry:compiler-apply-setup-size-5)))
+             ((6) (LAP (CALL ,entry:compiler-apply-setup-size-6)))
+             ((7) (LAP (CALL ,entry:compiler-apply-setup-size-7)))
+             ((8) (LAP (CALL ,entry:compiler-apply-setup-size-8)))
+             (else
+              (LAP (MOV Q (R ,rdx) (&U ,frame-size))
+                   (CALL ,entry:compiler-apply-setup))))
+         (JNE (@PCR ,generic))
+         (JMP (R ,rax))
+         (LABEL ,generic)
+         ,@(invoke-hook entry:compiler-shortcircuit-apply))))
 
 (define-rule statement
   (INVOCATION:JUMP (? frame-size) (? continuation) (? label))
diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4
index d10c67d72..ce5a808b1 100644
--- a/src/microcode/cmpauxmd/x86-64.m4
+++ b/src/microcode/cmpauxmd/x86-64.m4
@@ -570,6 +570,63 @@ define_call_indirection(primitive_error,36)
 #	OP(mov,b)	TW(IMM(HEX(14)),REG(al))
 #	jmp	scheme_to_interface')
 
+# Stack has untagged return address, rbx has tagged entry, rdx has
+# argument count.  If tagged entry is TC_COMPILED_ENTRY of the correct
+# arity, set condition codes for equal, store the untagged entry
+# address in rcx, and store the PC in rax.  Otherwise, set condition
+# codes for not-equal; rax and rcx are clobbered.  Either way, pop the
+# return address and return.
+declare_alignment(2)
+define_hook_label(apply_setup)
+	OP(mov,q)	TW(REG(rbx),REG(rax))		# Copy for type code
+	OP(mov,q)	TW(REG(rbx),REG(rcx))		# Copy for address
+	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))	# Select type code
+	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
+	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
+	jne asm_apply_setup_fail
+	# We now have a compiled entry.  Check the frame size first,
+	# using rax as scratch, so the PC computed below survives.
+	OP(movs,bq,x)	TW(BOF(-4,REG(rcx)),REG(rax))	# Extract frame size
+	OP(cmp,q)	TW(REG(rax),REG(rdx))		# Compare to nargs+1
+	jne asm_apply_setup_fail
+	OP(mov,q)	TW(IND(REG(rcx)),REG(rax))	# rax := PC offset
+	OP(add,q)	TW(REG(rcx),REG(rax))		# rax := PC
+	# The add clobbered the flags, and the caller branches on
+	# them after we return, so set `equal' again.
+	OP(cmp,q)	TW(REG(rax),REG(rax))		# ZF := 1
+	ret
+
+asm_apply_setup_fail:
+	ret
+
+define(define_apply_setup_fixed_size,
+`declare_alignment(2)
+define_hook_label(apply_setup_size_$1)
+	OP(mov,q)	TW(REG(rbx),REG(rax))		# Copy for type code
+	OP(mov,q)	TW(REG(rbx),REG(rcx))		# Copy for address
+	OP(shr,q)	TW(IMM(DATUM_LENGTH),REG(rax))	# Select type code
+	OP(and,q)	TW(rmask,REG(rcx))		# Select datum
+	OP(cmp,b)	TW(IMM(TC_COMPILED_ENTRY),REG(al))
+	jne asm_apply_setup_size_$1_fail
+	OP(mov,q)	TW(IND(REG(rcx)),REG(rax))	# rax := PC offset
+	OP(add,q)	TW(REG(rcx),REG(rax))		# rax := PC
+	OP(cmp,b)	TW(IMM($1),BOF(-4,REG(rcx)))	# Compare frame size
+	jne asm_apply_setup_size_$1_fail		# to nargs+1
+	ret
+
+asm_apply_setup_size_$1_fail:
+	OP(mov,q)	TW(IMM(HEX($1)),REG(rdx))	# rdx := frame size
+	ret')
+
+define_apply_setup_fixed_size(1)
+define_apply_setup_fixed_size(2)
+define_apply_setup_fixed_size(3)
+define_apply_setup_fixed_size(4)
+define_apply_setup_fixed_size(5)
+define_apply_setup_fixed_size(6)
+define_apply_setup_fixed_size(7)
+define_apply_setup_fixed_size(8)
+
 declare_alignment(2)
 define_hook_label(sc_apply)
 	OP(mov,q)	TW(REG(rbx),REG(rax))		# Copy for type code
diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c
index 19e168e3b..6a631d552 100644
--- a/src/microcode/cmpintmd/x86-64.c
+++ b/src/microcode/cmpintmd/x86-64.c
@@ -347,6 +347,15 @@ x86_64_reset_hook (void)
 
   SETUP_REGISTER (asm_interrupt_continuation_2);	/* 39 */
   SETUP_REGISTER (asm_fixnum_shift);			/* 40 */
+  SETUP_REGISTER (asm_apply_setup);			/* 41 */
+  SETUP_REGISTER (asm_apply_setup_size_1);		/* 42 */
+  SETUP_REGISTER
(asm_apply_setup_size_2); /* 43 */ + SETUP_REGISTER (asm_apply_setup_size_3); /* 44 */ + SETUP_REGISTER (asm_apply_setup_size_4); /* 45 */ + SETUP_REGISTER (asm_apply_setup_size_5); /* 46 */ + SETUP_REGISTER (asm_apply_setup_size_6); /* 47 */ + SETUP_REGISTER (asm_apply_setup_size_7); /* 48 */ + SETUP_REGISTER (asm_apply_setup_size_8); /* 49 */ #ifdef _MACH_UNIX { diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h index 4acce5472..6189ab156 100644 --- a/src/microcode/cmpintmd/x86-64.h +++ b/src/microcode/cmpintmd/x86-64.h @@ -255,6 +255,15 @@ extern void asm_sc_apply_size_5 (void); extern void asm_sc_apply_size_6 (void); extern void asm_sc_apply_size_7 (void); extern void asm_sc_apply_size_8 (void); +extern void asm_apply_setup (void); +extern void asm_apply_setup_size_1 (void); +extern void asm_apply_setup_size_2 (void); +extern void asm_apply_setup_size_3 (void); +extern void asm_apply_setup_size_4 (void); +extern void asm_apply_setup_size_5 (void); +extern void asm_apply_setup_size_6 (void); +extern void asm_apply_setup_size_7 (void); +extern void asm_apply_setup_size_8 (void); extern void asm_scheme_to_interface (void); extern void asm_scheme_to_interface_call (void); extern void asm_serialize_cache (void); -- 2.25.1