I hypothesize that this will help the CPU's branch target predictor
be more precise than having a single jmp instruction inside an
assembly hook that actually jumps to an unknown procedure.
Empirically, this gives about a 5x speed improvement for a
microbenchmark involving unknown procedure calls:
(define x (make-vector 1))
(define (test-01 x)
(define (g)
(vector-set! x 0 0))
(g)
((identity-procedure g)))
(define (test-10 x)
(define (g)
(vector-set! x 0 0))
((identity-procedure g))
(g))
(define (repeat n f x)
(show-time
(lambda ()
(do ((i 0 (fix:+ i 1)))
((fix:>= i n))
(f x)))))
; Before:
(repeat
10000000 test-01 x)
;process time: 1420 (1370 RUN + 50 GC); real time: 1427
; After:
(repeat
10000000 test-01 x)
;process time: 290 (220 RUN + 70 GC); real time: 312
Caveat: This is on top of a bunch of other experiments.
XXX Redo this in isolation.
WARNING: This adds hooks to the amd64 compiled code interface, so new
compiled code requires a new microcode. (However, a new microcode
should handle old compiled code just fine.)
shortcircuit-apply-size-7
shortcircuit-apply-size-8
interrupt-continuation-2
- fixnum-shift)
+ fixnum-shift
+ apply-setup
+ apply-setup-size-1
+ apply-setup-size-2
+ apply-setup-size-3
+ apply-setup-size-4
+ apply-setup-size-5
+ apply-setup-size-6
+ apply-setup-size-7
+ apply-setup-size-8)
\f
;; Operation tables
(INVOCATION:APPLY (? frame-size) (? continuation))
continuation
(expect-no-exit-interrupt-checks)
- (LAP ,@(clear-map!)
- (POP Q (R ,rbx))
- #|
- (MOV Q (R ,rdx) (&U ,frame-size))
- ,@(invoke-interface code:compiler-apply)
- |#
- ,@(case frame-size
- ((1) (invoke-hook entry:compiler-shortcircuit-apply-size-1))
- ((2) (invoke-hook entry:compiler-shortcircuit-apply-size-2))
- ((3) (invoke-hook entry:compiler-shortcircuit-apply-size-3))
- ((4) (invoke-hook entry:compiler-shortcircuit-apply-size-4))
- ((5) (invoke-hook entry:compiler-shortcircuit-apply-size-5))
- ((6) (invoke-hook entry:compiler-shortcircuit-apply-size-6))
- ((7) (invoke-hook entry:compiler-shortcircuit-apply-size-7))
- ((8) (invoke-hook entry:compiler-shortcircuit-apply-size-8))
- (else
- (LAP (MOV Q (R ,rdx) (&U ,frame-size))
- ,@(invoke-hook entry:compiler-shortcircuit-apply))))))
+ (let ((generic (generate-label 'GENERIC)))
+ (LAP ,@(clear-map!)
+ (POP Q (R ,rbx))
+ #|
+ (MOV Q (R ,rdx) (&U ,frame-size))
+ ,@(invoke-interface code:compiler-apply)
+ |#
+ #|
+ ,@(case frame-size
+ ((1) (invoke-hook entry:compiler-shortcircuit-apply-size-1))
+ ((2) (invoke-hook entry:compiler-shortcircuit-apply-size-2))
+ ((3) (invoke-hook entry:compiler-shortcircuit-apply-size-3))
+ ((4) (invoke-hook entry:compiler-shortcircuit-apply-size-4))
+ ((5) (invoke-hook entry:compiler-shortcircuit-apply-size-5))
+ ((6) (invoke-hook entry:compiler-shortcircuit-apply-size-6))
+ ((7) (invoke-hook entry:compiler-shortcircuit-apply-size-7))
+ ((8) (invoke-hook entry:compiler-shortcircuit-apply-size-8))
+ (else
+ (LAP (MOV Q (R ,rdx) (&U ,frame-size))
+ ,@(invoke-hook entry:compiler-shortcircuit-apply))))
+ |#
+ #|
+ (POP Q (R ,rcx)) ;Pop tagged entry into RCX.
+ (MOV Q (R ,rax) (R ,rcx)) ;Copy tagged entry into RAX.
+ (SHR Q (R ,rax) (&U ,scheme-datum-width)) ;Select tag in RAX.
+ (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;Select datum in RCX.
+ (CMP B (R ,rax) (&U ,(ucode-type COMPILED-ENTRY))) ;Check tag.
+ (JNE (@PCR ,generic)) ;Bail if not compiled entry.
+ (CMP B (@RO ,rcx -4) (&U ,frame-size)) ;Check arity.
+ (JNE (@PCR ,generic)) ;Bail if not exact arity match.
+ (MOV Q (R ,rax) (@R ,rcx)) ;Load offset into RAX.
+ (ADD Q (R ,rax) (R ,rcx)) ;Add offset to entry address in RAX.
+ (JMP (R ,rax))
+ (LABEL ,generic)
+ ,@(invoke-hook entry:compiler-shortcircuit-apply)
+ |#
+ ,@(case frame-size
+ ((1) (LAP (CALL ,entry:compiler-apply-setup-size-1)))
+ ((2) (LAP (CALL ,entry:compiler-apply-setup-size-2)))
+ ((3) (LAP (CALL ,entry:compiler-apply-setup-size-3)))
+ ((4) (LAP (CALL ,entry:compiler-apply-setup-size-4)))
+ ((5) (LAP (CALL ,entry:compiler-apply-setup-size-5)))
+ ((6) (LAP (CALL ,entry:compiler-apply-setup-size-6)))
+ ((7) (LAP (CALL ,entry:compiler-apply-setup-size-7)))
+ ((8) (LAP (CALL ,entry:compiler-apply-setup-size-8)))
+ (else
+ (LAP (MOV Q (R ,rdx) (&U ,frame-size))
+ (CALL ,entry:compiler-apply-setup))))
+ (JNE (@PCR ,generic))
+ (JMP (R ,rax))
+ (LABEL ,generic)
+ ,@(invoke-hook entry:compiler-shortcircuit-apply))))
(define-rule statement
(INVOCATION:JUMP (? frame-size) (? continuation) (? label))
# OP(mov,b) TW(IMM(HEX(14)),REG(al))
# jmp scheme_to_interface')
+# Check whether the object being applied can be jumped to directly.
+#
+# On entry: rbx holds the tagged entry, rdx holds the frame size
+# (nargs+1), and the return address pushed by CALL is on top of the
+# stack.
+#
+# If the tagged entry is a TC_COMPILED_ENTRY whose frame size matches
+# rdx, return with the condition codes set for equal, the untagged
+# entry address in rcx, and the PC in rax. Otherwise return with the
+# condition codes set for not-equal. rbx and rdx are never written,
+# so the caller can fall back to the generic apply hook.
+declare_alignment(2)
+define_hook_label(apply_setup)
+ OP(mov,q) TW(REG(rbx),REG(rax)) # Copy for type code
+ OP(mov,q) TW(REG(rbx),REG(rcx)) # Copy for address
+ OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) # Select type code
+ OP(and,q) TW(rmask,REG(rcx)) # Select datum
+ OP(cmp,b) TW(IMM(TC_COMPILED_ENTRY),REG(al))
+ jne asm_apply_setup_fail
+ # We now have a compiled entry, so it is safe to read its
+ # header. Check the frame size first, while rax is still
+ # free to use as scratch.
+ OP(movs,bq,x) TW(BOF(-4,REG(rcx)),REG(rax)) # Extract frame size
+ OP(cmp,q) TW(REG(rax),REG(rdx)) # Compare to nargs+1
+ jne asm_apply_setup_fail
+ # Compute the PC only after the frame size check, so that the
+ # PC in rax is not overwritten by the frame size. The add
+ # clobbers the flags, so compare rax with itself to hand the
+ # caller an equal condition.
+ OP(mov,q) TW(IND(REG(rcx)),REG(rax)) # rax := PC offset
+ OP(add,q) TW(REG(rcx),REG(rax)) # rax := PC
+ OP(cmp,q) TW(REG(rax),REG(rax)) # Set condition codes for equal
+ ret
+
+asm_apply_setup_fail:
+ ret
+
+# Macro that emits the hook apply_setup_size_N for a frame size N
+# known at compile time (instantiated below for N = 1 through 8).
+# The hook reads the tagged entry from rbx. If it is a
+# TC_COMPILED_ENTRY whose frame size byte equals N, the hook returns
+# with the condition codes set for equal, the untagged entry address
+# in rcx, and the PC in rax. Otherwise it returns with the condition
+# codes set for not-equal and N loaded into rdx, presumably because
+# the generic apply hook expects the frame size there (confirm
+# against sc_apply).
+define(define_apply_setup_fixed_size,
+`declare_alignment(2)
+define_hook_label(apply_setup_size_$1)
+ OP(mov,q) TW(REG(rbx),REG(rax)) # Copy for type code
+ OP(mov,q) TW(REG(rbx),REG(rcx)) # Copy for address
+ OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) # Select type code
+ OP(and,q) TW(rmask,REG(rcx)) # Select datum
+ OP(cmp,b) TW(IMM(TC_COMPILED_ENTRY),REG(al))
+ jne asm_apply_setup_size_$1_fail
+ # The PC is computed before the frame size check because the
+ # add clobbers the flags; the cmp below must be the last
+ # flag-setting instruction before the return.
+ OP(mov,q) TW(IND(REG(rcx)),REG(rax)) # rax := PC offset
+ OP(add,q) TW(REG(rcx),REG(rax)) # rax := PC
+ OP(cmp,b) TW(IMM($1),BOF(-4,REG(rcx))) # Compare frame size
+ jne asm_apply_setup_size_$1_fail # to nargs+1
+ ret # Equal: rax holds the PC
+
+asm_apply_setup_size_$1_fail:
+ # mov leaves the flags alone, so the not-equal condition from
+ # the failed compare is still visible to the caller.
+ OP(mov,q) TW(IMM(HEX($1)),REG(rdx)) # rdx := frame size
+ ret')
+
+define_apply_setup_fixed_size(1)
+define_apply_setup_fixed_size(2)
+define_apply_setup_fixed_size(3)
+define_apply_setup_fixed_size(4)
+define_apply_setup_fixed_size(5)
+define_apply_setup_fixed_size(6)
+define_apply_setup_fixed_size(7)
+define_apply_setup_fixed_size(8)
+
declare_alignment(2)
define_hook_label(sc_apply)
OP(mov,q) TW(REG(rbx),REG(rax)) # Copy for type code
SETUP_REGISTER (asm_interrupt_continuation_2); /* 39 */
SETUP_REGISTER (asm_fixnum_shift); /* 40 */
+ SETUP_REGISTER (asm_apply_setup); /* 41 */
+ SETUP_REGISTER (asm_apply_setup_size_1); /* 42 */
+ SETUP_REGISTER (asm_apply_setup_size_2); /* 43 */
+ SETUP_REGISTER (asm_apply_setup_size_3); /* 44 */
+ SETUP_REGISTER (asm_apply_setup_size_4); /* 45 */
+ SETUP_REGISTER (asm_apply_setup_size_5); /* 46 */
+ SETUP_REGISTER (asm_apply_setup_size_6); /* 47 */
+ SETUP_REGISTER (asm_apply_setup_size_7); /* 48 */
+ SETUP_REGISTER (asm_apply_setup_size_8); /* 49 */
#ifdef _MACH_UNIX
{
extern void asm_sc_apply_size_6 (void);
extern void asm_sc_apply_size_7 (void);
extern void asm_sc_apply_size_8 (void);
+extern void asm_apply_setup (void);
+extern void asm_apply_setup_size_1 (void);
+extern void asm_apply_setup_size_2 (void);
+extern void asm_apply_setup_size_3 (void);
+extern void asm_apply_setup_size_4 (void);
+extern void asm_apply_setup_size_5 (void);
+extern void asm_apply_setup_size_6 (void);
+extern void asm_apply_setup_size_7 (void);
+extern void asm_apply_setup_size_8 (void);
extern void asm_scheme_to_interface (void);
extern void asm_scheme_to_interface_call (void);
extern void asm_serialize_cache (void);