From 67adeda088f018ba07ad79e0b875a12d516c4623 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 27 Dec 2018 03:58:38 +0000 Subject: [PATCH] Use indirection for entry points on amd64. A compiled entry is now a tagged address A pointing to a 64-bit word W such that A + W points to instruction to execute. This adds a memory indirection overhead to unknown procedure calls, but it has the effect that consing a closure only involves writing data memory, not instruction memory that must be reloaded into the CPU's instruction cache. WARNING: This changes the amd64 compiled code interface, so you'll have to build a new compiler running on an old microcode and use that to compile a new system afresh. --- src/compiler/machines/x86-64/insutl.scm | 7 ++ src/compiler/machines/x86-64/lapgen.scm | 5 +- src/compiler/machines/x86-64/machin.scm | 15 +-- src/compiler/machines/x86-64/rules3.scm | 123 ++++++++++++------------ src/microcode/cmpauxmd/x86-64.m4 | 46 +++++---- src/microcode/cmpintmd/x86-64.c | 97 ++++++++++++++----- src/microcode/cmpintmd/x86-64.h | 58 ++++++----- src/microcode/gcloop.c | 11 +-- 8 files changed, 209 insertions(+), 153 deletions(-) diff --git a/src/compiler/machines/x86-64/insutl.scm b/src/compiler/machines/x86-64/insutl.scm index 0de5a9c8c..cef02135b 100644 --- a/src/compiler/machines/x86-64/insutl.scm +++ b/src/compiler/machines/x86-64/insutl.scm @@ -181,6 +181,13 @@ USA. (R/M 5) (BITS (32 `(- ,label (+ *PC* 4)) SIGNED))) + ((@PCRO (? label) (? offset)) + (CATEGORIES MEMORY) + (REX) + (MODE #b00) + (R/M 5) + (BITS (32 `(- (+ ,label ,offset) (+ *PC* 4)) SIGNED))) + ((@PCO (? offset signed-long)) (CATEGORIES MEMORY) (REX) diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm index 39156b324..8595951e2 100644 --- a/src/compiler/machines/x86-64/lapgen.scm +++ b/src/compiler/machines/x86-64/lapgen.scm @@ -114,7 +114,8 @@ USA. (set! *external-labels* (cons label *external-labels*)) (LAP (WORD U ,code) (BLOCK-OFFSET ,label) - (LABEL ,label))) + (LABEL ,label) + (QUAD U 8))) (define-integrable (make-code-word min max) (+ (* #x100 min) max)) @@ -221,7 +222,7 @@ USA. (LAP (MOV Q ,target (@PCR ,label-expr)))) (define (load-pc-relative-address target label-expr) - (LAP (LEA Q ,target (@PCR ,label-expr)))) + (LAP (LEA Q ,target (@PCR ,label-expr)))) (define (compare/register*register reg1 reg2) (cond ((register-alias reg1 'GENERAL) diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm index c874b9657..08c3bf1b9 100644 --- a/src/compiler/machines/x86-64/machin.scm +++ b/src/compiler/machines/x86-64/machin.scm @@ -83,20 +83,15 @@ USA. ;;; See microcode/cmpintmd/x86-64.h for a description of the layout. -(define-integrable closure-entry-size 3) ;units of objects +(define-integrable closure-entry-size 2) ;units of objects (define-integrable address-units-per-closure-manifest address-units-per-object) (define-integrable address-units-per-entry-format-code 4) (define-integrable address-units-per-closure-entry-count 4) (define-integrable address-units-per-closure-padding 4) -;;; (MOV Q (R ,rax) (&U )) 48 B8 -;;; (CALL (@PCR CALL-OFFSET)) E8 00 00 00 00 -;;; (LABEL CALL-OFFSET) -;;; (JMP (R ,rax)) FF E0 -;;; xx xx xx -(define-integrable address-units-per-closure-entry-call-offset 15) -(define-integrable address-units-per-closure-entry-instructions 20) +;;; Just a 64-bit offset and four bytes of padding. +(define-integrable address-units-per-closure-entry-instructions 12) (define-integrable address-units-per-closure-entry (+ address-units-per-entry-format-code @@ -104,8 +99,8 @@ USA. ;;; Note: ;;; -;;; (= address-units-per-closure-entry #| 24 |# -;;; (* closure-entry-size #| 3 |# address-units-per-object #| 8 |#)) +;;; (= address-units-per-closure-entry #| 16 |# +;;; (* closure-entry-size #| 2 |# address-units-per-object #| 8 |#)) ;;; Given the number of entries in a closure, and the index of an ;;; entry, return the number of words from that entry's closure diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm index 71093be39..4c97dd085 100644 --- a/src/compiler/machines/x86-64/rules3.scm +++ b/src/compiler/machines/x86-64/rules3.scm @@ -39,8 +39,10 @@ USA. (cond ((null? checks) (current-bblock-continue! (make-new-sblock - (LAP (POP Q (R ,rax)) ; continuation - (AND Q (R ,rax) (R ,regnum:datum-mask)) ; clear type + (LAP (POP Q (R ,rcx)) ; continuation + (AND Q (R ,rcx) (R ,regnum:datum-mask)) ; clear type + (MOV Q (R ,rax) (@R ,rcx)) ;rax := PC offset + (ADD Q (R ,rax) (R ,rcx)) ;rax := PC (JMP (R ,rax)))))) ((block-association 'POP-RETURN) => current-bblock-continue!) @@ -50,8 +52,10 @@ USA. (let ((interrupt-label (generate-label 'INTERRUPT))) (LAP (CMP Q (R ,regnum:free-pointer) ,reg:compiled-memtop) (JGE (@PCR ,interrupt-label)) - (POP Q (R ,rax)) ; continuation - (AND Q (R ,rax) (R ,regnum:datum-mask)) ; clear type + (POP Q (R ,rcx)) ; continuation + (AND Q (R ,rcx) (R ,regnum:datum-mask)) ; clear type + (MOV Q (R ,rax) (@R ,rcx)) ;rax := PC offset + (ADD Q (R ,rax) (R ,rcx)) ;rax := PC (JMP (R ,rax)) (LABEL ,interrupt-label) ,@(invoke-hook @@ -88,7 +92,12 @@ USA. frame-size continuation (expect-no-exit-interrupt-checks) (LAP ,@(clear-map!) - (JMP (@PCR ,label)))) + ;; Every label for code we can jump to starts with a 64-bit + ;; offset to the actual code, always equal to 8. We could + ;; invent the bookkeeping to map the external label to the + ;; actual code label, but that's more work than I want to do + ;; right now. + (JMP (@PCRO ,label 8)))) (define-rule statement (INVOCATION:COMPUTED-JUMP (? frame-size) (? continuation)) @@ -96,8 +105,10 @@ USA. ;; It expects the procedure at the top of the stack (expect-no-exit-interrupt-checks) (LAP ,@(clear-map!) - (POP Q (R ,rax)) - (AND Q (R ,rax) (R ,regnum:datum-mask)) ;clear type code + (POP Q (R ,rcx)) + (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;clear type code + (MOV Q (R ,rax) (@R ,rcx)) ;rax := PC offset + (ADD Q (R ,rax) (R ,rcx)) ;rax := PC (JMP (R ,rax)))) (define-rule statement @@ -471,8 +482,8 @@ USA. address-units-per-closure-entry-instructions address-units-per-closure-padding)) (free-offset - (+ slots-offset (* size address-units-per-object)))) - (LAP (MOV Q ,temp (&U ,(make-closure-manifest size))) + (+ slots-offset (* (+ 1 size) address-units-per-object)))) + (LAP (MOV Q ,temp (&U ,(make-closure-manifest (+ 1 size)))) (MOV Q (@R ,regnum:free-pointer) ,temp) ;; There's only one entry point here. (MOV L (@RO ,regnum:free-pointer ,data-offset) (&U 1)) @@ -482,7 +493,12 @@ USA. ;; Bump FREE. ,@(with-signed-immediate-operand free-offset (lambda (addend) - (LAP (ADD Q (R ,regnum:free-pointer) ,addend))))))) + (LAP (ADD Q (R ,regnum:free-pointer) ,addend)))) + ;; Set the last component to be the relocation reference point. + (MOV Q ,temp + (&U ,(make-non-pointer-literal (ucode-type COMPILED-ENTRY) 0))) + (OR Q ,temp ,target) + (MOV Q (@RO ,regnum:free-pointer -8) ,temp)))) (define (generate/cons-multiclosure target nentries size entries) (let* ((mtarget (target-register target)) @@ -504,62 +520,54 @@ USA. (free-offset (+ first-format-offset (* nentries address-units-per-closure-entry) - (* size address-units-per-object)))) - (LAP (MOV Q ,temp (&U ,(make-multiclosure-manifest nentries size))) + (* (+ 1 size) address-units-per-object)))) + (LAP (MOV Q ,temp (&U ,(make-multiclosure-manifest nentries (+ 1 size)))) (MOV Q (@R ,regnum:free-pointer) ,temp) (MOV L (@RO ,regnum:free-pointer ,data-offset) (&U ,nentries)) ,@(generate-entries entries first-format-offset) (LEA Q ,target (@RO ,regnum:free-pointer ,first-pc-offset)) ,@(with-signed-immediate-operand free-offset (lambda (addend) - (LAP (ADD Q (R ,regnum:free-pointer) ,addend)))))))) + (LAP (ADD Q (R ,regnum:free-pointer) ,addend)))) + ;; Set the last component to be the relocation reference point. + (MOV Q ,temp + (&U ,(make-non-pointer-literal (ucode-type COMPILED-ENTRY) 0))) + (OR Q ,temp ,target) + (MOV Q (@RO ,regnum:free-pointer -8) ,temp))))) (define (generate-closure-entry label min max offset temp) (let* ((procedure-label (rtl-procedure/external-label (label->object label))) - (MOV-offset (+ offset address-units-per-entry-format-code)) - (imm64-offset (+ MOV-offset 2)) - (CALL-offset (+ imm64-offset 8)) - (CALL-rel32-offset (+ CALL-offset 1)) - (JMP-offset (+ CALL-rel32-offset 4)) - (padding-offset (+ JMP-offset 2))) - CALL-rel32-offset JMP-offset padding-offset + (addr-offset (+ offset address-units-per-entry-format-code)) + (padding-offset (+ addr-offset 8))) + padding-offset (LAP (MOV L (@RO ,regnum:free-pointer ,offset) - (&U ,(make-closure-code-longword min max MOV-offset))) - (LEA Q ,temp (@PCR ,procedure-label)) - ;; (MOV Q (R ,rax) (&U )) 48 b8 - (MOV W (@RO ,regnum:free-pointer ,MOV-offset) (&U #xB848)) - (MOV Q (@RO ,regnum:free-pointer ,imm64-offset) ,temp) - ;; (CALL (@PCO 0)) e8 00 00 00 00 - ;; (JMP (R ,rax)) ff e0 - ;; (PADDING 0 8 #*00000000) 00 - (MOV Q ,temp (&U #x00E0FF00000000E8)) - (MOV Q (@RO ,regnum:free-pointer ,CALL-offset) ,temp) -#| - ;; (CALL (@PCO 0)) e8 00 00 00 00 - (MOV B (@RO ,regnum:free-pointer ,CALL-offset) (&U #xE8)) - (MOV Q (@RO ,regnum:free-pointer ,CALL-rel32-offset) (&U 0)) - ;; (JMP (R ,rax)) ff e0 - (MOV W (@RO ,regnum:free-pointer ,JMP-offset) (&U #xE0FF)) - #| - ;; (PADDING 0 8 #*00000000) 00 - (MOV B (@RO ,regnum:free-pointer ,PAD-offset) (&U #x00)) - |# -|# - ))) + (&U ,(make-closure-code-longword min max addr-offset))) + ;; Set temp := procedure-label + 8 - addr-offset. + (LEA Q ,temp (@PCR (- (+ ,procedure-label 8) ,addr-offset))) + ;; Set temp := procedure-label + 8 - addr-offset - free. + (SUB Q ,temp (R ,regnum:free-pointer)) + ;; Store temp = procedure-label + 8 - (free + addr-offset). + (MOV Q (@RO ,regnum:free-pointer ,addr-offset) ,temp)))) (define (generate/closure-header internal-label nentries) (let* ((rtl-proc (label->object internal-label)) (external-label (rtl-procedure/external-label rtl-proc)) - (checks (get-entry-interrupt-checks))) + (checks (get-entry-interrupt-checks)) + (type (ucode-type COMPILED-ENTRY))) (define (label+adjustment) (LAP ,@(make-external-label internal-entry-code-word external-label) - ;; Assumption: RAX is not in use here. (In fact, it is - ;; used to store the absolute address of this header.) - ;; See comment by CLOSURE-ENTRY-MAGIC to understand - ;; what's going on here. - (MOV Q (R ,rax) (&U ,(closure-entry-magic))) - (ADD Q (@R ,rsp) (R ,rax)) - (LABEL ,internal-label))) + ;; rcx holds the untagged entry address. Push and tag it. + ;; All other temporary registers, notably rax, are free. + (MOV Q (R ,rax) (&U ,(make-non-pointer-literal type 0))) + (OR Q (R ,rcx) (R ,rax)) + (PUSH Q (R ,rcx)) + ;; Jump past a bogus faux offset. We need this because + ;; INVOCATION:JUMP jumps to the label + 8, and at the moment + ;; I haven't found a good way to make it skip the +8 part + ;; for closures. + (JMP (@PCRO ,internal-label 8)) + (LABEL ,internal-label) + (QUAD U 8))) (cond ((zero? nentries) (LAP (EQUATE ,external-label ,internal-label) ,@(simple-procedure-header @@ -575,19 +583,6 @@ USA. (else (label+adjustment))))) -;;; On entry to a closure, the quadword at the top of the stack will -;;; be an untagged pointer to the byte following the CALL instruction -;;; that led the machine there. CLOSURE-ENTRY-MAGIC returns a number -;;; that, when added to this quadword, yields the tagged compiled -;;; entry that was used to invoke the closure. This is what the RTL -;;; deals with, and this is what interrupt handlers want, particularly -;;; for the garbage collector, which wants to find only nice tagged -;;; pointers on the stack. - -(define-integrable (closure-entry-magic) - (- (make-non-pointer-literal (ucode-type COMPILED-ENTRY) 0) - address-units-per-closure-entry-call-offset)) - (define-integrable (make-closure-manifest size) (make-multiclosure-manifest 1 size)) @@ -811,8 +806,10 @@ USA. (lambda (cache) (let ((frame-size (car cache)) (label (cdr cache))) + ;; Must match UUO_LINK_SIZE in cmpintmd/x86-64.h. `((,frame-size . ,label) (,variable . ,(allocate-constant-label)) + (#F . ,(allocate-constant-label)) (#F . ,(allocate-constant-label)))))) (cdr variable.caches))) variable.caches-list)) diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4 index 8773125bd..fd8859cb3 100644 --- a/src/microcode/cmpauxmd/x86-64.m4 +++ b/src/microcode/cmpauxmd/x86-64.m4 @@ -416,9 +416,8 @@ define_c_label(C_to_interface) define_hook_label(trampoline_to_interface) define_debugging_label(trampoline_to_interface) - OP(pop,q) REG(rbx) # trampoline storage - # See x86-64.h for trampoline encoding layout. - OP(add,q) TW(IMM(9),REG(rbx)) # adjust ptr + OP(add,q) TW(IMM(16),REG(rcx)) # trampoline storage + OP(mov,q) TW(REG(rcx),REG(rbx)) # argument in rbx jmp scheme_to_interface define_hook_label(scheme_to_interface_call) @@ -486,7 +485,9 @@ ifdef(`WIN32', # Register block = %rsi OP(mov,q) TW(REG(rax),REG(rcx)) # Preserve if used OP(and,q) TW(rmask,REG(rcx)) # Restore potential dynamic link OP(mov,q) TW(REG(rcx),QOF(REGBLOCK_DLINK(),regs)) - jmp IJMP(REG(rdx)) + OP(mov,q) TW(REG(rdx),REG(rcx)) # rcx := entry addr + OP(add,q) TW(IND(REG(rcx)),REG(rdx)) # rcx := PC + jmp IJMP(REG(rdx)) # Invoke IF_WIN32(` use_external_code(EFR(WinntExceptionTransferHook)) @@ -566,7 +567,9 @@ define_hook_label(sc_apply) OP(movs,bq,x) TW(BOF(-4,REG(rcx)),REG(rax)) # Extract frame size OP(cmp,q) TW(REG(rax),REG(rdx)) # Compare to nargs+1 jne asm_sc_apply_generic - jmp IJMP(REG(rcx)) # Invoke + OP(mov,q) TW(IND(REG(rcx)),REG(rax)) # rax := PC offset + OP(add,q) TW(REG(rcx),REG(rax)) # rax := PC + jmp IJMP(REG(rax)) # Invoke define_debugging_label(asm_sc_apply_generic) OP(mov,q) TW(IMM(HEX(14)),REG(rax)) @@ -583,7 +586,9 @@ define_hook_label(sc_apply_size_$1) jne asm_sc_apply_generic_$1 OP(cmp,b) TW(IMM($1),BOF(-4,REG(rcx))) # Compare frame size jne asm_sc_apply_generic_$1 # to nargs+1 - jmp IJMP(REG(rcx)) + OP(mov,q) TW(IND(REG(rcx)),REG(rax)) # rax := PC offset + OP(add,q) TW(REG(rcx),REG(rax)) # rax := PC + jmp IJMP(REG(rax)) # Invoke asm_sc_apply_generic_$1: OP(mov,q) TW(IMM($1),REG(rdx)) @@ -605,39 +610,40 @@ define_apply_fixed_size(8) ### numeric types are much faster than the rare ones ### (bignums, ratnums, recnums) +declare_alignment(2) +asm_generic_return_rax: + OP(mov,q) TW(REG(rax),QOF(REGBLOCK_VAL(),regs)) + OP(pop,q) REG(rcx) + OP(and,q) TW(rmask,REG(rcx)) + OP(mov,q) TW(IND(REG(rcx)),REG(rax)) + OP(add,q) TW(REG(rcx),REG(rax)) + jmp IJMP(REG(rax)) + declare_alignment(2) asm_generic_fixnum_result: - OP(and,q) TW(rmask,IND(REG(rsp))) OP(or,b) TW(IMM(TC_FIXNUM),REG(al)) OP(ror,q) TW(IMM(TC_LENGTH),REG(rax)) - OP(mov,q) TW(REG(rax),QOF(REGBLOCK_VAL(),regs)) - ret + jmp asm_generic_return_rax declare_alignment(2) asm_generic_flonum_result: - OP(and,q) TW(rmask,IND(REG(rsp))) OP(mov,q) TW(IMM_MANIFEST_NM_VECTOR_1,REG(rcx)) OP(mov,q) TW(REG(rcx),IND(rfree)) movsd TW(REG(xmm0),QOF(FLONUM_DATA_OFFSET,rfree)) OP(mov,q) TW(IMM_FLONUM_0,REG(rax)) OP(or,q) TW(rfree,REG(rax)) OP(lea,q) TW(QOF(FLONUM_STORAGE_SIZE,rfree),rfree) - OP(mov,q) TW(REG(rax),QOF(REGBLOCK_VAL(),regs)) - ret + jmp asm_generic_return_rax declare_alignment(2) asm_generic_return_sharp_t: - OP(and,q) TW(rmask,IND(REG(rsp))) OP(mov,q) TW(IMM_TRUE,REG(rax)) - OP(mov,q) TW(REG(rax),QOF(REGBLOCK_VAL(),regs)) - ret + jmp asm_generic_return_rax declare_alignment(2) asm_generic_return_sharp_f: - OP(and,q) TW(rmask,IND(REG(rsp))) OP(mov,q) TW(IMM_FALSE,REG(rax)) - OP(mov,q) TW(REG(rax),QOF(REGBLOCK_VAL(),regs)) - ret + jmp asm_generic_return_rax define(define_unary_operation, `declare_alignment(2) @@ -838,10 +844,8 @@ asm_generic_divide_zero_by_flo: ucomisd TW(REG(xmm1),REG(xmm0)) jp asm_generic_divide_fail je asm_generic_divide_fail - OP(and,q) TW(rmask,IND(REG(rsp))) OP(mov,q) TW(IMM_FIXNUM_0,REG(rax)) - OP(mov,q) TW(REG(rax),QOF(REGBLOCK_VAL(),regs)) - ret + jmp asm_generic_return_rax asm_generic_divide_flo_by_flo: # Numerator (rdx) and denominator (rbx) are both flonums. diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c index a03457bc9..3cf4f9aa9 100644 --- a/src/microcode/cmpintmd/x86-64.c +++ b/src/microcode/cmpintmd/x86-64.c @@ -66,18 +66,54 @@ write_cc_entry_offset (cc_entry_offset_t * ceo, insn_t * address) /* Compiled closures */ -/* MOV RAX,imm64 has two bytes of opcode cruft before the imm64. */ +/* start_closure_reloation(scan, ref) + + `scan' points at the manifest of a compiled closure. Initialize + `ref' with whatever we need to relocate the entries in it. */ + +void +start_closure_relocation (SCHEME_OBJECT * scan, reloc_ref_t * ref) +{ + /* The last element of the block is always the tagged first entry of + the closure, which tells us where the closure was in oldspace. */ + (ref->old_addr) = (CC_ENTRY_ADDRESS (* ((CC_BLOCK_ADDR_END (scan)) - 1))); + /* Find the address of the first entry in newspace. */ + (ref->new_addr) + = (tospace_to_newspace + (compiled_closure_entry (compiled_closure_start (scan + 1)))); +} + +/* read_compiled_closure_target(start, ref) + + `start' points to the start of a closure entry in tospace, beginning + with the format word and block offset. `ref' was initialized with + `start_closure_relocation'. Return the untagged compiled entry + address in oldspace that the closure entry points to. */ insn_t * -read_compiled_closure_target (insn_t * start) +read_compiled_closure_target (insn_t * start, reloc_ref_t * ref) { - return (* ((insn_t **) (start + CC_ENTRY_HEADER_SIZE + 2))); + insn_t * addr = (start + CC_ENTRY_HEADER_SIZE); + insn_t * base = (tospace_to_newspace (addr)); + /* If we're relocating, find where base was in the oldspace. */ + if (ref) + base += (ref->old_addr - ref->new_addr); + return (base + (* ((int64_t *) addr)) - 8); } +/* write_compiled_closure_target(target, start) + + `target' is an untagged compiled entry address in newspace. `start' + points to the start of a closure entry in tospace, beginning with + the format word and block offset. Set the closure entry at `start' + to go to `target'. */ + void write_compiled_closure_target (insn_t * target, insn_t * start) { - (* ((insn_t **) (start + CC_ENTRY_HEADER_SIZE + 2))) = target; + insn_t * addr = (start + CC_ENTRY_HEADER_SIZE); + (* ((int64_t *) addr)) = + (target - ((insn_t *) (tospace_to_newspace (addr))) + 8); } unsigned long @@ -104,7 +140,7 @@ compiled_closure_entry (insn_t * start) insn_t * compiled_closure_next (insn_t * start) { - return (start + CC_ENTRY_HEADER_SIZE + 20); + return (start + CC_ENTRY_HEADER_SIZE + 12); } SCHEME_OBJECT * @@ -117,9 +153,7 @@ skip_compiled_closure_padding (insn_t * start) SCHEME_OBJECT compiled_closure_entry_to_target (insn_t * entry) { - /* `entry' points to the start of the MOV RAX,imm64 instruction, - which has two bytes of opcode cruft before the imm64. */ - return (MAKE_CC_ENTRY (* ((long *) (entry + 2)))); + return (MAKE_CC_ENTRY (entry + (* ((int64_t *) entry)) - 8)); } /* Execution caches (UUO links) @@ -151,8 +185,12 @@ read_uuo_frame_size (SCHEME_OBJECT * saddr) insn_t * read_uuo_target (SCHEME_OBJECT * saddr) { - insn_t * mov_addr = ((insn_t *) (saddr + 1)); - return (* ((insn_t **) (mov_addr + 2))); + /* Skip the arity. */ + insn_t * addr = ((insn_t *) (saddr + 1)); + assert ((addr[0]) == 0x48); + assert ((addr[1]) == 0xb9); + /* 0x48 0xb9 */ + return (* ((insn_t **) (&addr[2]))); } insn_t * @@ -166,12 +204,23 @@ write_uuo_target (insn_t * target, SCHEME_OBJECT * saddr) { /* Skip the arity. */ insn_t * addr = ((insn_t *) (saddr + 1)); - (*addr++) = 0x48; /* REX.W (64-bit operand size prefix) */ - (*addr++) = 0xB8; /* MOV RAX,imm64 */ - (* ((insn_t **) addr)) = target; - addr += 8; - (*addr++) = 0xFF; /* JMP reg/mem64 */ - (*addr++) = 0xE0; /* ModR/M for RAX */ + (addr[0]) = 0x48; /* MOV RCX,imm64 */ + (addr[1]) = 0xb9; + (* ((insn_t **) (&addr[2]))) = target; + /* It is tempting to precompute the PC here, but this doesn't work + when the target is a compiled closure, because if we are doing + this during garbage collection, although the closure itself has + been relocated by now, the compiled code block to which it points + has not yet. Maybe it would be worthwhile to arrange the GC to + give us the */ + (addr[10]) = 0x48; /* MOV RAX,(RCX) */ + (addr[11]) = 0x8b; + (addr[12]) = 0x01; + (addr[13]) = 0x48; /* ADD RAX,RCX */ + (addr[14]) = 0x01; + (addr[15]) = 0xc8; + (addr[16]) = 0xff; /* JMP RAX */ + (addr[17]) = 0xe0; } #define BYTES_PER_TRAMPOLINE_ENTRY_PADDING 4 @@ -197,16 +246,12 @@ trampoline_entry_addr (SCHEME_OBJECT * block, unsigned long index) bool store_trampoline_insns (insn_t * entry, uint8_t code) { - (*entry++) = 0xB0; /* MOV AL,code */ - (*entry++) = code; - (*entry++) = 0xE8; /* CALL rel32 */ - (*entry++) = 0x00; /* zero displacement */ - (*entry++) = 0x00; - (*entry++) = 0x00; - (*entry++) = 0x00; - (*entry++) = 0xFF; /* JMP r/m64 */ - (*entry++) = 0xA6; /* disp32(RSI) */ - (* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET; + (* ((int64_t *) (&entry[0]))) = 8; + (entry[8]) = 0xb0; /* MOVB RAX,imm8 */ + (entry[9]) = code; + (entry[10]) = 0xff; /* JMP r/m64 */ + (entry[11]) = 0xa6; /* disp32(RSI) */ + (* ((uint32_t *) (&entry[12]))) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET; return (false); } diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h index 926a5c7ac..6307bfb6c 100644 --- a/src/microcode/cmpintmd/x86-64.h +++ b/src/microcode/cmpintmd/x86-64.h @@ -74,18 +74,20 @@ modes and jump instructions are all 64 bits by default. 2 zero [TC_FIXNUM | arity] 7 0x1A / entry 8 symbol - 16 - 24 + 16 + 32 After linking 0 16-bit arity 2 zero 7 0x1A -entry 8 MOV RAX,imm64 48 b8 - 18 JMP (RAX) ff e0 - 19-23 - 24 +entry 8 MOV RCX,imm64 48 b9 ; entry address + 18 MOV RAX,(RCX) 48 8b 01 + 21 ADD RAX,RCX 48 01 c8 + 24 JMP RAX ff e0 + 26 + 32 - Closures: @@ -97,15 +99,13 @@ nicely. 8 12 \__ format word 14 / -entry0 16 MOV RAX,imm64 48 b8 - 26 CALL [RIP+0] e8 00 00 00 00 - 31 JMP (RAX) ff e0 - 33 00 00 00 - 36 - 38 -entry1 40 ... +entry0 16 + 24 + 28 + 30 +entry1 32 ... ... - 16 + 24*n + 16 + 16*n - Trampoline encoding: @@ -113,13 +113,12 @@ entry1 40 ... -8 -4 -2 -entry 0 MOV AL,code b0 - 2 CALL [RIP+0] e8 00 00 00 00 - 7 JMP n(RSI) ff a6 - 13 00 00 00 +entry 0 08 00 00 00 00 00 00 00 + 8 MOV AL,code b0 + 10 JMP n(RSI) ff a6 16 - Distance from address on stack to trampoline storage: 16 - 7 = 9. + Distance from address in rcx to storage: 16. */ @@ -158,18 +157,26 @@ typedef uint8_t insn_t; #define EMBEDDED_CLOSURE_ADDRS_P 1 -#define DECLARE_RELOCATION_REFERENCE(name) +typedef struct +{ + insn_t * old_addr; + insn_t * new_addr; +} reloc_ref_t; -#define START_CLOSURE_RELOCATION(scan, ref) do {} while (0) -#define START_OPERATOR_RELOCATION(scan, ref) do {} while (0) +#define DECLARE_RELOCATION_REFERENCE(name) reloc_ref_t name + +#define START_CLOSURE_RELOCATION(scan, ref) \ + start_closure_relocation ((scan), (&ref)) + +#define START_OPERATOR_RELOCATION(scan, ref) do {(void)ref;} while (0) #define OPERATOR_RELOCATION_OFFSET 0 #define READ_COMPILED_CLOSURE_TARGET(a, r) \ - read_compiled_closure_target (a) + read_compiled_closure_target ((a), (&r)) /* Size of execution cache in SCHEME_OBJECTS. */ -#define UUO_LINK_SIZE 3 +#define UUO_LINK_SIZE 4 #define UUO_WORDS_TO_COUNT(nw) ((nw) / UUO_LINK_SIZE) #define UUO_COUNT_TO_WORDS(nc) ((nc) * UUO_LINK_SIZE) @@ -243,7 +250,8 @@ extern void asm_scheme_to_interface_call (void); extern void asm_serialize_cache (void); extern void asm_trampoline_to_interface (void); -extern insn_t * read_compiled_closure_target (insn_t *); +extern void start_closure_relocation (SCHEME_OBJECT *, reloc_ref_t *); +extern insn_t * read_compiled_closure_target (insn_t *, reloc_ref_t *); extern insn_t * read_uuo_target (SCHEME_OBJECT *); extern void x86_64_reset_hook (void); diff --git a/src/microcode/gcloop.c b/src/microcode/gcloop.c index 412786e4a..f5aa71dfb 100644 --- a/src/microcode/gcloop.c +++ b/src/microcode/gcloop.c @@ -737,12 +737,11 @@ DEFINE_GC_HANDLER (gc_handle_manifest_closure) unsigned long count = (compiled_closure_count (scan)); while (count > 0) { - write_compiled_closure_target - ((GC_CC_ENTRY_TO_RAW_ADDRESS - (GC_HANDLE_CC_ENTRY - (GC_RAW_ADDRESS_TO_CC_ENTRY - (READ_COMPILED_CLOSURE_TARGET (start, ref))))), - start); + insn_t * otarget = (READ_COMPILED_CLOSURE_TARGET (start, ref)); + SCHEME_OBJECT oentry = (GC_RAW_ADDRESS_TO_CC_ENTRY (otarget)); + SCHEME_OBJECT nentry = (GC_HANDLE_CC_ENTRY (oentry)); + insn_t * ntarget = (GC_CC_ENTRY_TO_RAW_ADDRESS (nentry)); + write_compiled_closure_target (ntarget, start); start = (compiled_closure_next (start)); count -= 1; } -- 2.25.1