This will wreck the CPU's return address branch target predictor.
This is an intermediate change en route to using paired CALL/RET for
continuation pushes and pop-returns in order to take advantage of the
CPU's branch target predictor.
WARNING: This changes the format of compiled closures, and as such,
new compiled code requires a new microcode and vice versa.
(define-integrable (invoke-hook entry)
(LAP (JMP ,entry)))
-(define-integrable (invoke-hook/call entry)
- (LAP (CALL ,entry)))
+(define (invoke-hook/call entry)
+ (let* ((get-pc (generate-label 'GET-PC))
+ (hook-context (generate-label 'HOOK-CONTEXT)))
+ (LAP (CALL (@PCR ,get-pc))
+ (LABEL ,get-pc)
+ ;; ADD r/m64,imm8 48 83 04 24 xx
+ ;; JMP r/m64 ff a6 yy yy yy yy
+ ;; Register displacement for JMP is always >=0x80, so can't
+ ;; fit in signed byte and thus must use 32-bit displacement.
+ ;; Hence xx = 0x0b = 11.
+ (ADD Q (@R ,rsp) (& #x0b))
+ (JMP ,entry)
+ (LABEL ,hook-context))))
(define-integrable (invoke-interface code)
(LAP (MOV B (R ,rax) (& ,code))
;;; See microcode/cmpintmd/x86-64.h for a description of the layout.
-(define-integrable closure-entry-size 2)
+(define-integrable closure-entry-size 3) ;units of objects
(define-integrable address-units-per-closure-manifest address-units-per-object)
(define-integrable address-units-per-entry-format-code 4)
(define-integrable address-units-per-closure-padding 4)
;;; (MOV Q (R ,rax) (&U <entry>)) 48 B8 <eight-byte immediate>
-;;; (CALL (R ,rax)) FF D0
-(define-integrable address-units-per-closure-entry-instructions 12)
+;;; (CALL (@PCR CALL-OFFSET)) E8 00 00 00 00
+;;; (LABEL CALL-OFFSET)
+;;; (JMP (R ,rax)) FF E0
+;;; <padding> xx xx xx
+(define-integrable address-units-per-closure-entry-call-offset 15)
+(define-integrable address-units-per-closure-entry-instructions 20)
(define-integrable address-units-per-closure-entry
(+ address-units-per-entry-format-code
;;; Note:
;;;
-;;; (= address-units-per-closure-entry #| 16 |#
-;;; (* closure-entry-size #| 2 |# address-units-per-object #| 8 |#))
+;;; (= address-units-per-closure-entry #| 24 |#
+;;; (* closure-entry-size #| 3 |# address-units-per-object #| 8 |#))
;;; Given the number of entries in a closure, and the index of an
;;; entry, return the number of words from that entry's closure
(let* ((procedure-label (rtl-procedure/external-label (label->object label)))
(MOV-offset (+ offset address-units-per-entry-format-code))
(imm64-offset (+ MOV-offset 2))
- (CALL-offset (+ imm64-offset 8)))
+ (CALL-offset (+ imm64-offset 8))
+ (CALL-rel32-offset (+ CALL-offset 1))
+ (JMP-offset (+ CALL-rel32-offset 4))
+ (padding-offset (+ JMP-offset 2)))
+ CALL-rel32-offset JMP-offset padding-offset
(LAP (MOV L (@RO ,regnum:free-pointer ,offset)
(&U ,(make-closure-code-longword min max MOV-offset)))
(LEA Q ,temp (@PCR ,procedure-label))
- ;; (MOV Q (R ,rax) (&U <procedure-label>))
- ;; The instruction sequence is really `48 b8', but this is a
- ;; stupid little-endian architecture. I want my afternoon
- ;; back.
+ ;; (MOV Q (R ,rax) (&U <procedure-label>)) 48 b8
(MOV W (@RO ,regnum:free-pointer ,MOV-offset) (&U #xB848))
(MOV Q (@RO ,regnum:free-pointer ,imm64-offset) ,temp)
- ;; (CALL (R ,rax))
- (MOV W (@RO ,regnum:free-pointer ,CALL-offset) (&U #xD0FF)))))
+ ;; (CALL (@PCO 0)) e8 00 00 00 00
+ ;; (JMP (R ,rax)) ff e0
+ ;; (PADDING 0 8 #*00000000) 00
+ (MOV Q ,temp (&U #x00E0FF00000000E8))
+ (MOV Q (@RO ,regnum:free-pointer ,CALL-offset) ,temp)
+#|
+ ;; (CALL (@PCO 0)) e8 00 00 00 00
+ (MOV B (@RO ,regnum:free-pointer ,CALL-offset) (&U #xE8))
+ (MOV Q (@RO ,regnum:free-pointer ,CALL-rel32-offset) (&U 0))
+ ;; (JMP (R ,rax)) ff e0
+ (MOV W (@RO ,regnum:free-pointer ,JMP-offset) (&U #xE0FF))
+ #|
+ ;; (PADDING 0 8 #*00000000) 00
+ (MOV B (@RO ,regnum:free-pointer ,padding-offset) (&U #x00))
+ |#
+|#
+ )))
\f
(define (generate/closure-header internal-label nentries)
(let* ((rtl-proc (label->object internal-label))
(define-integrable (closure-entry-magic)
(- (make-non-pointer-literal (ucode-type COMPILED-ENTRY) 0)
- address-units-per-closure-entry-instructions))
+ address-units-per-closure-entry-call-offset))
(define-integrable (make-closure-manifest size)
(make-multiclosure-manifest 1 size))
define_hook_label(trampoline_to_interface)
define_debugging_label(trampoline_to_interface)
OP(pop,q) REG(rbx) # trampoline storage
+ # See x86-64.h for trampoline encoding layout.
+ OP(add,q) TW(IMM(9),REG(rbx)) # adjust ptr
jmp scheme_to_interface
define_hook_label(scheme_to_interface_call)
insn_t *
compiled_closure_next (insn_t * start)
{
- return (start + CC_ENTRY_HEADER_SIZE + 12);
+ return (start + CC_ENTRY_HEADER_SIZE + 20);
}
SCHEME_OBJECT *
}
\f
#define BYTES_PER_TRAMPOLINE_ENTRY_PADDING 4
-#define OBJECTS_PER_TRAMPOLINE_ENTRY 2
+#define OBJECTS_PER_TRAMPOLINE_ENTRY 3
#define RSI_TRAMPOLINE_TO_INTERFACE_OFFSET \
((COMPILER_REGBLOCK_N_FIXED + (2 * COMPILER_HOOK_SIZE)) \
{
(*entry++) = 0xB0; /* MOV AL,code */
(*entry++) = code;
- (*entry++) = 0xFF; /* CALL /2 disp32(RSI) */
- (*entry++) = 0x96;
+ (*entry++) = 0xE8; /* CALL rel32 */
+ (*entry++) = 0x00; /* zero displacement */
+ (*entry++) = 0x00;
+ (*entry++) = 0x00;
+ (*entry++) = 0x00;
+ (*entry++) = 0xFF; /* JMP r/m64 */
+ (*entry++) = 0xA6; /* disp32(RSI) */
(* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET;
return (false);
}
0 16-bit arity
2 zero
7 0x1A
-entry 8 MOV RAX,imm64 0x48 0xB8
- 10 <address>
- 18 JMP (RAX) 0xFF 0xE0
+entry 8 MOV RAX,imm64 48 b8 <addr64>
+ 18 JMP (RAX) ff e0
20-23 <four bytes of padding>
24 <next cache>
8 <entry count>
12 <type/arity info> \__ format word
14 <gc offset> /
-entry0 16 MOV RAX,imm64 0x48 0xB8
- 18 <address>
- 26 CALL (RAX) 0xFF 0xD0
- 28 <four bytes of padding or next format word>
+entry0 16 MOV RAX,imm64 48 b8 <imm64>
+ 26 CALL rel32=0 e8 00 00 00 00
+ 31 JMP (RAX) ff e0
+ 33 <padding> 00 00 00
+ 36 <type/arity info>
+ 38 <gc offset>
+entry1 40 ...
...
- 16*(n+1) <variables>
+ 16 + 24*n <variables>
- Trampoline encoding:
-8 <padding>
-4 <type/arity info>
-2 <gc offset>
-entry 0 MOV AL,code 0xB0, code-byte
- 2 CALL n(RSI) 0xFF 0x96 n-longword
- 8 <trampoline dependent storage>
+entry 0 MOV AL,code b0 <code8>
+ 2 CALL rel32=0 e8 00 00 00 00
+ 7 JMP n(RSI) ff a6 <n32>
+ 13 <padding> 00 00 00
+ 16 <trampoline dependent storage>
+
+ Distance from address on stack to trampoline storage: 16 - 7 = 9.
*/
\f
instructions are stored. This is an approximation: it matches only
those non-closure procedures for which LIAR has generated interrupt
checks, in which case there is one CALL n(RSI), which is encoded as
- #xff #x96 <n>, where n is a longword (32 bits). */
+ #xff #x96 <n>, where n is a longword (32 bits).
+
+ XXX Stop using CALL for this. */
#define CC_ENTRY_GC_TRAP_SIZE 6
\f
#define EMBEDDED_CLOSURE_ADDRS_P 1