From 1145cbc45354f5e2ca24f86ad525c92526419946 Mon Sep 17 00:00:00 2001 From: Taylor R Campbell Date: Thu, 5 Nov 2009 23:40:26 -0500 Subject: [PATCH] Simplify x86-64 utility interface, and nuke all x87 code from it. The calling convention for microcode utilities now more closely matches the standard calling convention for C on the AMD64, which reduces the amount of code needed in scheme_to_interface. The i386 hack to split the assembly hook addresses between negative addresses and positive addresses is no more; most of what little benefit it added on the i386, it ceased to add on x86-64, since no hooks fit in positive byte offsets from the register block start any longer, and only sixteen would fit in negative byte offsets. Perhaps later it would be worth deciding which sixteen deserve byte offsets (e.g., scheme-to-interface certainly would), but this is simpler for now. Note that there is still a space advantage to invoking a hook (jump to an offset from Registers) over invoking an interface (move its number to AL, jump to scheme-to-interface), since invoking an interface requires going through a hook anyway. These change serves mainly to simplify the code, not to improve performance, although shuffling the calling convention may improve performance as a bonus. The x87 code appears to have been causing problems, and will later be replaced by the AMD64's 128-bit media (SSEn) instructions anyway. The compiler still has some x87 vestiges left in it, but not in any code that is hit, and nearly all of it is commented out now. *** NOTE: Since this changes the calling convention of utilities from compiled code, it breaks all existing x86-64 .com files, which you must recompile with the new compiler. This means that you can't straightforwardly rebuild Scheme the usual way from a prior x86-64 installation. The easiest way to proceed is to start again from a LIARC snapshot and rebootstrap the x86-64 code. Expect more of this to come; in particular, since we now have a reasonable number of machine registers, it may be worthwhile to assign one to be the return value register. --- src/compiler/machines/x86-64/lapgen.scm | 46 +- src/compiler/machines/x86-64/machin.scm | 1 - src/compiler/machines/x86-64/rules3.scm | 64 ++- src/compiler/machines/x86-64/rules4.scm | 9 +- src/compiler/machines/x86-64/rulflo.scm | 3 + src/microcode/cmpauxmd/x86-64.m4 | 580 ++++-------------------- src/microcode/cmpintmd/x86-64.c | 106 ++--- src/microcode/cmpintmd/x86-64.h | 115 +---- 8 files changed, 171 insertions(+), 753 deletions(-) diff --git a/src/compiler/machines/x86-64/lapgen.scm b/src/compiler/machines/x86-64/lapgen.scm index 2d504ed61..a28795447 100644 --- a/src/compiler/machines/x86-64/lapgen.scm +++ b/src/compiler/machines/x86-64/lapgen.scm @@ -214,11 +214,11 @@ USA. ;; We don't have unsigned addressing modes. (byte-offset-reference register offset)) -;++ This computation is probably not quite right. +;;; This returns an offset in objects, not bytes. (define-integrable (pseudo-register-offset register) - (+ (+ (* 16 address-units-per-object) (* 80 address-units-per-object)) - (* 3 (register-renumber register)))) + (+ (+ 16 80) ;Sixteen fixed, eighty hooks. + (register-renumber register))) (define-integrable (pseudo->machine-register source target) (memory->machine-register (pseudo-register-home source) target)) @@ -669,18 +669,10 @@ USA. (offset-reference regnum:regs-pointer register-block/environment-offset)) -(define reg:dynamic-link - (offset-reference regnum:regs-pointer - register-block/dynamic-link-offset)) - (define reg:lexpr-primitive-arity (offset-reference regnum:regs-pointer register-block/lexpr-primitive-arity-offset)) -(define reg:utility-arg-4 - (offset-reference regnum:regs-pointer - register-block/utility-arg4-offset)) - (define reg:stack-guard (offset-reference regnum:regs-pointer register-block/stack-guard-offset)) @@ -726,34 +718,20 @@ USA. (LAP (MOV B (R ,rax) (& ,code)) ,@(invoke-hook/call entry:compiler-scheme-to-interface/call))) -;++ This uses a kludge to number entries by byte offsets from the -;++ registers block, but that works only in the 32-bit i386 version; -;++ for x86-64 version, all the entries' byte indices exceed the range -;++ of signed bytes. But this works for now. - (define-syntax define-entries (sc-macro-transformer (lambda (form environment) environment `(BEGIN - ,@(let loop - ((names (cdddr form)) - (index (cadr form)) - (high (caddr form))) + ,@(let loop ((names (cddr form)) (index (cadr form))) (if (pair? names) - (if (< index high) - (cons `(DEFINE-INTEGRABLE - ,(symbol-append 'ENTRY:COMPILER- - (car names)) - (BYTE-OFFSET-REFERENCE REGNUM:REGS-POINTER - ,index)) - (loop (cdr names) (+ index 8) high)) - (begin - (warn "define-entries: Too many for byte offsets.") - (loop names index (+ high 32000)))) + (cons `(DEFINE-INTEGRABLE + ,(symbol-append 'ENTRY:COMPILER- (car names)) + (BYTE-OFFSET-REFERENCE REGNUM:REGS-POINTER ,index)) + (loop (cdr names) (+ index 8))) '())))))) -(define-entries #x80 #x100 ; (* 16 8) +(define-entries #x80 ; (* 16 8) scheme-to-interface ; Main entry point (only one necessary) scheme-to-interface/call ; Used by rules3&4, for convenience. trampoline-to-interface ; Used by trampolines, for convenience. @@ -769,9 +747,6 @@ USA. link error primitive-error - short-primitive-apply) - -(define-entries #x-100 0 &+ &- &* @@ -796,8 +771,7 @@ USA. shortcircuit-apply-size-6 shortcircuit-apply-size-7 shortcircuit-apply-size-8 - interrupt-continuation-2 - conditionally-serialize) + interrupt-continuation-2) ;; Operation tables diff --git a/src/compiler/machines/x86-64/machin.scm b/src/compiler/machines/x86-64/machin.scm index f4a30ce61..c28e8c63f 100644 --- a/src/compiler/machines/x86-64/machin.scm +++ b/src/compiler/machines/x86-64/machin.scm @@ -248,7 +248,6 @@ USA. (define-integrable register-block/environment-offset 3) (define-integrable register-block/dynamic-link-offset 4) ; compiler temp (define-integrable register-block/lexpr-primitive-arity-offset 7) -(define-integrable register-block/utility-arg4-offset 9) ; closure free (define-integrable register-block/stack-guard-offset 11) (define-integrable (fits-in-signed-byte? value) diff --git a/src/compiler/machines/x86-64/rules3.scm b/src/compiler/machines/x86-64/rules3.scm index 1295ea959..608043ecb 100644 --- a/src/compiler/machines/x86-64/rules3.scm +++ b/src/compiler/machines/x86-64/rules3.scm @@ -64,7 +64,7 @@ USA. continuation (expect-no-exit-interrupt-checks) (LAP ,@(clear-map!) - (POP Q (R ,rcx)) + (POP Q (R ,rbx)) #| (MOV Q (R ,rdx) (&U ,frame-size)) ,@(invoke-interface code:compiler-apply) @@ -104,7 +104,7 @@ USA. continuation (expect-no-exit-interrupt-checks) (LAP ,@(clear-map!) - (LEA Q (R ,rcx) (@PCR ,label)) + (LEA Q (R ,rbx) (@PCR ,label)) (MOV Q (R ,rdx) (&U ,number-pushed)) ,@(invoke-interface code:compiler-lexpr-apply))) @@ -114,8 +114,8 @@ USA. ;; It expects the procedure at the top of the stack (expect-no-exit-interrupt-checks) (LAP ,@(clear-map!) - (POP Q (R ,rcx)) - (AND Q (R ,rcx) (R ,regnum:datum-mask)) ; clear type code + (POP Q (R ,rbx)) + (AND Q (R ,rbx) (R ,regnum:datum-mask)) ; clear type code (MOV Q (R ,rdx) (&U ,number-pushed)) ,@(invoke-interface code:compiler-lexpr-apply))) @@ -139,7 +139,7 @@ USA. continuation (expect-no-exit-interrupt-checks) (let* ((set-extension - (interpreter-call-argument->machine-register! extension rcx)) + (interpreter-call-argument->machine-register! extension rbx)) (set-address (begin (require-register! rdx) (load-pc-relative-address (INST-EA (R ,rdx)) @@ -148,7 +148,7 @@ USA. (LAP ,@set-extension ,@set-address ,@(clear-map!) - (MOV Q (R ,rbx) (&U ,frame-size)) + (MOV Q (R ,rcx) (&U ,frame-size)) ,@(invoke-interface code:compiler-cache-reference-apply)))) (define-rule statement @@ -157,13 +157,13 @@ USA. continuation (expect-no-entry-interrupt-checks) (let* ((set-environment - (interpreter-call-argument->machine-register! environment rcx)) + (interpreter-call-argument->machine-register! environment rbx)) (set-name (object->machine-register! name rdx))) (delete-dead-registers!) (LAP ,@set-environment ,@set-name ,@(clear-map!) - (MOV Q (R ,rbx) (&U ,frame-size)) + (MOV Q (R ,rcx) (&U ,frame-size)) ,@(invoke-interface code:compiler-lookup-apply)))) (define-rule statement @@ -171,9 +171,9 @@ USA. continuation ; ignored (if (eq? primitive compiled-error-procedure) (LAP ,@(clear-map!) - (MOV Q (R ,rcx) (&U ,frame-size)) + (MOV Q (R ,rbx) (&U ,frame-size)) ,@(invoke-hook entry:compiler-error)) - (LAP ,@(object->machine-register! primitive rcx) + (LAP ,@(object->machine-register! primitive rbx) ,@(clear-map!) ,@(let ((arity (primitive-procedure-arity primitive))) (cond ((not (negative? arity)) @@ -621,11 +621,11 @@ USA. ;;; This is invoked by the top level of the LAP generator. (define (generate/quotation-header environment-label free-ref-label n-sections) - (LAP (MOV Q (R ,rcx) ,reg:environment) - (MOV Q (@PCR ,environment-label) (R ,rcx)) + (LAP (MOV Q (R ,rax) ,reg:environment) + (MOV Q (@PCR ,environment-label) (R ,rax)) (LEA Q (R ,rdx) (@PCR ,*block-label*)) - (LEA Q (R ,rbx) (@PCR ,free-ref-label)) - (MOV Q ,reg:utility-arg-4 (&U ,n-sections)) + (LEA Q (R ,rcx) (@PCR ,free-ref-label)) + (MOV Q (R ,r8) (&U ,n-sections)) #| ,@(invoke-interface/call code:compiler-link) |# @@ -639,10 +639,10 @@ USA. n-sections) (LAP (MOV Q (R ,rdx) (@PCR ,code-block-label)) (AND Q (R ,rdx) (R ,regnum:datum-mask)) - (LEA Q (R ,rbx) (@RO ,rdx ,free-ref-offset)) - (MOV Q (R ,rcx) ,reg:environment) - (MOV Q (@RO ,rdx ,environment-offset) (R ,rcx)) - (MOV Q ,reg:utility-arg-4 (&U ,n-sections)) + (LEA Q (R ,rcx) (@RO ,rdx ,free-ref-offset)) + (MOV Q (R ,rax) ,reg:environment) + (MOV Q (@RO ,rdx ,environment-offset) (R ,rax)) + (MOV Q (R ,r8) (&U ,n-sections)) #| ,@(invoke-interface/call code:compiler-link) |# @@ -661,41 +661,39 @@ USA. (PUSH Q (& 0)) (LABEL ,loop) ;; Get index - (MOV Q (R ,rcx) (@R ,rsp)) + (MOV Q (R ,rax) (@R ,rsp)) ;; Get vector (MOV Q (R ,rdx) (@PCR ,vector-label)) ;; Get n-sections for this cc-block - (XOR Q (R ,rbx) (R ,rbx)) - (LEA Q (R ,rax) (@PCR ,bytes)) - (MOV B (R ,rbx) (@RI ,rax ,rcx 1)) + (XOR Q (R ,r8) (R ,r8)) + (LEA Q (R ,rcx) (@PCR ,bytes)) + (MOV B (R ,r8) (@RI ,rcx ,rax 1)) ;; address of vector (AND Q (R ,rdx) (R ,regnum:datum-mask)) - ;; Store n-sections in arg - (MOV Q ,reg:utility-arg-4 (R ,rbx)) ;; vector-ref -> cc block (MOV Q (R ,rdx) (@ROI ,rdx ,address-units-per-object - ,rcx ,address-units-per-object)) + ,rax ,address-units-per-object)) ;; address of cc-block (AND Q (R ,rdx) (R ,regnum:datum-mask)) ;; cc-block length - (MOV Q (R ,rbx) (@R ,rdx)) + (MOV Q (R ,rcx) (@R ,rdx)) ;; Get environment - (MOV Q (R ,rcx) ,reg:environment) + (MOV Q (R ,rax) ,reg:environment) ;; Eliminate length tags - (AND Q (R ,rbx) (R ,regnum:datum-mask)) + (AND Q (R ,rcx) (R ,regnum:datum-mask)) ;; Store environment - (MOV Q (@RI ,rdx ,rbx ,address-units-per-object) (R ,rcx)) + (MOV Q (@RI ,rdx ,rcx ,address-units-per-object) (R ,rax)) ;; Get NMV header - (MOV Q (R ,rcx) (@RO ,rdx ,address-units-per-object)) + (MOV Q (R ,rax) (@RO ,rdx ,address-units-per-object)) ;; Eliminate NMV tag - (AND Q (R ,rcx) (R ,regnum:datum-mask)) + (AND Q (R ,rax) (R ,regnum:datum-mask)) ;; Address of first free reference (LEA Q - (R ,rbx) + (R ,rcx) (@ROI ,rdx ,(* 2 address-units-per-object) - ,rcx ,address-units-per-object)) + ,rax ,address-units-per-object)) ;; Invoke linker ,@(invoke-hook/call entry:compiler-link) ,@(make-external-label (continuation-code-word false) diff --git a/src/compiler/machines/x86-64/rules4.scm b/src/compiler/machines/x86-64/rules4.scm index 785b55abc..1f7c06518 100644 --- a/src/compiler/machines/x86-64/rules4.scm +++ b/src/compiler/machines/x86-64/rules4.scm @@ -55,7 +55,7 @@ USA. cont ; ignored (let* ((set-extension (interpreter-call-argument->machine-register! extension rdx)) - (set-value (interpreter-call-argument->machine-register! value rbx))) + (set-value (interpreter-call-argument->machine-register! value rcx))) (LAP ,@set-extension ,@set-value ,@(clear-map!) @@ -110,7 +110,7 @@ USA. (interpreter-call-argument->machine-register! environment rdx))) (LAP ,@set-environment ,@(clear-map (clear-map!)) - ,@(load-constant (INST-EA (R ,rbx)) name) + ,@(load-constant (INST-EA (R ,rcx)) name) ,@(invoke-interface/call code)))) (define-rule statement @@ -130,10 +130,9 @@ USA. (define (assignment-call code environment name value) (let* ((set-environment (interpreter-call-argument->machine-register! environment rdx)) - (set-value (interpreter-call-argument->machine-register! value rax))) + (set-value (interpreter-call-argument->machine-register! value r8))) (LAP ,@set-environment ,@set-value ,@(clear-map!) - (MOV Q ,reg:utility-arg-4 (R ,rax)) - ,@(load-constant (INST-EA (R ,rbx)) name) + ,@(load-constant (INST-EA (R ,rcx)) name) ,@(invoke-interface/call code)))) \ No newline at end of file diff --git a/src/compiler/machines/x86-64/rulflo.scm b/src/compiler/machines/x86-64/rulflo.scm index 201a761e0..885624014 100644 --- a/src/compiler/machines/x86-64/rulflo.scm +++ b/src/compiler/machines/x86-64/rulflo.scm @@ -27,6 +27,8 @@ USA. ;;; package: (compiler lap-syntaxer) (declare (usual-integrations)) + +#| ;; **** ;; Missing: 2 argument operations and predicates with non-trivial @@ -826,3 +828,4 @@ USA. (if (flo:32-bit-representation-exact? fp-value) (generate-ea (single-flonum->label fp-value) 'S) (generate-ea (double-flonum->label fp-value) 'D))) +|# diff --git a/src/microcode/cmpauxmd/x86-64.m4 b/src/microcode/cmpauxmd/x86-64.m4 index 58d7faf23..d134362f5 100644 --- a/src/microcode/cmpauxmd/x86-64.m4 +++ b/src/microcode/cmpauxmd/x86-64.m4 @@ -71,7 +71,7 @@ ### them, they contain garbage. ### ### Compiled Scheme code uses the following register convention: -### - %rsp containts the Scheme stack pointer, not the C stack +### - %rsp contains the Scheme stack pointer, not the C stack ### pointer. ### - %rsi contains a pointer to the Scheme interpreter's "register" ### block. This block contains the compiler's copy of MemTop, @@ -112,8 +112,6 @@ ### TYPE_CODE_LENGTH ### Normally defined to be 6. Don't change this unless you know ### what you're doing. -### DISABLE_387 -### If defined, do not generate 387 floating-point instructions. ### VALGRIND_MODE ### If defined, modify code to make it work with valgrind. @@ -123,14 +121,6 @@ ifdef(`WIN32', `define(IF_WIN32,`$1')', `define(IF_WIN32,`')') -ifdef(`DISABLE_387', - `define(IF387,`')', - `define(IF387,`$1')') - -ifdef(`DISABLE_387', - `define(IFN387,`$1')', - `define(IFN387,`')') - IF_WIN32(`define(DASM,1)') ifdef(`WCC386R',`define(WCC386,1)') @@ -308,23 +298,7 @@ define(IMM_FALSE, `IMM(HEX(0000000000000000))') define(REGBLOCK_VAL,16) define(REGBLOCK_COMPILER_TEMP,32) -define(REGBLOCK_LEXPR_ACTUALS,56) -define(REGBLOCK_PRIMITIVE,64) -define(REGBLOCK_CLOSURE_FREE,72) - define(REGBLOCK_DLINK,REGBLOCK_COMPILER_TEMP) -define(REGBLOCK_UTILITY_ARG4,REGBLOCK_CLOSURE_FREE) - -define(COMPILER_REGBLOCK_N_FIXED,16) -define(COMPILER_REGBLOCK_N_HOOKS,80) -define(COMPILER_REGBLOCK_N_TEMPS,256) -define(COMPILER_FIXED_SIZE,1) -define(COMPILER_HOOK_SIZE,1) -define(COMPILER_TEMP_SIZE,2) -define(REGBLOCK_SIZE_IN_OBJECTS, - eval((COMPILER_REGBLOCK_N_FIXED*COMPILER_FIXED_SIZE) - +(COMPILER_REGBLOCK_N_HOOKS*COMPILER_HOOK_SIZE) - +(COMPILER_REGBLOCK_N_TEMPS*COMPILER_TEMP_SIZE))) # Define the floating-point processor control word. Always set # round-to-even and double precision. Under Win32, mask all @@ -351,187 +325,24 @@ use_external_data(EVR(utility_table)) ifdef(`WIN32',` use_external_data(EVR(RegistersPtr)) ',` -define_data(Regstart) -allocate_space(Regstart,256) - -define_data(Registers) -allocate_space(Registers,eval(REGBLOCK_SIZE_IN_OBJECTS*8)) +use_external_data(EVR(Registers)) ') -define_data(i387_presence) -allocate_quadword(i387_presence) - define_data(C_Stack_Pointer) allocate_quadword(C_Stack_Pointer) define_data(C_Frame_Pointer) allocate_quadword(C_Frame_Pointer) - -# [TRC 20091025: CPUID is always supported.] -# define_data(x86_64_cpuid_supported) -# allocate_quadword(x86_64_cpuid_supported) - -# [TRC 20091025: The cache synchronization bug does not occur in any -# x86-64 machines of which I am aware.] -# define_data(x86_64_cpuid_needed) -# allocate_quadword(x86_64_cpuid_needed) DECLARE_CODE_SEGMENT() declare_alignment(2) -# [TRC 20091025: We need to check for MMX/SSEn instructions too.] - -define_c_label(x86_64_interface_initialize) - OP(push,q) REG(rbp) - OP(mov,q) TW(REG(rsp),REG(rbp)) - OP(xor,q) TW(REG(rax),REG(rax)) # No 387 available - -# [TRC 20091025: The AMD64 reference manual suggests using the CPUID -# instruction to detect instruction subsets instead.] - -# Unfortunately, the `movl cr0,ecx' instruction is privileged. -# Use the deprecated `smsw cx' instruction instead. - -IF387(` -# OP(mov,q) TW(REG(cr0),REG(rcx)) # Test for 387 presence -ifdef(`VALGRIND_MODE',`',` - smsw REG(cx) - OP(mov,q) TW(IMM(HEX(12)),REG(rdx)) - OP(and,q) TW(REG(rdx),REG(rcx)) - OP(cmp,q) TW(REG(rdx),REG(rcx)) - jne x86_64_initialize_no_fp -') - OP(inc,q) REG(rax) # 387 available - OP(sub,q) TW(IMM(8),REG(rsp)) - fclex - fnstcw WOF(-2,REG(rbp)) - OP(and,w) TW(IMM(HEX(f0e0)),WOF(-2,REG(rbp))) - OP(or,w) TW(IMM(FP_CONTROL_WORD),WOF(-2,REG(rbp))) - fldcw WOF(-2,REG(rbp)) -x86_64_initialize_no_fp: -') - OP(mov,q) TW(REG(rax),ABS(EVR(i387_presence))) - -# [TRC 20091025: CPUID is always supported.] - -# Do a bunch of hair to determine if we need to do cache synchronization. -# See if the CPUID instruction is supported. - -# OP(xor,q) TW(REG(rax),REG(rax)) -# OP(mov,q) TW(REG(rax),ABS(EVR(x86_64_cpuid_supported))) -# OP(mov,q) TW(REG(rax),ABS(EVR(x86_64_cpuid_needed))) - -# First test: can we toggle the AC bit? - -# pushfd -# OP(pop,l) REG(eax) -# OP(mov,l) TW(REG(eax),REG(ecx)) -# OP(xor,l) TW(IMM(HEX(00040000)),REG(eax)) -# OP(push,l) REG(eax) -# popfd -# pushfd -# OP(pop,l) REG(eax) - -# if AC bit can't be toggled, this is a 386 (and doesn't support CPUID). - -# OP(xor,l) TW(REG(ecx),REG(eax)) -# jz no_cpuid_instr -# OP(push,l) REG(ecx) # restore EFLAGS -# popfd - -# Now test to see if the ID bit can be toggled. - -# OP(mov,l) TW(REG(ecx),REG(eax)) -# OP(xor,l) TW(IMM(HEX(00200000)),REG(eax)) -# OP(push,l) REG(eax) -# popfd -# pushfd -# OP(pop,l) REG(eax) - -# if ID bit can't be toggled, this is a 486 that doesn't support CPUID. - -# OP(xor,l) TW(REG(ecx),REG(eax)) -# jz no_cpuid_instr -# OP(push,l) REG(ecx) # restore EFLAGS -# popfd - -# Now we know that cpuid is supported. - -# OP(mov,q) TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_supported))) - -# Next, use the CPUID instruction to determine the processor type. - -# OP(push,l) REG(ebx) -# OP(xor,l) TW(REG(eax),REG(eax)) -# cpuid - -# Check that CPUID accepts argument 1. - -# OP(cmp,l) TW(IMM(HEX(00000001)),REG(eax)) -# jl done_setting_up_cpuid - -# Detect "GenuineIntel". - -# OP(cmp,l) TW(IMM(HEX(756e6547)),REG(ebx)) -# jne not_intel_cpu -# OP(cmp,l) TW(IMM(HEX(49656e69)),REG(edx)) -# jne not_intel_cpu -# OP(cmp,l) TW(IMM(HEX(6c65746e)),REG(ecx)) -# jne not_intel_cpu - -# For CPU families 4 (486), 5 (Pentium), or 6 (Pentium Pro, Pentium -# II, Pentium III), don't use CPUID synchronization. - -# OP(mov,l) TW(IMM(HEX(01)),REG(eax)) -# cpuid -# OP(shr,l) TW(IMM(HEX(08)),REG(eax)) -# OP(and,l) TW(IMM(HEX(0000000F)),REG(eax)) -# OP(cmp,l) TW(IMM(HEX(4)),REG(eax)) -# jl done_setting_up_cpuid -# OP(cmp,l) TW(IMM(HEX(6)),REG(eax)) -# jg done_setting_up_cpuid +# C_to_interface passes control from C into Scheme. To C it is a +# unary procedure; its one argument is passed in rdi. It saves the +# state of the C world (the C frame pointer and stack pointer) and +# then passes control to interface_to_scheme to set up the state of +# the Scheme world. # -# jmp cpuid_not_needed -# -#not_intel_cpu: - -# Detect "AuthenticAMD". - -# OP(cmp,l) TW(IMM(HEX(68747541)),REG(ebx)) -# jne not_amd_cpu -# OP(cmp,l) TW(IMM(HEX(69746e65)),REG(edx)) -# jne not_amd_cpu -# OP(cmp,l) TW(IMM(HEX(444d4163)),REG(ecx)) -# jne not_amd_cpu - -# Problem appears to exist only on Athlon models 1, 3, and 4. - -# OP(mov,l) TW(IMM(HEX(01)),REG(eax)) -# cpuid - -# OP(mov,l) TW(REG(eax),REG(ecx)) -# OP(shr,l) TW(IMM(HEX(08)),REG(eax)) -# OP(and,l) TW(IMM(HEX(0000000F)),REG(eax)) -# OP(cmp,l) TW(IMM(HEX(6)),REG(eax)) # family 6 = Athlon -# jne done_setting_up_cpuid - -# OP(mov,l) TW(REG(ecx),REG(eax)) -# OP(shr,l) TW(IMM(HEX(04)),REG(eax)) -# OP(and,l) TW(IMM(HEX(0000000F)),REG(eax)) -# OP(cmp,l) TW(IMM(HEX(6)),REG(eax)) # model 6 and up OK -# jge done_setting_up_cpuid -# OP(cmp,l) TW(IMM(HEX(2)),REG(eax)) # model 2 OK -# je done_setting_up_cpuid - -# OP(mov,l) TW(IMM(HEX(00000001)),ABS(EVR(x86_64_cpuid_needed))) - -#not_amd_cpu: -#done_setting_up_cpuid: -# OP(pop,l) REG(ebx) -#no_cpuid_instr: - leave - ret - # Note: The AMD64 ABI mandates that on entry to a function, RSP - 8 # must be a multiple of 0x10; that is, the stack must be 128-bit # aligned. We push six quadwords onto the stack, but there is already @@ -557,68 +368,46 @@ define_c_label(C_to_interface) define_hook_label(trampoline_to_interface) define_debugging_label(trampoline_to_interface) - OP(pop,q) REG(rcx) # trampoline storage + OP(pop,q) REG(rbx) # trampoline storage jmp scheme_to_interface define_hook_label(scheme_to_interface_call) define_debugging_label(scheme_to_interface_call) - OP(pop,q) REG(rcx) # arg1 = ret. add - OP(add,q) TW(IMM(4),REG(rcx)) # Skip format info + OP(pop,q) REG(rbx) # arg1 = ret. add + OP(add,q) TW(IMM(4),REG(rbx)) # Skip format info # jmp scheme_to_interface +# scheme_to_interface passes control from compiled Scheme code to a +# microcode utility. The arguments for the utility go respectively in +# rbx, rdx, rcx, and r8. This loosely matches the AMD64 calling +# convention, where arguments go respectively in rdi, rsi, rdx, rcx, +# and r8. The differences are that scheme_to_interface uses rdi as an +# implicit first argument to the utility, and rsi is used in compiled +# code for the registers block, since the compiler can't allocate it +# as a general-purpose register because it doesn't admit byte-size +# operations. Moreover, Scheme uses rdi as the free pointer register, +# which we have to save here in a location unknown to Scheme (the C +# `Free' variable), so it can't be set by compiled code. + define_hook_label(scheme_to_interface) define_debugging_label(scheme_to_interface) - -# These two moves must happen _before_ the ffree instructions below. -# Otherwise recovery from SIGFPE there will fail. OP(mov,q) TW(REG(rsp),ABS(EVR(stack_pointer))) OP(mov,q) TW(rfree,ABS(EVR(Free))) - -# [TRC 20091025: I think this should be excised.] - -IF387(` - OP(cmp,q) TW(IMM(0),ABS(EVR(i387_presence))) - je scheme_to_interface_proceed - ffree ST(0) # Free floating "regs" - ffree ST(1) - ffree ST(2) - ffree ST(3) - ffree ST(4) - ffree ST(5) - ffree ST(6) - ffree ST(7) -scheme_to_interface_proceed: -') - OP(mov,q) TW(ABS(EVR(C_Stack_Pointer)),REG(rsp)) OP(mov,q) TW(ABS(EVR(C_Frame_Pointer)),REG(rbp)) OP(sub,q) TW(IMM(16),REG(rsp)) # alloc struct return - - # Shuffle Scheme -> AMD64 calling conventions: - # struct pointer -> rdi - # rcx -> rsi - # rdx -> rdx - # rbx -> rcx - # arg4 -> r8 - # Parallel assignment problems: - # arg4 depends on rsi: do arg4->r8 first - # target depends on rcx (why?): use r11 as a temporary - # [TRC 20091025: Perhaps we can rearrange LIAR to generate - # arguments in the registers we want, to avoid this - # shuffling.] - - OP(mov,q) TW(REG(rcx),REG(r11)) - - OP(xor,q) TW(REG(rcx),REG(rcx)) - OP(mov,b) TW(REG(al),REG(cl)) - OP(lea,q) TW(ABS(EVR(utility_table)),REG(rax)) - OP(mov,q) TW(SDX(,REG(rax),REG(rcx),8),REG(rax)) - - OP(mov,q) TW(REG(rsp),REG(rdi)) - OP(mov,q) TW(DOF(REGBLOCK_UTILITY_ARG4(),regs),REG(r8)) - OP(mov,q) TW(REG(r11),REG(rsi)) - OP(mov,q) TW(REG(rbx),REG(rcx)) + OP(mov,q) TW(REG(rsp),REG(rdi)) # Structure is first argument. + OP(mov,q) TW(REG(rbx),REG(rsi)) # rbx -> second argument. + + # Find the utility. rbx is now free as a temporary register + # to hold the utility table. rax initially stores the utility + # number in its low eight bits and possibly garbage in the + # rest; mask it off and then use it as an index into the + # utility table, scaled by 8 (bytes per pointer). + OP(lea,q) TW(ABS(EVR(utility_table)),REG(rbx)) + OP(and,q) TW(IMM(HEX(ff)),REG(rax)) + OP(mov,q) TW(SDX(,REG(rbx),REG(rax),8),REG(rax)) call IJMP(REG(rax)) @@ -628,29 +417,12 @@ define_debugging_label(scheme_to_interface_return) jmp IJMP(REG(rax)) # Invoke handler define_c_label(interface_to_scheme) -IF387(` - OP(cmp,q) TW(IMM(0),ABS(EVR(i387_presence))) - je interface_to_scheme_proceed - ffree ST(0) # Free floating "regs" - ffree ST(1) - ffree ST(2) - ffree ST(3) - ffree ST(4) - ffree ST(5) - ffree ST(6) - ffree ST(7) -interface_to_scheme_proceed: -') - # Register block = %rsi - # Scheme offset in NT -ifdef(`WIN32', +ifdef(`WIN32', # Register block = %rsi ` OP(mov,q) TW(ABS(EVR(RegistersPtr)),regs)', ` OP(lea,q) TW(ABS(EVR(Registers)),regs)') - OP(mov,q) TW(ABS(EVR(Free)),rfree) # Free pointer = %rdi OP(mov,q) TW(DOF(REGBLOCK_VAL(),regs),REG(rax)) # Value/dynamic link OP(mov,q) TW(IMM(ADDRESS_MASK),rmask) # = %rbp - OP(mov,q) TW(ABS(EVR(stack_pointer)),REG(rsp)) OP(mov,q) TW(REG(rax),REG(rcx)) # Preserve if used OP(and,q) TW(rmask,REG(rcx)) # Restore potential dynamic link @@ -665,19 +437,6 @@ define_code_label(EFR(callWinntExceptionTransferHook)) ') define_c_label(interface_to_C) -IF387(` - OP(cmp,q) TW(IMM(0),ABS(EVR(i387_presence))) - je interface_to_C_proceed - ffree ST(0) # Free floating "regs" - ffree ST(1) - ffree ST(2) - ffree ST(3) - ffree ST(4) - ffree ST(5) - ffree ST(6) - ffree ST(7) -interface_to_C_proceed:') - OP(mov,q) TW(REG(rdx),REG(rax)) # Set up result # We need a dummy register for the POP (which is three bytes # shorter than ADD $8,RSP); since we're about to pop into r15 @@ -691,33 +450,6 @@ interface_to_C_proceed:') leave ret -# [TRC 20091025: The cache synchronization bug does not occur in any -# x86-64 machines of which I am aware.] - -#define_code_label(EFR(x86_64_cache_synchronize)) -# OP(push,q) REG(rbp) -# OP(mov,q) TW(REG(rsp),REG(rbp)) -# OP(push,q) REG(rbx) -# OP(xor,q) TW(REG(rax),REG(rax)) -# cpuid -# OP(pop,q) REG(rbx) -# leave -# ret - -### Run the CPUID instruction for serialization. - -#define_hook_label(serialize_cache) -# pushad -# OP(xor,q) TW(REG(rax),REG(rax)) -# cpuid -# popad -# ret - -### Stub to be used in place of above on machines that don't need it. - -#define_hook_label(dont_serialize_cache) -# ret - ### Assembly language hooks used to reduce code size. ### There is no time advantage to using these over using ### scheme_to_interface (or scheme_to_interface_call), but the @@ -743,27 +475,6 @@ define_hook_label(interrupt_dlink) OP(mov,b) TW(IMM(HEX(19)),REG(al)) jmp scheme_to_interface_call -### -### This saves even more instructions than primitive_apply -### When the PC is not available. Instead of jumping here, -### a call instruction is used, and the longword offset to -### the primitive object follows the call instruction. -### This code loads the primitive object and merges with -### apply_primitive -### -### [TRC 20091025: But on the x86-64, we have RIP-relative -### addressing, so we don't need this.] -### - -#declare_alignment(2) -#define_hook_label(short_primitive_apply) -# OP(pop,l) REG(edx) # offset pointer -# OP(mov,l) TW(IND(REG(edx)),REG(ecx)) # offset -# # Primitive object -# OP(mov,l) TW(IDX(REG(edx),REG(ecx)),REG(ecx)) -# # Merge -# jmp hook_reference(primitive_apply) - declare_alignment(2) define_jump_indirection(primitive_apply,12) @@ -787,17 +498,16 @@ define_call_indirection(primitive_error,36) declare_alignment(2) define_hook_label(sc_apply) - OP(mov,q) TW(REG(rcx),REG(rax)) # Copy for type code - OP(mov,q) TW(REG(rcx),REG(rbx)) # Copy for address + OP(mov,q) TW(REG(rbx),REG(rax)) # Copy for type code + OP(mov,q) TW(REG(rbx),REG(rcx)) # Copy for address OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) # Select type code - OP(and,q) TW(rmask,REG(rbx)) # Select datum + OP(and,q) TW(rmask,REG(rcx)) # Select datum OP(cmp,b) TW(IMM(TC_COMPILED_ENTRY),REG(al)) jne asm_sc_apply_generic - # [TRC 20091025: How big are the frame sizes?] - OP(movs,bq,x) TW(BOF(-4,REG(rbx)),REG(rax)) # Extract frame size + OP(movs,bq,x) TW(BOF(-4,REG(rcx)),REG(rax)) # Extract frame size OP(cmp,q) TW(REG(rax),REG(rdx)) # Compare to nargs+1 jne asm_sc_apply_generic - jmp IJMP(REG(rbx)) # Invoke + jmp IJMP(REG(rcx)) # Invoke define_debugging_label(asm_sc_apply_generic) OP(mov,q) TW(IMM(HEX(14)),REG(rax)) @@ -806,16 +516,15 @@ define_debugging_label(asm_sc_apply_generic) define(define_apply_fixed_size, `declare_alignment(2) define_hook_label(sc_apply_size_$1) - OP(mov,q) TW(REG(rcx),REG(rax)) # Copy for type code - OP(mov,q) TW(REG(rcx),REG(rbx)) # Copy for address + OP(mov,q) TW(REG(rbx),REG(rax)) # Copy for type code + OP(mov,q) TW(REG(rbx),REG(rcx)) # Copy for address OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) # Select type code - OP(and,q) TW(rmask,REG(rbx)) # Select datum + OP(and,q) TW(rmask,REG(rcx)) # Select datum OP(cmp,b) TW(IMM(TC_COMPILED_ENTRY),REG(al)) jne asm_sc_apply_generic_$1 - # [TRC 20091025: How big are the frame sizes?] - OP(cmp,b) TW(IMM($1),BOF(-4,REG(rbx))) # Compare frame size - jne asm_sc_apply_generic_$1 # to nargs+1 - jmp IJMP(REG(rbx)) + OP(cmp,b) TW(IMM($1),BOF(-4,REG(rcx))) # Compare frame size + jne asm_sc_apply_generic_$1 # to nargs+1 + jmp IJMP(REG(rcx)) asm_sc_apply_generic_$1: OP(mov,q) TW(IMM($1),REG(rdx)) @@ -837,25 +546,6 @@ define_apply_fixed_size(8) ### numeric types are much faster than the rare ones ### (bignums, ratnums, recnums) -IF387(`declare_alignment(2) -asm_generic_flonum_result: - # The MOV instruction can take a 64-bit immediate operand only - # if the target is a register, so we store the manifest in rax - # before moving it to memory. - OP(mov,q) TW(IMM_MANIFEST_NM_VECTOR_1,REG(rax)) - OP(mov,q) TW(REG(rax), IND(rfree)) - # The OR instruction cannot take a 64-bit immediate either, so - # we need to store the tag in rax first, shift it up, and then - # OR the datum into it. - OP(mov,q) TW(IMM(TC_FLONUM),REG(rax)) - OP(shl,q) TW(IMM(DATUM_LENGTH),REG(rax)) - OP(or,q) TW(rfree,REG(rax)) - OP(fstp,l) DOF(8,rfree) # fstpd - OP(and,q) TW(rmask,IND(REG(rsp))) - OP(add,q) TW(IMM(16),rfree) - OP(mov,q) TW(REG(rax),DOF(REGBLOCK_VAL(),regs)) - ret - declare_alignment(2) asm_generic_fixnum_result: OP(and,q) TW(rmask,IND(REG(rsp))) @@ -876,7 +566,7 @@ asm_generic_return_sharp_f: OP(and,q) TW(rmask,IND(REG(rsp))) OP(mov,q) TW(IMM_FALSE,REG(rax)) OP(mov,q) TW(REG(rax),LOF(REGBLOCK_VAL(),regs)) - ret') + ret define(define_unary_operation, `declare_alignment(2) @@ -885,13 +575,7 @@ define_hook_label(generic_$1) OP(mov,q) TW(REG(rdx),REG(rax)) OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) OP(cmp,b) TW(IMM(TC_FIXNUM),REG(al)) - je asm_generic_$1_fix - OP(cmp,b) TW(IMM(TC_FLONUM),REG(al)) jne asm_generic_$1_fail - OP(and,q) TW(rmask,REG(rdx)) - fld1 - OP($4,l) DOF(8,REG(rdx)) - jmp asm_generic_flonum_result asm_generic_$1_fix: OP(mov,q) TW(REG(rdx),REG(rax)) @@ -911,17 +595,7 @@ define_hook_label(generic_$1) OP(mov,q) TW(REG(rdx),REG(rax)) OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) OP(cmp,b) TW(IMM(TC_FIXNUM),REG(al)) - je asm_generic_$1_fix - OP(cmp,b) TW(IMM(TC_FLONUM),REG(al)) jne asm_generic_$1_fail - OP(and,q) TW(rmask,REG(rdx)) - OP(fld,l) DOF(8,REG(rdx)) - ftst - fstsw REG(ax) - fstp ST(0) - sahf - $4 asm_generic_return_sharp_t - jmp asm_generic_return_sharp_f asm_generic_$1_fix: OP(mov,q) TW(REG(rdx),REG(rax)) @@ -936,6 +610,10 @@ asm_generic_$1_fail: jmp scheme_to_interface') define(define_binary_operation, +`define_binary_operation_with_fixup($1,$2,$3, + `OP(shl,q) TW(IMM(TC_LENGTH),REG(rax))')') + +define(define_binary_operation_with_fixup, `declare_alignment(2) define_hook_label(generic_$1) OP(pop,q) REG(rdx) @@ -947,93 +625,22 @@ define_hook_label(generic_$1) OP(cmp,b) TW(REG(al),REG(cl)) jne asm_generic_$1_fail OP(cmp,b) TW(IMM(TC_FIXNUM),REG(al)) - je asm_generic_$1_fix - OP(cmp,b) TW(IMM(TC_FLONUM),REG(al)) - je asm_generic_$1_flo - -asm_generic_$1_fail: - OP(push,q) REG(rbx) - OP(push,q) REG(rdx) - OP(mov,b) TW(IMM(HEX($2)),REG(al)) - jmp scheme_to_interface + jne asm_generic_$1_fail asm_generic_$1_fix: OP(mov,q) TW(REG(rdx),REG(rax)) OP(mov,q) TW(REG(rbx),REG(rcx)) - OP(shl,q) TW(IMM(TC_LENGTH),REG(rax)) + $4 # Set up rax. OP(shl,q) TW(IMM(TC_LENGTH),REG(rcx)) - $5 OP($3,q) TW(REG(rcx),REG(rax)) # subq - jo asm_generic_$1_fail - jmp asm_generic_fixnum_result - -asm_generic_$1_flo: - OP(and,q) TW(rmask,REG(rdx)) - OP(and,q) TW(rmask,REG(rbx)) - OP(fld,l) DOF(8,REG(rdx)) # fldd - OP($4,l) DOF(8,REG(rbx)) # fsubl - jmp asm_generic_flonum_result') - -IF387(`declare_alignment(2) -define_hook_label(generic_divide) - OP(pop,q) REG(rdx) - OP(pop,q) REG(rbx) - OP(mov,q) TW(REG(rdx),REG(rax)) - OP(mov,q) TW(REG(rbx),REG(rcx)) - OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rax)) - OP(shr,q) TW(IMM(DATUM_LENGTH),REG(rcx)) - OP(cmp,b) TW(IMM(TC_FIXNUM),REG(al)) - je asm_generic_divide_fix - OP(cmp,b) TW(IMM(TC_FLONUM),REG(al)) - jne asm_generic_divide_fail - OP(cmp,b) TW(IMM(TC_FLONUM),REG(cl)) - je asm_generic_divide_flo_flo - OP(cmp,b) TW(IMM(TC_FIXNUM),REG(cl)) - jne asm_generic_divide_fail - OP(mov,q) TW(REG(rbx),REG(rcx)) - OP(shl,q) TW(IMM(TC_LENGTH),REG(rcx)) - je asm_generic_divide_fail - OP(and,q) TW(rmask,REG(rdx)) - OP(sar,q) TW(IMM(TC_LENGTH),REG(rcx)) - OP(fld,l) DOF(8,REG(rdx)) # fldd - OP(mov,q) TW(REG(rcx),IND(rfree)) - OP(fidiv,l) IND(rfree) - jmp asm_generic_flonum_result - -asm_generic_divide_fix: - OP(cmp,b) TW(IMM(TC_FLONUM),REG(cl)) - jne asm_generic_divide_fail - OP(mov,q) TW(REG(rdx),REG(rcx)) - OP(shl,q) TW(IMM(TC_LENGTH),REG(rcx)) - je asm_generic_divide_fail - OP(and,q) TW(rmask,REG(rbx)) - OP(sar,q) TW(IMM(TC_LENGTH),REG(rcx)) - OP(fld,l) DOF(8,REG(rbx)) # fldd - OP(mov,q) TW(REG(rcx),IND(rfree)) - OP(fidivr,l) IND(rfree) - jmp asm_generic_flonum_result - -asm_generic_divide_flo_flo: - OP(mov,q) TW(REG(rbx),REG(rcx)) - OP(and,q) TW(rmask,REG(rcx)) - OP(fld,l) DOF(8,REG(rcx)) # fldd - ftst - fstsw REG(ax) - sahf - je asm_generic_divide_by_zero - OP(and,q) TW(rmask,REG(rdx)) - OP(fdivr,l) DOF(8,REG(rdx)) - jmp asm_generic_flonum_result - -asm_generic_divide_by_zero: - fstp ST(0) # Pop second arg - -asm_generic_divide_fail: + jno asm_generic_fixnum_result + +asm_generic_$1_fail: OP(push,q) REG(rbx) OP(push,q) REG(rdx) - OP(mov,b) TW(IMM(HEX(23)),REG(al)) + OP(mov,b) TW(IMM(HEX($2)),REG(al)) jmp scheme_to_interface') - + define(define_binary_predicate, `declare_alignment(2) define_hook_label(generic_$1) @@ -1048,6 +655,7 @@ define_hook_label(generic_$1) OP(cmp,b) TW(IMM(TC_FIXNUM),REG(al)) jne asm_generic_$1_fail +asm_generic_$1_fix: OP(shl,q) TW(IMM(TC_LENGTH),REG(rdx)) OP(shl,q) TW(IMM(TC_LENGTH),REG(rbx)) OP(cmp,q) TW(REG(rbx),REG(rdx)) @@ -1060,30 +668,32 @@ asm_generic_$1_fail: OP(mov,b) TW(IMM(HEX($2)),REG(al)) jmp scheme_to_interface') -IF387(`define_unary_operation(decrement,22,sub,fsubr) -define_unary_operation(increment,26,add,fadd) - -define_unary_predicate(negative,2a,jl,jb) -define_unary_predicate(positive,2c,jg,ja) -define_unary_predicate(zero,2d,je,je) - -# define_binary_operation(name,index,fix*fix,flo*flo, fixup) -# define_binary_operation( $1, $2, $3, $4, $5) -# The fixup is optional; only multiplication needs it to shift the -# result back down by six bits. -define_binary_operation(add,2b,add,fadd) -define_binary_operation(subtract,28,sub,fsub) -define_binary_operation(multiply,29,imul,fmul, - `OP(shr,q) TW(IMM(6),REG(rax))') -# Divide needs to check for 0, so we cant really use the following -# define_binary_operation(divide,23,NONE,fdiv) - -# define_binary_predicate(name,index,fix*fix,flo*flo) -define_binary_predicate(equal,24,je,je) -define_binary_predicate(greater,25,jg,ja) -define_binary_predicate(less,27,jl,jb)') - -IFN387(`define_jump_indirection(generic_decrement,22) +#define_unary_operation(decrement,22,sub) +#define_unary_operation(increment,26,add) + +#define_unary_predicate(negative,2a,jl) +#define_unary_predicate(positive,2c,jg) +#define_unary_predicate(zero,2d,je) + +# define_binary_operation(name,index,op) +# define_binary_operation( $1, $2,$3) +#define_binary_operation(add,2b,add) +#define_binary_operation(subtract,28,sub) + +# No fixup -- leave it unshifted. +#define_binary_operation_with_fixup(multiply,29,imul) + +# define_binary_predicate(name,index,jcc) +#define_binary_predicate(equal,24,je) +#define_binary_predicate(greater,25,jg) +#define_binary_predicate(less,27,jl) + +# At the moment, there is no advantage to using the above code, and in +# fact using it is a waste, since the compiler open-codes the fixnum +# case already. Later, the above code will also handle floating-point +# arguments, which the compiler does not open-code. + +define_jump_indirection(generic_decrement,22) define_jump_indirection(generic_divide,23) define_jump_indirection(generic_equal,24) define_jump_indirection(generic_greater,25) @@ -1094,30 +704,10 @@ define_jump_indirection(generic_multiply,29) define_jump_indirection(generic_negative,2a) define_jump_indirection(generic_add,2b) define_jump_indirection(generic_positive,2c) -define_jump_indirection(generic_zero,2d)') - -# These don't currently differ according to whether there -# is a 387 or not. - +define_jump_indirection(generic_zero,2d) define_jump_indirection(generic_quotient,37) define_jump_indirection(generic_remainder,38) define_jump_indirection(generic_modulo,39) - -define_jump_indirection(nofp_decrement,22) -define_jump_indirection(nofp_divide,23) -define_jump_indirection(nofp_equal,24) -define_jump_indirection(nofp_greater,25) -define_jump_indirection(nofp_increment,26) -define_jump_indirection(nofp_less,27) -define_jump_indirection(nofp_subtract,28) -define_jump_indirection(nofp_multiply,29) -define_jump_indirection(nofp_negative,2a) -define_jump_indirection(nofp_add,2b) -define_jump_indirection(nofp_positive,2c) -define_jump_indirection(nofp_zero,2d) -define_jump_indirection(nofp_quotient,37) -define_jump_indirection(nofp_remainder,38) -define_jump_indirection(nofp_modulo,39) IFDASM(`end') diff --git a/src/microcode/cmpintmd/x86-64.c b/src/microcode/cmpintmd/x86-64.c index a7cc7f0d4..3f364c144 100644 --- a/src/microcode/cmpintmd/x86-64.c +++ b/src/microcode/cmpintmd/x86-64.c @@ -198,10 +198,9 @@ store_trampoline_insns (insn_t * entry, byte_t code) { (*entry++) = 0xB0; /* MOV AL,code */ (*entry++) = code; - (*entry++) = 0xFF; /* CALL /2 disp32(ESI) */ + (*entry++) = 0xFF; /* CALL /2 disp32(RSI) */ (*entry++) = 0x96; (* ((uint32_t *) entry)) = RSI_TRAMPOLINE_TO_INTERFACE_OFFSET; - X86_64_CACHE_SYNCHRONIZE (); return (false); } @@ -223,7 +222,6 @@ x86_64_reset_hook (void) { int offset = (COMPILER_REGBLOCK_N_FIXED * (sizeof (SCHEME_OBJECT))); unsigned char * rsi_value = ((unsigned char *) Registers); - bool fp_support_present = (x86_64_interface_initialize ()); /* These must match machines/x86-64/lapgen.scm */ @@ -249,78 +247,36 @@ x86_64_reset_hook (void) SETUP_REGISTER (asm_link); /* 12 */ SETUP_REGISTER (asm_error); /* 13 */ SETUP_REGISTER (asm_primitive_error); /* 14 */ - /* [TRC 20091025: This was an i386 hack for when the PC is not - available, which on x86-64 it always is. */ - /* SETUP_REGISTER (asm_short_primitive_apply); */ /* 15 */ - - /* No more room for positive offsets without going to 32-bit - offsets! */ - - /* This is a hack to make all the hooks be addressable with byte - offsets (instead of longword offsets). The register block - extends to negative offsets as well, so all the following hooks - are accessed with negative offsets, and all fit in a byte. */ - - /* [TRC 20091029: This hack doesn't work any longer; this code - should be cleaned up, since we must use longword offsets anyway.] - */ - - offset = -256; - if (fp_support_present) - { - SETUP_REGISTER (asm_generic_add); /* -32 */ - SETUP_REGISTER (asm_generic_subtract); /* -31 */ - SETUP_REGISTER (asm_generic_multiply); /* -30 */ - SETUP_REGISTER (asm_generic_divide); /* -29 */ - SETUP_REGISTER (asm_generic_equal); /* -28 */ - SETUP_REGISTER (asm_generic_less); /* -27 */ - SETUP_REGISTER (asm_generic_greater); /* -26 */ - SETUP_REGISTER (asm_generic_increment); /* -25 */ - SETUP_REGISTER (asm_generic_decrement); /* -24 */ - SETUP_REGISTER (asm_generic_zero); /* -23 */ - SETUP_REGISTER (asm_generic_positive); /* -22 */ - SETUP_REGISTER (asm_generic_negative); /* -21 */ - SETUP_REGISTER (asm_generic_quotient); /* -20 */ - SETUP_REGISTER (asm_generic_remainder); /* -19 */ - SETUP_REGISTER (asm_generic_modulo); /* -18 */ - } - else - { - SETUP_REGISTER (asm_nofp_add); /* -32 */ - SETUP_REGISTER (asm_nofp_subtract); /* -31 */ - SETUP_REGISTER (asm_nofp_multiply); /* -30 */ - SETUP_REGISTER (asm_nofp_divide); /* -29 */ - SETUP_REGISTER (asm_nofp_equal); /* -28 */ - SETUP_REGISTER (asm_nofp_less); /* -27 */ - SETUP_REGISTER (asm_nofp_greater); /* -26 */ - SETUP_REGISTER (asm_nofp_increment); /* -25 */ - SETUP_REGISTER (asm_nofp_decrement); /* -24 */ - SETUP_REGISTER (asm_nofp_zero); /* -23 */ - SETUP_REGISTER (asm_nofp_positive); /* -22 */ - SETUP_REGISTER (asm_nofp_negative); /* -21 */ - SETUP_REGISTER (asm_nofp_quotient); /* -20 */ - SETUP_REGISTER (asm_nofp_remainder); /* -19 */ - SETUP_REGISTER (asm_nofp_modulo); /* -18 */ - } - - SETUP_REGISTER (asm_sc_apply); /* -17 */ - SETUP_REGISTER (asm_sc_apply_size_1); /* -16 */ - SETUP_REGISTER (asm_sc_apply_size_2); /* -15 */ - SETUP_REGISTER (asm_sc_apply_size_3); /* -14 */ - SETUP_REGISTER (asm_sc_apply_size_4); /* -13 */ - SETUP_REGISTER (asm_sc_apply_size_5); /* -12 */ - SETUP_REGISTER (asm_sc_apply_size_6); /* -11 */ - SETUP_REGISTER (asm_sc_apply_size_7); /* -10 */ - SETUP_REGISTER (asm_sc_apply_size_8); /* -9 */ - SETUP_REGISTER (asm_interrupt_continuation_2); /* -8 */ - /* [TRC 20091025: The cache synchronization bug does not occur in any - x86-64 machines of which I am aware.] - - if (x86_64_cpuid_needed) - SETUP_REGISTER (asm_serialize_cache); /\* -7 *\/ - else - SETUP_REGISTER (asm_dont_serialize_cache); /\* -7 *\/ - */ + SETUP_REGISTER (asm_generic_add); /* 15 */ + SETUP_REGISTER (asm_generic_subtract); /* 16 */ + SETUP_REGISTER (asm_generic_multiply); /* 17 */ + SETUP_REGISTER (asm_generic_divide); /* 18 */ + SETUP_REGISTER (asm_generic_equal); /* 19 */ + SETUP_REGISTER (asm_generic_less); /* 20 */ + SETUP_REGISTER (asm_generic_greater); /* 21 */ + SETUP_REGISTER (asm_generic_increment); /* 22 */ + SETUP_REGISTER (asm_generic_decrement); /* 23 */ + SETUP_REGISTER (asm_generic_zero); /* 24 */ + SETUP_REGISTER (asm_generic_positive); /* 25 */ + SETUP_REGISTER (asm_generic_negative); /* 26 */ + SETUP_REGISTER (asm_generic_quotient); /* 27 */ + SETUP_REGISTER (asm_generic_remainder); /* 28 */ + SETUP_REGISTER (asm_generic_modulo); /* 29 */ + SETUP_REGISTER (asm_sc_apply); /* 30 */ + SETUP_REGISTER (asm_sc_apply_size_1); /* 31 */ + SETUP_REGISTER (asm_sc_apply_size_2); /* 32 */ + SETUP_REGISTER (asm_sc_apply_size_3); /* 33 */ + SETUP_REGISTER (asm_sc_apply_size_4); /* 34 */ + SETUP_REGISTER (asm_sc_apply_size_5); /* 35 */ + SETUP_REGISTER (asm_sc_apply_size_6); /* 36 */ + SETUP_REGISTER (asm_sc_apply_size_7); /* 37 */ + SETUP_REGISTER (asm_sc_apply_size_8); /* 38 */ + + /* Logically, this should be up by the other interrupt routines, but + I just wrote all those numbers above by hand and am too exhausted + by that gruelling effort to be inclined to go to the trouble to + renumber them now. */ + SETUP_REGISTER (asm_interrupt_continuation_2); /* 39 */ #ifdef _MACH_UNIX { diff --git a/src/microcode/cmpintmd/x86-64.h b/src/microcode/cmpintmd/x86-64.h index 6076bfdf6..6b29d7db0 100644 --- a/src/microcode/cmpintmd/x86-64.h +++ b/src/microcode/cmpintmd/x86-64.h @@ -30,53 +30,6 @@ USA. /* -Problems with the AMD x86-64 instruction set architecture -==================================================== - -1. Jumps are PC-relative. There are absolute jumps, assuming the PC - is in a data location, or with immediate destinations that include - a segment descriptor (16 bits). The short forms have a PC-relative - offset defined with respect to the immediately following - instruction. - -Problem: Closures and execute caches need their address in old space - in order to be relocated correctly. - -Fix: - -For execute caches we can define a new linker field, called -load-relocation-address which on every GC/relocation stores the new -address and the old contents into global variables and stores the new -address in the field. Alternatively the difference between the new -address and the old contents can be stored into a single global -variable, and this can be used, together with the new address of each -cache, to find the old code. - -For closures the code that reads the header (manifest closure) can do -the same. - - -2. The CALL and JMP instructions do not accept 64-bit displacements. - -Problem: We want heaps bigger than 4 GB. - -Fix: - -Assemble more than one instruction for closure entry points, expanding -them even more. Yech. - - -3. The stack pointer register (RSP) cannot be used as the base in - (base + displacement) addressing mode. - -Problem: Common operation in the compiler, which assumes direct access - to the stack. - -Fix: Use base + indexed mode, which allows specification of RSP as - base and nullification of the index (by using RSP again). This is - one byte longer than otherwise, but... - - Register assignments ==================== @@ -92,10 +45,6 @@ RDI (7) Free Pointer R8-R15 Unassigned -The dynamic link and value "registers" are not processor registers. -Slots in the register array must be reserved for them. -[TRC 20091025: Later, we ought to use machine registers for these.] - The Free Pointer is RDI because RDI is the implicit base register for the memory-to-memory move instructions, and the string store instruction. Perhaps we can make use of it. @@ -165,51 +114,17 @@ entry 0 MOV AL,code 0xB0, code-byte 2 CALL n(RSI) 0xFF 0x96 n-longword 8 - -[TRC 20091027: The next two are wrong; need to update.] - -- GC & interrupt check at procedure/continuation entry: - -gc_lab -7 CALL n(RSI) 0xFF 0x56 n-byte - -4 - -2 -entry 0 CMP RDI,(RSI) 0x48 0x39 0x3e - 3 JAE gc_lab 0x73 -12 - 5 - - -- GC & interrupt check at closure entry: - -gc_lab -11 ADD (RSP),&offset 0x83 0x04 0x24 offset-byte - -7 JMP n(RSI) 0xFF 0x66 n-byte - -4 - -2 -entry 0 ADD (RSP),&magic 0x81 0x04 0x24 magic-longword - 7 CMP RDI,(RSI) 0x39 0x3e - 9 JAE gc_lab 0x73 0xea (= -22) - 11 - -The magic value depends on the closure because of canonicalization. - -The ADD instruction at offset -11 is not present for the 0th closure -entry, since it is the canonical entry point. Its format depends on -the value of offset, since the sign-extending forms often suffice. - -offset = entry_number * entry_size -magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction)) - */ #define ASM_RESET_HOOK x86_64_reset_hook -#define FPE_RESET_TRAPS x86_64_interface_initialize #define CMPINT_USE_STRUCS 1 /* These next definitions must agree with "cmpauxmd/x86-64.m4", which is where the register block is allocated. */ #define COMPILER_REGBLOCK_N_FIXED 16 -/* Big enough to hold 80-bit floating-point value: */ -#define COMPILER_TEMP_SIZE 2 +/* Size in objects of the largest quantities that RTL registers can hold. */ +#define COMPILER_TEMP_SIZE 1 #define COMPILER_REGBLOCK_N_TEMPS 256 #define COMPILER_REGBLOCK_N_HOOKS 80 #define COMPILER_HOOK_SIZE 1 @@ -217,8 +132,6 @@ magic = ([TC_COMPILED_ENTRY | 0] - (offset + length_of_CALL_instruction)) #define COMPILER_REGBLOCK_EXTRA_SIZE \ (COMPILER_REGBLOCK_N_HOOKS * COMPILER_HOOK_SIZE) -#define REGBLOCK_ALLOCATED_BY_INTERFACE true - typedef byte_t insn_t; /* Number of insn_t units preceding entry address in which header @@ -228,8 +141,11 @@ typedef byte_t insn_t; #define CC_ENTRY_OFFSET_SIZE 2 /* Number of insn_t units preceding entry header in which GC trap - instructions are stored. */ -#define CC_ENTRY_GC_TRAP_SIZE 3 + instructions are stored. This is an approximation: it matches only + those non-closure procedures for which LIAR has generated interrupt + checks, in which case there is one CALL n(RSI), which is encoded as + #xff #x96 , where n is a longword (32 bits). */ +#define CC_ENTRY_GC_TRAP_SIZE 6 #define EMBEDDED_CLOSURE_ADDRS_P 1 @@ -250,23 +166,6 @@ typedef byte_t insn_t; #define UUO_COUNT_TO_WORDS(nc) ((nc) * UUO_LINK_SIZE) #define READ_UUO_TARGET(a, r) read_uuo_target (a) - -#define FLUSH_I_CACHE() X86_64_CACHE_SYNCHRONIZE () -#define FLUSH_I_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE () -#define PUSH_D_CACHE_REGION(address, nwords) X86_64_CACHE_SYNCHRONIZE () - -/* [TRC 20091025: The cache synchronization bug does not occur in any - x86-64 machines of which I am aware.] */ - -#define X86_64_CACHE_SYNCHRONIZE() do {} while (0) - -/* -#define X86_64_CACHE_SYNCHRONIZE() do \ -{ \ - if (x86_64_cpuid_needed) \ - x86_64_cache_synchronize (); \ -} while (false) -*/ #if defined(__WIN32__) && defined(__WATCOMC__) # define ASM_ENTRY_POINT(name) (__cdecl name) -- 2.25.1